In [1]:
#!/usr/bin/env python
# coding: utf-8
%matplotlib inline
import datetime
import json
import re
import shlex
import subprocess
from pathlib import Path

import matplotlib
import matplotlib.dates as md
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# Select the 'ggplot' style sheet for the matplotlib figures drawn by this notebook.
matplotlib.style.use('ggplot')

In [2]:
def requestIEEE():
    """Download the IEEE OUI registry and return it as a list of cleaned lines.

    Runs of tabs/spaces are collapsed to a single space, every line is
    stripped, and blank lines are dropped.

    Returns:
        list[str]: the non-empty, whitespace-normalised lines of ``oui.txt``.

    Raises:
        requests.HTTPError: if the server answers with anything but HTTP 200.
            Failing loudly here keeps callers from treating a failed download
            as an (empty) registry.
        requests.RequestException: on connection problems or timeout.
    """
    url = 'http://standards-oui.ieee.org/oui.txt'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    res = requests.get(url, timeout=30)
    if res.status_code != 200:
        raise requests.HTTPError(
            "Could not download {} (HTTP {})".format(url, res.status_code), response=res)

    text = res.content.decode('utf-8')
    # Collapse any run of tabs/spaces into one space.
    text = re.sub(r'[ \t]+', ' ', text)
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]

def ieee(mac_vendors_file="/home/juan/scans/mac_vendors.txt"):
    """Return the OUI registry lines, using a local file as a cache.

    Args:
        mac_vendors_file: path (``str`` or ``Path``) of the cache file.
            Defaults to the location this notebook has always used.

    Returns:
        list[str]: the stripped lines of the cache file when it exists and is
        not empty; otherwise the lines returned by ``requestIEEE()``, which
        are also written to the cache file for the next run.
    """
    mac_vendors_file = Path(mac_vendors_file)

    if mac_vendors_file.is_file():
        with open(mac_vendors_file, 'r', encoding='utf-8') as f:
            content = [line.strip() for line in f]
        # An empty cache is what a previously failed download leaves behind;
        # fall through and fetch again instead of returning nothing forever.
        if content:
            return content

    content = requestIEEE() or []
    # Only create/overwrite the cache once there is something to store.
    if content:
        with open(mac_vendors_file, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in content)

    return content
    
def macvendors():
    """Build a prefix -> vendor-name dictionary from the lines returned by ``ieee()``.

    Only lines containing the marker ``(base 16)`` are used. The text before
    the marker (stripped, lower-cased) is the key; the text after it
    (stripped) is the value.

    Returns:
        dict[str, str]: lower-case prefix mapped to vendor name.
    """
    marker = "(base 16)"
    lookup = {}
    for entry in ieee():
        if marker not in entry:
            continue
        parts = entry.split(marker)
        prefix = parts[0].strip().lower()
        lookup[prefix] = parts[1].strip()
    return lookup

In [3]:
# Build the prefix -> vendor lookup once; readscans() reads this module-level `vendors` dict.
vendors = macvendors()

In [12]:
def datetime2iso(o):
    """``default=`` hook for ``json.dumps`` (see ``save2ES``).

    Args:
        o: an object the JSON encoder could not serialise by itself.

    Returns:
        The ISO-8601 string for ``date``/``datetime`` objects, or the plain
        Python value for numpy integer/float/bool scalars.

    Raises:
        TypeError: for any other type. Returning ``None`` here would make the
            encoder silently write ``null`` in place of the unsupported value.
    """
    if isinstance(o, (datetime.date, datetime.datetime)):
        return o.isoformat()
    # Values pulled out of a DataFrame are often numpy scalars, which the
    # json module does not know how to encode.
    if isinstance(o, (np.integer, np.floating, np.bool_)):
        return o.item()
    raise TypeError("Object of type {} is not JSON serializable".format(type(o).__name__))


def save2ES(data):
    """POST ``data`` as one JSON document to the local ``wifispy`` Elasticsearch index.

    Args:
        data: a JSON-serialisable object; values the encoder cannot handle are
            passed through ``datetime2iso``.

    Prints a one-line status message. Connection errors and timeouts raised by
    ``requests`` propagate to the caller.
    """
    url = 'http://127.0.0.1:9200/wifispy/_doc'
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
    data_json = json.dumps(data, default=datetime2iso)
    # Send the already-encoded body directly instead of decoding it again and
    # letting requests re-encode it; the timeout avoids hanging on a dead node.
    r = requests.post(url=url, headers=headers, data=data_json, timeout=10)

    # Indexing a new document is answered with 201 Created; 200 is returned
    # when an existing document is updated. Both mean the data was stored.
    if r.status_code in (200, 201):
        print("Elasticsearch. Data saved.")
    else:
        print("Elasticsearch. Something went wrong.")


def todate(date_text):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a pandas timestamp.

    Args:
        date_text: the text to parse.

    Returns:
        The parsed ``pd.Timestamp``, or ``None`` when ``date_text`` is missing,
        empty, or does not match the expected format.
    """
    try:
        parsed = pd.to_datetime(date_text, format='%Y-%m-%d %H:%M:%S')
    except Exception:
        # Deliberately broad so that no malformed cell can abort a long parse,
        # but (unlike a bare ``except:``) Ctrl-C still interrupts it.
        return None

    # An empty string parses to NaT; report it as "no date" like any other
    # unparseable value.
    if parsed is None or parsed is pd.NaT:
        return None
    return parsed


def moveCol(df, target, pos=0):
    """Return ``df`` with column ``target`` relocated to position ``pos``.

    The column is first removed from the column list and then inserted at
    ``pos`` in the shortened list. ``df`` itself is not modified.
    """
    ordered = list(df)
    moved = ordered.pop(ordered.index(target))
    ordered.insert(pos, moved)
    return df.loc[:, ordered]


def createids(df):
    """Replace the values of ``df['id']`` with sequential integers, in place.

    Each distinct value receives the next number, starting at 1, in order of
    first appearance. The same (mutated) frame is returned.
    """
    # dict.fromkeys keeps first-seen order, so enumerate yields the numbering.
    first_seen = dict.fromkeys(df['id'])
    numbering = {value: number for number, value in enumerate(first_seen, start=1)}

    df['id'] = df['id'].map(numbering)
    return df


def data2dataframe(data, primarykey):
    """Turn a list of scan records into an enriched DataFrame indexed by ``lastTimeSeen``.

    Args:
        data: list of dicts holding at least ``firstTimeSeen``, ``lastTimeSeen``
            and the ``primarykey`` field. A falsy value (e.g. an empty list)
            is returned unchanged.
        primarykey: name of the column that identifies an entity; it drives
            the per-entity aggregates and the numeric ``id``.

    Returns:
        The DataFrame with these columns added:
        ``timeSeen`` (seconds between first and last sighting of the row),
        ``timeSeenTotal`` (sum of ``timeSeen`` per key),
        ``id`` (sequential number per key, first column),
        ``powerAvg`` when a ``power`` column exists,
        ``packetsAvg`` / ``packetsTotal`` when a ``packets`` column exists,
        and ``tick`` (constant 1).
    """
    if data:
        data = pd.DataFrame(data)
        # DataFrame.replace returns a new frame; the result has to be kept,
        # otherwise blank cells are never converted to NaN.
        data = data.replace(r'^\s*$', np.nan, regex=True)
        data['timeSeen'] = (data['lastTimeSeen'] - data['firstTimeSeen']).dt.total_seconds()

        # transform() broadcasts each per-key aggregate back onto its rows.
        data['timeSeenTotal'] = data.groupby(primarykey)['timeSeen'].transform('sum')
        data = data.sort_values(['timeSeen', 'lastTimeSeen'], ascending=[False, True])

        # Number the keys in the sorted order and show that number first.
        data['id'] = data[primarykey]
        data = createids(data)
        data = moveCol(df=data, target='id', pos=0)

        colnames = list(data.columns)

        if 'power' in colnames:
            data['powerAvg'] = data.groupby(primarykey)['power'].transform('mean')

        if 'packets' in colnames:
            packets_by_key = data.groupby(primarykey)['packets']
            data['packetsAvg'] = packets_by_key.transform('mean')
            data['packetsTotal'] = packets_by_key.transform('sum')

        data['tick'] = 1

        data = data.set_index('lastTimeSeen')
        print("Data size: {:}".format(data.shape))
    return data


def readData(ipath='/home/juan/scans/outputs/', filters=None):
    """Read the scan files and report how many records were parsed.

    Args:
        ipath: directory holding the scan ``.csv`` files. Defaults to the
            location this notebook has always used.
        filters: list of patterns forwarded to ``readscans``. ``None`` keeps
            the notebook's original filter, ``['PCWifi']``; pass an empty list
            to read without filtering.

    Returns:
        tuple[list, list]: ``(headers, content)`` exactly as returned by
        ``readscans``.
    """
    if filters is None:
        filters = ['PCWifi']

    headers, content = readscans(ipath=ipath, filters=filters)

    print("Headers size: {:,}; Content size: {:,}.".format(len(headers), len(content)))

    return headers, content


def tickDataFrame(dateFrom, dateTo):
    """Return a one-column frame (``times``) with one row per second.

    The values run from ``dateFrom`` (inclusive) to ``dateTo`` (exclusive) at
    second resolution.
    """
    second_ticks = np.arange(dateFrom, dateTo, dtype="M8[s]")
    frame = pd.DataFrame(data=second_ticks, columns=['times'])
    return frame

def _oui_key(mac):
    """Return the vendor-lookup key of ``mac``: its first six alphanumeric characters, lower-cased."""
    return re.sub(r'[^A-Za-z0-9]', '', mac)[:6].lower()


def readscans(ipath, filters=None):
    """Parse every ``.csv`` file directly under ``ipath`` into two record lists.

    The files are concatenated in version-sorted name order by a shell
    pipeline (``find | sort -zV | xargs cat``). Each file is expected to hold
    two sections, each introduced by a row of column labels: one starting
    with ``BSSID`` and one starting with ``Station MAC``.

    Args:
        ipath: directory to scan (not recursive).
        filters: a pattern or list of ``grep -E`` patterns. A line is kept
            only if, for every pattern, it matches that pattern or contains
            ``BSSID`` (which keeps the label rows). ``None``/empty disables
            filtering.

    Returns:
        tuple[list[dict], list[dict]]: ``(headers, content)`` - the rows of
        the ``BSSID`` sections and of the ``Station MAC`` sections. Rows whose
        first/last-seen timestamps cannot be parsed by ``todate`` are skipped.
        Vendor names are looked up in the module-level ``vendors`` dict.

    Raises:
        ValueError: if a numeric column (channel, speed, power, packets) of an
            otherwise valid row is not an integer.
    """
    headers = []
    content = []

    hdr_lbl = ["BSSID", "First time seen", "Last time seen", "channel", "Speed", "Privacy", "Cipher",
               "Authentication", "Power", "# beacons", "# IV", "LAN IP", "ID-length", "ESSID", "Key"]

    data_lbl = ["Station MAC", "First time seen", "Last time seen", "Power", "# packets", "BSSID",
                "Probed ESSIDs"]

    # Everything interpolated into the shell command is quoted so that spaces
    # or shell metacharacters in the path or in a filter cannot alter it.
    cmd = "find {ipath} -maxdepth 1 -type f -name '*.csv' -print0 | sort -zV | xargs -0 cat".format(
        ipath=shlex.quote(str(ipath)))
    if filters:
        if isinstance(filters, str):
            # list("abc") would give ['a', 'b', 'c']; wrap the single pattern instead.
            filters = [filters]
        for fltr in filters:
            pattern = shlex.quote("BSSID|{}".format(fltr))
            cmd = "{cmd} | grep -E {pattern}".format(cmd=cmd, pattern=pattern)

    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
    # Network names are arbitrary bytes; do not let one undecodable byte abort the read.
    lines = result.stdout.decode('utf-8', errors='replace').splitlines()

    isHeader = False
    isContent = False

    for line in lines:
        if not line:
            continue

        cells = [cell.strip() for cell in line.split(',')]

        # A row made only of known column labels switches the current section.
        if len(cells) in (len(hdr_lbl), len(data_lbl)):
            if all(col in hdr_lbl for col in cells):
                isHeader, isContent = True, False
                continue
            if all(col in data_lbl for col in cells):
                isHeader, isContent = False, True
                continue

        if isHeader:
            # Rows with a different field count cannot be mapped onto the labels.
            if len(cells) != len(hdr_lbl):
                continue

            first_time_seen = todate(cells[1])
            last_time_seen = todate(cells[2])

            if first_time_seen and last_time_seen:
                headers.append({
                    "bssid": cells[0],
                    "firstTimeSeen": first_time_seen,
                    "lastTimeSeen": last_time_seen,
                    "channel": int(cells[3]),
                    "speed": int(cells[4]),
                    "privacy": cells[5],
                    "cipher": cells[6],
                    "authentication": cells[7],
                    "power": int(cells[8]),
                    "beacons": cells[9],
                    "iv": cells[10],
                    "lanIP": cells[11],
                    "idLength": cells[12],
                    "essid": cells[13],
                    "key": cells[14],
                    "bssidVendor": vendors.get(_oui_key(cells[0]), '')
                })

        elif isContent:
            if len(cells) < len(data_lbl):
                continue

            first_time_seen = todate(cells[1])
            last_time_seen = todate(cells[2])

            if first_time_seen and last_time_seen:
                content.append({
                    "stationMac": cells[0],
                    "firstTimeSeen": first_time_seen,
                    "lastTimeSeen": last_time_seen,
                    "power": int(cells[3]),
                    "packets": int(cells[4]),
                    "bssid": cells[5],
                    # "Probed ESSIDs" is the last column and may itself contain
                    # commas, so everything from here on belongs to it. Requiring
                    # exactly seven fields used to drop or truncate such rows.
                    "essids": ','.join(cells[6:]),
                    "stationMacVendor": vendors.get(_oui_key(cells[0]), ''),
                    "bssidVendor": vendors.get(_oui_key(cells[5]), '')
                })

    return headers, content

In [5]:
# Parse the scan files: `headers` holds the BSSID-section rows, `content` the Station MAC rows.
headers, content = readData()

Headers size: 10,250; Content size: 33,981.


In [13]:
# Build one enriched frame per record list, keyed by the column that identifies each entity.
dfheaders = data2dataframe(headers, 'bssid')
dfcontent = data2dataframe(content, 'stationMac')

Data size: (10250, 20)
Data size: (33981, 15)


In [14]:
# Persist both frames as CSV (the `lastTimeSeen` index is written as the first column).
oHeader = '/home/juan/scans/dataframes/header.csv'
oContent = '/home/juan/scans/dataframes/content.csv'
dfheaders.to_csv(path_or_buf=oHeader)
dfcontent.to_csv(path_or_buf=oContent)

In [15]:
# Preview the first rows of the station frame.
dfcontent.head()

Unnamed: 0_level_0,id,stationMac,firstTimeSeen,power,packets,bssid,essids,stationMacVendor,bssidVendor,timeSeen,timeSeenTotal,powerAvg,packetsAvg,packetsTotal,tick
lastTimeSeen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-12-09 08:57:43,1,E0:33:8E:17:70:2F,2019-12-09 08:57:13,170,661,E2:55:7D:57:DA:60,PCWifi,"Apple, Inc.",,30.0,10208.0,174.972143,27.683343,48695,1
2019-12-09 08:58:13,1,E0:33:8E:17:70:2F,2019-12-09 08:57:43,160,671,E2:55:7D:57:DA:60,PCWifi,"Apple, Inc.",,30.0,10208.0,174.972143,27.683343,48695,1
2019-12-09 09:00:43,2,EC:2C:E2:35:EA:31,2019-12-09 09:00:13,168,190,E2:55:7D:57:DA:60,PCWifi,"Apple, Inc.",,30.0,8291.0,187.30286,18.258553,32555,1
2019-12-09 09:24:14,3,58:E2:8F:AE:45:73,2019-12-09 09:23:44,183,246,E2:55:7D:57:DA:60,PCWifi,"Apple, Inc.",,30.0,486.0,168.222222,10.763441,3003,1
2019-12-09 09:33:14,4,D8:55:75:95:16:04,2019-12-09 09:32:44,194,49,E2:55:7D:57:DA:60,PCWifi,"Samsung Electronics Co.,Ltd",,30.0,34452.0,188.016058,60.537226,82936,1
