In [328]:
import plotly.express as px
import pandas as pd
import numpy as np

Aggregate Data Sets

In [329]:
dataTemp = []
files = ["SSDP_1.csv" ,"SSDP_135.csv", "SSDP_109.csv", "SSDP_12.csv"]
for i in files:
    dataTemp.append(pd.read_csv(f"/Users/manasg/Downloads/Final_project_ML/AWID/AWID3_Dataset_CSV/CSV/11.SSDP/{i}", low_memory=True))
    print(f"read {i} into memory")

df = pd.concat(dataTemp)
print(df.shape)

read SSDP_1.csv into memory
read SSDP_135.csv into memory
read SSDP_109.csv into memory
read SSDP_12.csv into memory
(200000, 254)


Label analysis

In [330]:
print(df["Label"].unique())
print(df["Label"].value_counts())
fig = px.bar(df["Label"].value_counts())
fig.show()

['Normal' 'SSDP']
Normal    115348
SSDP       84652
Name: Label, dtype: int64


In [331]:
print(df.columns)

frame.encap_type
frame.len
frame.number
frame.time
frame.time_delta
frame.time_delta_displayed
frame.time_epoch
frame.time_relative
radiotap.channel.flags.cck
radiotap.channel.flags.ofdm
radiotap.channel.freq
radiotap.datarate
radiotap.dbm_antsignal
radiotap.length
radiotap.mactime
radiotap.present.tsft
radiotap.rxflags
radiotap.timestamp.ts
radiotap.vendor_oui
wlan.duration
wlan.analysis.kck
wlan.analysis.kek
wlan.bssid
wlan.country_info.fnm
wlan.country_info.code
wlan.da
wlan.fc.ds
wlan.fc.frag
wlan.fc.order
wlan.fc.moredata
wlan.fc.protected
wlan.fc.pwrmgt
wlan.fc.type
wlan.fc.retry
wlan.fc.subtype
wlan.fcs.bad_checksum
wlan.fixed.beacon
wlan.fixed.capabilities.ess
wlan.fixed.capabilities.ibss
wlan.fixed.reason_code
wlan.fixed.timestamp
wlan.ra
wlan_radio.duration
wlan.rsn.ie.gtk.key
wlan.rsn.ie.igtk.key
wlan.rsn.ie.pmkid
wlan.sa
wlan.seq
wlan.ssid
wlan.ta
wlan.tag
wlan.tag.length
wlan_radio.channel
wlan_radio.data_rate
wlan_radio.end_tsf
wlan_radio.frequency
wlan_radio.signal_dbm
w

Null analysis

In [332]:
df.dropna(axis=1, how="all", inplace=True)
nulls = df.isna()
print(df.shape)

(200000, 202)


In [333]:
nonNullCols = []
for col in nulls.columns:
    if len(nulls[col].unique()) == 1 and nulls[col].unique()[0] == False:
       nonNullCols.append(col)
       
nulls.drop(nonNullCols, axis=1, inplace=True)

nullRatios = np.asarray([nulls[c].value_counts()[True] / nulls[c].shape[0] for c in nulls], dtype=float)
nullFilt = np.array([x > 0.5 for x in nullRatios])
colswithvals = np.array(nulls.columns[[not x for x in nullFilt]])
colswithvals = np.concatenate([colswithvals, nonNullCols])
print(f"Cols with greater then 50% values: {colswithvals}")
print(f'Count: {len(colswithvals)}')

Cols with greater then 50% values: ['wlan.bssid' 'wlan.da' 'wlan.sa' 'wlan.seq' 'wlan.ta' 'llc' 'ip.dst'
 'ip.proto' 'ip.src' 'ip.ttl' 'ip.version' 'frame.encap_type' 'frame.len'
 'frame.number' 'frame.time' 'frame.time_delta'
 'frame.time_delta_displayed' 'frame.time_epoch' 'frame.time_relative'
 'radiotap.channel.flags.cck' 'radiotap.channel.flags.ofdm'
 'radiotap.channel.freq' 'radiotap.dbm_antsignal' 'radiotap.length'
 'radiotap.present.tsft' 'radiotap.rxflags' 'radiotap.timestamp.ts'
 'wlan.duration' 'wlan.fc.ds' 'wlan.fc.frag' 'wlan.fc.order'
 'wlan.fc.moredata' 'wlan.fc.protected' 'wlan.fc.pwrmgt' 'wlan.fc.type'
 'wlan.fc.retry' 'wlan.fc.subtype' 'wlan.ra' 'wlan_radio.duration'
 'wlan_radio.channel' 'wlan_radio.data_rate' 'wlan_radio.frequency'
 'wlan_radio.signal_dbm' 'wlan_radio.phy' 'Label']
Count: 45


Ip analysis

In [334]:
ip = ["ip.dst","ip.proto","ip.src","ip.ttl","ip.version"]

ipOnly = df.drop(df.columns.difference(ip), axis=1)
print(ipOnly.shape)
for c in ipOnly.columns:
    print(ipOnly[c].isna().value_counts())

ipOnly.dropna(axis=0, how="any", inplace=True)

print(ipOnly.columns)

(200000, 5)
False    129001
True      70999
Name: ip.dst, dtype: int64
False    129001
True      70999
Name: ip.proto, dtype: int64
False    129001
True      70999
Name: ip.src, dtype: int64
False    129001
True      70999
Name: ip.ttl, dtype: int64
False    129059
True      70941
Name: ip.version, dtype: int64
Index(['ip.dst', 'ip.proto', 'ip.src', 'ip.ttl', 'ip.version'], dtype='object')


#### Deleting the rows from the DataFrame with NA values from the IP columns

In [335]:
print("df shape before: ", df.shape)
for i in ipOnly.columns:
    df.dropna(subset=[i], axis=0, how="any", inplace=True)

print("df shape after:", df.shape)

df shape before:  (200000, 202)
df shape after: (129001, 202)


In [336]:
pd.set_option('display.max_columns', None)
df.head(10)

Unnamed: 0,frame.encap_type,frame.len,frame.number,frame.time,frame.time_delta,frame.time_delta_displayed,frame.time_epoch,frame.time_relative,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,radiotap.channel.freq,radiotap.datarate,radiotap.dbm_antsignal,radiotap.length,radiotap.mactime,radiotap.present.tsft,radiotap.rxflags,radiotap.timestamp.ts,wlan.duration,wlan.analysis.kck,wlan.analysis.kek,wlan.bssid,wlan.country_info.fnm,wlan.country_info.code,wlan.da,wlan.fc.ds,wlan.fc.frag,wlan.fc.order,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.fc.type,wlan.fc.retry,wlan.fc.subtype,wlan.fixed.beacon,wlan.fixed.capabilities.ess,wlan.fixed.capabilities.ibss,wlan.fixed.reason_code,wlan.fixed.timestamp,wlan.ra,wlan_radio.duration,wlan.rsn.ie.gtk.key,wlan.rsn.ie.igtk.key,wlan.sa,wlan.seq,wlan.ssid,wlan.ta,wlan.tag,wlan.tag.length,wlan_radio.channel,wlan_radio.data_rate,wlan_radio.end_tsf,wlan_radio.frequency,wlan_radio.signal_dbm,wlan_radio.start_tsf,wlan_radio.phy,wlan_radio.timestamp,wlan.rsn.capabilities.mfpc,wlan_rsna_eapol.keydes.msgnr,wlan_rsna_eapol.keydes.data,wlan_rsna_eapol.keydes.data_len,wlan_rsna_eapol.keydes.key_info.key_mic,wlan_rsna_eapol.keydes.nonce,eapol.keydes.key_len,eapol.keydes.replay_counter,eapol.len,eapol.type,llc,arp,arp.hw.type,arp.proto.type,arp.hw.size,arp.proto.size,arp.opcode,arp.src.hw_mac,arp.src.proto_ipv4,arp.dst.hw_mac,arp.dst.proto_ipv4,ip.dst,ip.proto,ip.src,ip.ttl,ip.version,data.data,data.len,tcp.ack,tcp.ack_raw,tcp.analysis,tcp.analysis.flags,tcp.analysis.retransmission,tcp.analysis.rto_frame,tcp.checksum,tcp.checksum.status,tcp.flags.syn,tcp.dstport,tcp.flags.ack,tcp.flags.fin,tcp.flags.push,tcp.flags.reset,tcp.option_len,tcp.payload,tcp.seq,tcp.seq_raw,tcp.srcport,tcp.time_delta,tcp.time_relative,udp.dstport,udp.srcport,udp.length,udp.payload,udp.time_relative,udp.time_delta,nbns,nbss.type,nbss.length,smb.access.generic_execute,smb.access.generic_read,smb.access.generic_write,smb.flags.notify,smb.flags.response,smb.flags2.nt_error,smb.flags2.sec_sig,smb.mid,smb.server_component,smb.pid.high,smb.tid,smb2.auth_frame,smb2.buffer_code,smb2.cmd,smb2.fid,smb2.header_len,smb2.msg_id,smb2.pid,smb2.protocol_id,smb2.sesid,smb2.session_flags,smb2.tid,dhcp,dhcp.id,dhcp.ip.client,dhcp.ip.relay,dhcp.ip.server,mdns,dns,dns.a,dns.count.add_rr,dns.count.answers,dns.count.auth_rr,dns.count.labels,dns.count.queries,dns.flags.authoritative,dns.flags.checkdisable,dns.flags.opcode,dns.flags.response,dns.id,dns.qry.name,dns.qry.name.len,dns.resp.name,dns.resp.ttl,dns.resp.len.1,dns.retransmit_request,dns.retransmit_response,dns.time,ssdp,http.connection,http.content_length,http.content_type,http.date,http.file_data,http.host,http.last_modified,http.location,http.request.full_uri,http.request.line,http.request.method,http.request.uri.path,http.request.uri.query,http.request.uri.query.parameter,http.request.version,http.request_in,http.response.code,http.response.code.desc,http.response.line,http.response.phrase,http.response.version,http.response_for.uri,http.referer,http.time,http.server,json.value.string,json.key,tls.alert_message.desc,tls.alert_message.level,tls.app_data_proto,tls.compress_certificate.compressed_certificate_message.length,tls.handshake.extension.type,tls.handshake.extensions_key_share_group,tls.handshake.session_ticket_length,tls.handshake.version,tls.record.content_type,tls.record.version,Label
1,23,170,50002,Dec 13- 2020 20:55:27.014771000 GTB Standard Time,9.5e-05,9.5e-05,1607885727,194.715447,0,1,5180,,-96,64,,0-0-0,0x00000000,719218845,48,,,0c:9d:92:54:fe:34,,,94:e9:79:82:c5:77,0x00000002,0,0,0,1,0,2,0,8,,,,,,94:e9:79:82:c5:77,51,,,0c:9d:92:54:fe:30,920.0,,0c:9d:92:54:fe:34,,,36,78.0,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.41,6,162.125.66.14,57,4,,,1326268,3311914000,1,1.0,,,0x00004f3b,2,0,37880,1,0,0,0,10.0,,4185,2008319236,443,0.007536,2.58039,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
5,23,1550,50006,Dec 13- 2020 20:55:27.015709000 GTB Standard Time,0.000687,0.000687,1607885727,194.716385,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,911.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x0000e602,2,0,60789,1,0,0,0,,f53bd36c51fe3243e01a5be4dc1dd385b51a3154edf4b4...,1879177,2191929095,443,0.001308,5.906267,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
6,23,1550,50007,Dec 13- 2020 20:55:27.015726000 GTB Standard Time,1.7e-05,1.7e-05,1607885727,194.716402,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,912.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x000046da,2,0,60789,1,0,0,0,,9d4ea0cba8737506ab7597b3646548770fb00103476f73...,1880569,2191930487,443,1.7e-05,5.906284,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
7,23,1550,50008,Dec 13- 2020 20:55:27.015729000 GTB Standard Time,3e-06,3e-06,1607885727,194.716405,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,913.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x0000efee,2,0,60789,1,0,0,0,,88506b941362c6142bbeffc3be955d9a52c1a5ce208ba8...,1881961,2191931879,443,3e-06,5.906287,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
8,23,1550,50009,Dec 13- 2020 20:55:27.015735000 GTB Standard Time,6e-06,6e-06,1607885727,194.716411,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,914.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x0000b0d4,2,0,60789,1,0,0,0,,5df54088984883d18e0c57139d47ebd6cf873cdd71ec20...,1883353,2191933271,443,6e-06,5.906293,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
9,23,1550,50010,Dec 13- 2020 20:55:27.015738000 GTB Standard Time,3e-06,3e-06,1607885727,194.716414,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,915.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,3246,3148673861,1,1.0,,,0x0000795f,2,0,60787,1,0,0,0,,796c4a51202caf1338c4404653ae722b3c675bf3d5f8ad...,446478,2469117635,443,0.00134,5.913162,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
10,23,1550,50011,Dec 13- 2020 20:55:27.015741000 GTB Standard Time,3e-06,3e-06,1607885727,194.716417,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,916.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,3246,3148673861,1,,,,0x0000dc16,2,0,60787,1,0,0,0,,2c04b2a17078bbcc16872b28f847a148ff7ca9c63b4dee...,447870,2469119027,443,3e-06,5.913165,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
11,23,1550,50012,Dec 13- 2020 20:55:27.015745000 GTB Standard Time,4e-06,4e-06,1607885727,194.716421,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,917.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x00003dc8,2,0,60789,1,0,0,0,,f857c58f487891bd459f756c5188254816449d0e833afa...,1884745,2191934663,443,1e-05,5.906303,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
12,23,1550,50013,Dec 13- 2020 20:55:27.015748000 GTB Standard Time,3e-06,3e-06,1607885727,194.716424,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,918.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x0000989e,2,0,60789,1,0,0,0,,c4eb2101af8b5227ce965b3c0418cd054d35682fb23d43...,1886137,2191936055,443,3e-06,5.906306,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal
13,23,1550,50014,Dec 13- 2020 20:55:27.015751000 GTB Standard Time,3e-06,3e-06,1607885727,194.716427,0,1,5180,,-96,64,,0-0-0,0x00000000,719219218,48,,,0c:9d:92:54:fe:34,,,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,,,,,,50:3e:aa:e3:1f:be,108,,,0c:9d:92:54:fe:30,919.0,,0c:9d:92:54:fe:34,,,36,173.333,,5180,-30,,8,,,,,,,,,,,,llc,,,,,,,,,,,192.168.2.190,6,151.101.17.140,58,4,,,6083,725323905,1,,,,0x00006a37,2,0,60789,1,0,0,0,,3695b6062d4141a307b36e6f9d591d6897268f1f9c44c9...,1887529,2191937447,443,3e-06,5.906309,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Normal


#### Further Null Analysis

In [337]:
# finding columns with NaN values and ratio of NaN values to the total
nan_col = df.columns[df.isna().any()].tolist()
print("\nNaN column count: ", len(nan_col))
print("Total columns:", len(df.columns))
nan_col_ratio = df.isna().mean().tolist()


nan_column = {nan_col[i]: nan_col_ratio[i] for i in range(len(nan_col))}
print("Columns with NaN:")
print(nan_column)
print("\n")

# get columns with ratio of NaN values more than 50%
useless_col = dict((k, v) for k, v in nan_column.items() if v >= 0.5)
print("Columns with more than 50% NaN values: ")
print(useless_col)
print(f'\nNumber of useless columns {len(useless_col)}')
print(f'Column with the least ratio: {min(useless_col, key=useless_col.get)}')

#print (df.isin([' ','NULL',0]).mean())


NaN column count:  157
Total columns: 202
Columns with NaN:
{'radiotap.datarate': 0.0, 'radiotap.mactime': 0.0, 'wlan.analysis.kck': 0.0, 'wlan.analysis.kek': 0.0, 'wlan.country_info.fnm': 0.0, 'wlan.country_info.code': 0.0, 'wlan.fixed.beacon': 0.0, 'wlan.fixed.capabilities.ess': 0.0, 'wlan.fixed.capabilities.ibss': 0.0, 'wlan.fixed.reason_code': 0.0, 'wlan.fixed.timestamp': 0.0, 'wlan.rsn.ie.gtk.key': 0.9993255866233595, 'wlan.rsn.ie.igtk.key': 0.0, 'wlan.ssid': 0.0, 'wlan.tag': 0.9993255866233595, 'wlan.tag.length': 0.0, 'wlan_radio.end_tsf': 0.0, 'wlan_radio.start_tsf': 0.0, 'wlan_radio.timestamp': 0.0, 'wlan.rsn.capabilities.mfpc': 1.0, 'wlan_rsna_eapol.keydes.msgnr': 1.0, 'wlan_rsna_eapol.keydes.data': 0.0, 'wlan_rsna_eapol.keydes.data_len': 1.0, 'wlan_rsna_eapol.keydes.key_info.key_mic': 1.0, 'wlan_rsna_eapol.keydes.nonce': 0.0, 'eapol.keydes.key_len': 0.0, 'eapol.keydes.replay_counter': 0.0, 'eapol.len': 0.0, 'eapol.type': 0.0, 'arp': 0.0, 'arp.hw.type': 0.0, 'arp.proto.type':

In [338]:
# dropping the columns with more than 50% null values

df = df.dropna(thresh=df.shape[0]*0.5,axis=1)
#df = df.dropna(thresh=df.shape[0]*0.5, axis=1, subset=df.columns.difference(ip))

print("New df shape:", df.shape)
df.head(10)

New df shape: (129001, 57)


Unnamed: 0,frame.encap_type,frame.len,frame.number,frame.time,frame.time_delta,frame.time_delta_displayed,frame.time_epoch,frame.time_relative,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,radiotap.channel.freq,radiotap.dbm_antsignal,radiotap.length,radiotap.present.tsft,radiotap.rxflags,radiotap.timestamp.ts,wlan.duration,wlan.bssid,wlan.da,wlan.fc.ds,wlan.fc.frag,wlan.fc.order,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.fc.type,wlan.fc.retry,wlan.fc.subtype,wlan.ra,wlan_radio.duration,wlan.sa,wlan.seq,wlan.ta,wlan_radio.channel,wlan_radio.data_rate,wlan_radio.frequency,wlan_radio.signal_dbm,wlan_radio.phy,llc,ip.dst,ip.proto,ip.src,ip.ttl,ip.version,udp.dstport,udp.srcport,udp.length,udp.payload,udp.time_relative,udp.time_delta,ssdp,http.host,http.request.full_uri,http.request.line,http.request.method,http.request.version,Label
1,23,170,50002,Dec 13- 2020 20:55:27.014771000 GTB Standard Time,9.5e-05,9.5e-05,1607885727,194.715447,0,1,5180,-96,64,0-0-0,0x00000000,719218845,48,0c:9d:92:54:fe:34,94:e9:79:82:c5:77,0x00000002,0,0,0,1,0,2,0,8,94:e9:79:82:c5:77,51,0c:9d:92:54:fe:30,920.0,0c:9d:92:54:fe:34,36,78.0,5180,-30,8,llc,192.168.2.41,6,162.125.66.14,57,4,,,,,,,,,,,,,Normal
5,23,1550,50006,Dec 13- 2020 20:55:27.015709000 GTB Standard Time,0.000687,0.000687,1607885727,194.716385,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,911.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
6,23,1550,50007,Dec 13- 2020 20:55:27.015726000 GTB Standard Time,1.7e-05,1.7e-05,1607885727,194.716402,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,912.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
7,23,1550,50008,Dec 13- 2020 20:55:27.015729000 GTB Standard Time,3e-06,3e-06,1607885727,194.716405,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,913.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
8,23,1550,50009,Dec 13- 2020 20:55:27.015735000 GTB Standard Time,6e-06,6e-06,1607885727,194.716411,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,914.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
9,23,1550,50010,Dec 13- 2020 20:55:27.015738000 GTB Standard Time,3e-06,3e-06,1607885727,194.716414,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,915.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
10,23,1550,50011,Dec 13- 2020 20:55:27.015741000 GTB Standard Time,3e-06,3e-06,1607885727,194.716417,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,916.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
11,23,1550,50012,Dec 13- 2020 20:55:27.015745000 GTB Standard Time,4e-06,4e-06,1607885727,194.716421,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,917.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
12,23,1550,50013,Dec 13- 2020 20:55:27.015748000 GTB Standard Time,3e-06,3e-06,1607885727,194.716424,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,918.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal
13,23,1550,50014,Dec 13- 2020 20:55:27.015751000 GTB Standard Time,3e-06,3e-06,1607885727,194.716427,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,919.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,,,,,,,,,,,,,Normal


In [339]:
#### replacing the left over NaN values with 0 or unknown
import warnings
warnings.filterwarnings('ignore')
## checking if we havent dropped IP data
# for col in df.columns:
#     if "ip" in col:
#         print(col)
# print("\n")
cnt_num = 0
cnt = 0
non_num_feature = []
for col in df.columns:
    if (df[col].dtypes == "int64") or (df[col].dtypes == "float64") or ("udp" in col) :
        #print("number col: ", col, df[col].dtypes)
        cnt_num += 1
        df[col] = df[col].fillna(0)
    else:
        #print("string col:", col, df[col].dtypes)
        cnt += 1
        non_num_feature.append(col)
        df[col] = df[col].fillna('Unknown')

print("Numeric count and non numeric count: ",cnt_num, cnt)
print("Non numeric columns: ", non_num_feature)
df.head(10)

Numeric count and non numeric count:  35 22
Non numeric columns:  ['frame.time', 'radiotap.present.tsft', 'radiotap.rxflags', 'wlan.bssid', 'wlan.da', 'wlan.fc.ds', 'wlan.ra', 'wlan.sa', 'wlan.ta', 'llc', 'ip.dst', 'ip.proto', 'ip.src', 'ip.ttl', 'ip.version', 'ssdp', 'http.host', 'http.request.full_uri', 'http.request.line', 'http.request.method', 'http.request.version', 'Label']


Unnamed: 0,frame.encap_type,frame.len,frame.number,frame.time,frame.time_delta,frame.time_delta_displayed,frame.time_epoch,frame.time_relative,radiotap.channel.flags.cck,radiotap.channel.flags.ofdm,radiotap.channel.freq,radiotap.dbm_antsignal,radiotap.length,radiotap.present.tsft,radiotap.rxflags,radiotap.timestamp.ts,wlan.duration,wlan.bssid,wlan.da,wlan.fc.ds,wlan.fc.frag,wlan.fc.order,wlan.fc.moredata,wlan.fc.protected,wlan.fc.pwrmgt,wlan.fc.type,wlan.fc.retry,wlan.fc.subtype,wlan.ra,wlan_radio.duration,wlan.sa,wlan.seq,wlan.ta,wlan_radio.channel,wlan_radio.data_rate,wlan_radio.frequency,wlan_radio.signal_dbm,wlan_radio.phy,llc,ip.dst,ip.proto,ip.src,ip.ttl,ip.version,udp.dstport,udp.srcport,udp.length,udp.payload,udp.time_relative,udp.time_delta,ssdp,http.host,http.request.full_uri,http.request.line,http.request.method,http.request.version,Label
1,23,170,50002,Dec 13- 2020 20:55:27.014771000 GTB Standard Time,9.5e-05,9.5e-05,1607885727,194.715447,0,1,5180,-96,64,0-0-0,0x00000000,719218845,48,0c:9d:92:54:fe:34,94:e9:79:82:c5:77,0x00000002,0,0,0,1,0,2,0,8,94:e9:79:82:c5:77,51,0c:9d:92:54:fe:30,920.0,0c:9d:92:54:fe:34,36,78.0,5180,-30,8,llc,192.168.2.41,6,162.125.66.14,57,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
5,23,1550,50006,Dec 13- 2020 20:55:27.015709000 GTB Standard Time,0.000687,0.000687,1607885727,194.716385,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,911.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
6,23,1550,50007,Dec 13- 2020 20:55:27.015726000 GTB Standard Time,1.7e-05,1.7e-05,1607885727,194.716402,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,912.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
7,23,1550,50008,Dec 13- 2020 20:55:27.015729000 GTB Standard Time,3e-06,3e-06,1607885727,194.716405,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,913.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
8,23,1550,50009,Dec 13- 2020 20:55:27.015735000 GTB Standard Time,6e-06,6e-06,1607885727,194.716411,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,914.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
9,23,1550,50010,Dec 13- 2020 20:55:27.015738000 GTB Standard Time,3e-06,3e-06,1607885727,194.716414,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,915.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
10,23,1550,50011,Dec 13- 2020 20:55:27.015741000 GTB Standard Time,3e-06,3e-06,1607885727,194.716417,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,916.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
11,23,1550,50012,Dec 13- 2020 20:55:27.015745000 GTB Standard Time,4e-06,4e-06,1607885727,194.716421,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,917.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
12,23,1550,50013,Dec 13- 2020 20:55:27.015748000 GTB Standard Time,3e-06,3e-06,1607885727,194.716424,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,918.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
13,23,1550,50014,Dec 13- 2020 20:55:27.015751000 GTB Standard Time,3e-06,3e-06,1607885727,194.716427,0,1,5180,-96,64,0-0-0,0x00000000,719219218,48,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,0,0,0,1,0,2,0,8,50:3e:aa:e3:1f:be,108,0c:9d:92:54:fe:30,919.0,0c:9d:92:54:fe:34,36,173.333,5180,-30,8,llc,192.168.2.190,6,151.101.17.140,58,4,0,0,0,0,0,0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal


In [349]:
def check_feature_type(df):
    cnt_num = 0
    cnt = 0
    non_num_feature = []
    for col in df.columns:
        if (df[col].dtypes == "int64") or (df[col].dtypes == "float64"):
            cnt_num += 1
        else:
            #print("string col:", col, df[col].dtypes)
            cnt += 1
            non_num_feature.append(col)

    print("Numeric count and non numeric count: ",cnt_num, cnt)
    print("Non numeric columns: ", non_num_feature)
    
    return non_num_feature

In [340]:
df[non_num_feature].head(10)

Unnamed: 0,frame.time,radiotap.present.tsft,radiotap.rxflags,wlan.bssid,wlan.da,wlan.fc.ds,wlan.ra,wlan.sa,wlan.ta,llc,ip.dst,ip.proto,ip.src,ip.ttl,ip.version,ssdp,http.host,http.request.full_uri,http.request.line,http.request.method,http.request.version,Label
1,Dec 13- 2020 20:55:27.014771000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,94:e9:79:82:c5:77,0x00000002,94:e9:79:82:c5:77,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.41,6,162.125.66.14,57,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
5,Dec 13- 2020 20:55:27.015709000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
6,Dec 13- 2020 20:55:27.015726000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
7,Dec 13- 2020 20:55:27.015729000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
8,Dec 13- 2020 20:55:27.015735000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
9,Dec 13- 2020 20:55:27.015738000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
10,Dec 13- 2020 20:55:27.015741000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
11,Dec 13- 2020 20:55:27.015745000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
12,Dec 13- 2020 20:55:27.015748000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal
13,Dec 13- 2020 20:55:27.015751000 GTB Standard Time,0-0-0,0x00000000,0c:9d:92:54:fe:34,50:3e:aa:e3:1f:be,0x00000002,50:3e:aa:e3:1f:be,0c:9d:92:54:fe:30,0c:9d:92:54:fe:34,llc,192.168.2.190,6,151.101.17.140,58,4,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Normal


In [341]:
import ipaddress
import re

ipv4_addr_features = ['ip.src', 'ip.dst']

# removing rows with non IP address format and then convering those IPs to integer values using ipaddress
for col in ipv4_addr_features:
    df = df[df[col].str.contains("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", regex=True) == True]

for col in ipv4_addr_features:
  df[col] = df[col].apply(lambda x: int(ipaddress.IPv4Address(x)))

print("df shape after dropping non IPv4 rows: ", df.shape)

df shape after dropping non IPv4 rows:  (128693, 57)


In [342]:
mac_addr_features = ['wlan.bssid','wlan.da', 'wlan.ra', 'wlan.sa', 'wlan.ta']

# converting  MAC addresses in the wlan features to int
def mac_to_int(mac):
    res = re.match('^((?:(?:[0-9a-f]{2}):){5}[0-9a-f]{2})$', mac.lower())
    if res is None:
        raise ValueError('invalid mac address')
    return int(res.group(0).replace(':', ''), 16)

for col in mac_addr_features:
  df[col] = df[col].apply(lambda x: mac_to_int(x))


df shape after dropping non IPv6 rows:  (128693, 57)


In [350]:
df["wlan.bssid"].value_counts()

check_feature_type(df)

Numeric count and non numeric count:  36 21
Non numeric columns:  ['frame.time', 'radiotap.present.tsft', 'radiotap.rxflags', 'wlan.fc.ds', 'llc', 'ip.proto', 'ip.ttl', 'ip.version', 'udp.dstport', 'udp.srcport', 'udp.length', 'udp.payload', 'udp.time_relative', 'udp.time_delta', 'ssdp', 'http.host', 'http.request.full_uri', 'http.request.line', 'http.request.method', 'http.request.version', 'Label']


['frame.time',
 'radiotap.present.tsft',
 'radiotap.rxflags',
 'wlan.fc.ds',
 'llc',
 'ip.proto',
 'ip.ttl',
 'ip.version',
 'udp.dstport',
 'udp.srcport',
 'udp.length',
 'udp.payload',
 'udp.time_relative',
 'udp.time_delta',
 'ssdp',
 'http.host',
 'http.request.full_uri',
 'http.request.line',
 'http.request.method',
 'http.request.version',
 'Label']

#### Noticed that wlan.bssid has the same MAC address in all rows - basic service set identifier (BSSID). THis is the MAC address of the Access Point.

#### Train test split

In [344]:
from sklearn.model_selection import train_test_split
print
Y = df["Label"]
X = df.loc[: ,df.columns != 'Label']

train_data, test_data, train_labels , test_labels = train_test_split(X, Y, test_size = 0.2)

print("Train data shape:" , train_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape: (102954, 56)
Test data shape:  (25739, 56)


#### Need to convert data to categorical values before performing PCA - So far able to prune NaN columns and replace leftover NaN with 0 or unknown

## PCA Analysis

In [345]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA

def PCA_1(k):

  total_var = []
  for i in range(len(k)):
    pca = PCA(n_components=k[i])
    pca.fit(train_data)
    var_components = pca.explained_variance_ratio_
    print("k = "+ str(k[i]) + " ; Variance = "+ str(np.sum(var_components)))
    total_var.append(np.sum(var_components))


  # plotting
  print("\n")
  plt.plot(k, total_var, marker = 'o')
  plt.title('Fraction of total variance vs. number of principal components)') 
  plt.xlabel("Number of principal components")
  plt.ylabel("Variance")

n_components = [ 2, 3, 4, 5, 10, 20, 30, 40, 50]
PCA_1(n_components)

ValueError: could not convert string to float: 'Dec 13- 2020 21:02:02.654154000 GTB Standard Time'

In [None]:
print(pca)

PCA(n_components=1)
