In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests

import os
import random
from datetime import date, datetime
import time

# Path Names
path_name = '/home/anna/datasets/Bluecoat/'
preprocessing_path = path_name + 'preprocess/'
openDNS_path = path_name + 'opendns-fetchstats-master/'

# Web Categories
categories_only = ['Academic_Fraud','Adult_Themes','Adware','Alcohol','Anime_Manga_Webcomic','Auctions','Automotive','Blogs','Business_Services','Chat','Classifieds','Dating','Drugs',    'Ecommerce_Shopping','Educational_Institutions','File_Storage','Financial_Institutions','Forums_Message_boards','Gambling','Games','German_Youth_Protection','Government',
'Hate_Discrimination','Health_and_Fitness','Humor','Instant_Messaging','Jobs_Employment','Lingerie_Bikini','Movies','Music','News_Media','Non-Profits','Nudity','P2P_File_sharing',
'Parked_Domains','Photo_Sharing','Podcasts','Politics','Pornography','Portals','Proxy_Anonymizer','Radio','Religious','Research_Reference','Search_Engines','Sexuality',
'Social_Networking','Software_Technology','Sports','Tasteless','Television','Tobacco',
'Travel','Video_Sharing','Visual_Search_Engines','Weapons','Web_Spam','Webmail']
print len(categories_only)

# Numeric columns used for the scikit-learn matrix (file: sgos422_skfull_106.csv)
numeric_cols = ['weekday','hrs','time_taken','c_ip_1','c_ip_2','c_ip_3','c_ip_4','c_ip_5','c_ip_6','c_ip_7','c_ip_8','sc_filter_result_DENIED','sc_filter_result_OBSERVED','sc_status','s_action_TCP_AUTH_HIT','s_action_TCP_AUTH_MISS','s_action_TCP_CLIENT_REFRESH','s_action_TCP_DENIED','s_action_TCP_ERR_MISS','s_action_TCP_HIT','s_action_TCP_MISS','s_action_TCP_MISS_RST','s_action_TCP_NC_MISS','s_action_TCP_NC_MISS_RST','s_action_TCP_PARTIAL_MISS','s_action_TCP_REFRESH_MISS',
's_action_TCP_TUNNELED','cs_method_CONNECT','cs_method_DESCRIBE','cs_method_GET','cs_method_HEAD','cs_method_OPTIONS','cs_method_POST','cs_method_PROPFIND','cs_method_PUT','cs_method_get','cs_uri_scheme_http','cs_uri_scheme_tcp','cs_uri_port','domain_len','cs_uri_path_len','cs_uri_port_len','cs_uri_query_len','cs_uri_ext_len','url_len','sc_bytes','cs_bytes','Academic_Fraud','Adult_Themes','Adware','Alcohol','Anime_Manga_Webcomic','Auctions','Automotive','Blogs','Business_Services','Chat',
'Classifieds','Dating','Drugs','Ecommerce_Shopping','Educational_Institutions','File_Storage','Financial_Institutions','Forums_Message_boards','Gambling','Games','German_Youth_Protection','Government','Hate_Discrimination','Health_and_Fitness',   'Humor','Instant_Messaging','Jobs_Employment','Lingerie_Bikini','Movies','Music','News_Media','Non-Profits','Nudity','P2P_File_sharing','Parked_Domains','Photo_Sharing','Podcasts','Politics','Pornography','Portals','Proxy_Anonymizer',
'Radio','Religious','Research_Reference','Search_Engines','Sexuality','Social_Networking','Software_Technology','Sports','Tasteless','Television','Tobacco','Travel','Video_Sharing','Visual_Search_Engines','Weapons','Web_Spam','Webmail','sum_cat'
]
ipgeo_numeric_cols = numeric_cols
loc_field = ['lat','lon',
       'country_AR','country_BG', 'country_BR','country_CA',
       'country_CN','country_CZ', 'country_DE','country_FR',
       'country_GB','country_HK', 'country_IE','country_IL',
       'country_IN','country_IT', 'country_JP','country_LV',
       'country_NL','country_RU', 'country_SY','country_TW',
       'country_UA','country_US', 'country_UY','country_VG']
# Should be 106 columns
print len(numeric_cols), len(loc_field)

58
106 26


In [2]:
# This is a commonly used function to check whether a token is an Ip address
import socket

def check_ip(addr):
    try:
        socket.inet_aton(addr)
        return True 
    except socket.error:
        return False

## STEP 1: Pre-processing
a) wget ~55G of Syrian Bluecoat Proxy logs(*):
* This was only from specific files from SG-42 (all others were redacted completely)
* Perform randomized selection of ~1 million records for analysis
* Logs from SG-42 are provided in the zip file below

b) Preprocessing Stage 1:
* Remove malformed records
* Remove records that prevent reading into DataFrame

c) Preprocessing Stage 2:
* Identify logs that have sufficient data (for example, cs-host, c-ip, csUserAgent)

d) Preprocessing Stage 3:
* Obtain OpenDNS web categorization
* Enrich Bluecoat dataset

e) Preprocessing Stage 4:
* Conversion of text fields to numeric to support matrix form

f) Preprocessing Stage 5:
* Further scope project by segmenting data set: Domain names from IP addresses, Internal addresses to external/routable IP addr
  
(*) Note: URLs can be obtained at: http://project-bluesmote.s3-website-us-east-1.amazonaws.com/raw_logs

### a) Processing Stage 1:  Prepare data so it can be read into Data Frame

In [119]:
# These two lines are needed else it will not display all columns (26 total in my dataframe)
from IPython.display import display
pd.options.display.max_columns = None
# These two lines resize the plot area and font size
plt.rcParams['figure.figsize'] = (8,6)
plt.rcParams['font.size'] = 14

In [59]:
# Directly doing a pandas read_csv doesn't work because the data is not perfect.  Needs to be cleaned
# before going into a PD dataframe.
n = 0
bad_idx = []
lines = []
with open(preprocessing_path + 'sgos422_sampled_set.csv', 'r') as f:
    reader = csv.reader(f, delimiter=' ')
    #reader = pd.read_csv(f, delimiter=' ')
    for row in reader:
        # Needed to suppress an extra blank space placed between fields 9 and 10
        # csReferer and sc-status
        row = filter(lambda x: x != '', row)
        lines.append(row)
        if len(row) != 25:
            bad_idx.append(n)
        n += 1

In [60]:
# Out of 2.04 mill records, just 9 bad rows 
bad_idx

[24669, 335866, 335867, 335868, 671733, 1357468, 1703679, 1703680, 2039986]

In [61]:
# Combination of corrupted lines and device restarts (comment lines)
for i in bad_idx:
    print lines[i]

['2011-07-22', '20:38:25', '1554', 'd521fbde365a0e33', '-', '-', '-', 'OBSERVED', 'unavailable', 'http://chitchat.mybrowserbar.com/cgi/errors.cgi?q=http://chitchat.mybrowserbar.com/cgi/errors.cgi%3fq%3dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%253fq%253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%25253fq%25253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%2525253fq%2525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%252525253fq%252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%25252525253fq%25252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%2525252525253fq%2525252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%252525252525253fq%252525252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%25252525252525253fq%25252525252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%2525252525252525253fq%2525252525252525253dhttp://chitchat.mybrowserbar.com/cgi/errors.cgi%252525252525252525253fq%252525252525252525253dhttp://chitchat.mybrowserbar.com/cgi/e

In [62]:
# started out with 2,040,000 records, removed 9
for i in range(len(bad_idx)-1,-1,-1):
    del lines[bad_idx[i]]

# Validation: They are now good lines
print len(lines)
for i in bad_idx:
    print lines[i]

2039991
['2011-07-22', '20:38:25', '52', '547ee6aa6e9c53a2', '-', '-', '-', 'OBSERVED', 'unavailable', 'http://ads.fling.com/www/delivery/afr.php?zoneid=410', '200', 'TCP_HIT', 'GET', 'text/css', 'http', 'ads.fling.com', '80', '/www/delivery/lib/fe/410/rk_footer_t4.css', '-', 'css', 'Mozilla/5.0 (Windows NT 5.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1', '82.137.200.42', '1291', '1199', '-']
['2011-07-22', '21:25:35', '1430', '8d1782d7bb752977', '-', '-', '-', 'OBSERVED', 'unavailable', 'http://www.4shared.com/video/bjhYhata/_-_.htm', '200', 'TCP_NC_MISS', 'GET', '-', 'http', 'www.ltassrv.com', '80', '/track/default.aspx', '?p=56570&v=5&s=35448&c=25129&cc=GB&t=1&r=&o=&a=&pv=0&rnd=5624&as=0', 'aspx', 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SIMBAR={FCA9BE86-0442-11D6-8418-0013D44BF6EA}; .NET CLR 2.0.50727; MSN Optimized;US)', '82.137.200.42', '242', '436', '-']
['2011-07-22', '21:25:35', '203', '03b470633854e2f1', '-', '-', '-', 'OBSERVED', 'unavailable', 'http://s

In [63]:
# Validate lines again
n = 0
bad_idx = []
for l in lines:
    l = filter(lambda x: x!= '', row)
    if len(l) != 25:
        bad_idx.append(n)
    n += 1

In [64]:
len(bad_idx)

0

In [66]:
# Example of good well-formed record
print lines[0]
print len(lines)

['2011-07-22', '20:34:51', '282', 'ce6de14af68ce198', '-', '-', '-', 'OBSERVED', 'unavailable', 'http://www.surfjunky.com/members/sj-a.php?r=44864', '200', 'TCP_NC_MISS', 'GET', 'text/html', 'http', 'www.surfjunky.com', '80', '/members/sj-a.php', '?r=66556', 'php', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24', '82.137.200.42', '1395', '663', '-']
2039991


### b) Processing Stage 2:  Further cull data
* Remove row that do not have critical data such as:
  - c_ip
  - cs-host
* Remove malformed data

In [3]:
# Read into pandas dataframe
cols25 = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'cs_host','cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id']
sgos = pd.read_csv(path_name + 'sgos422_sampled_clean.csv', names=cols25, sep= ',')

In [4]:
sgos.head(5)
#sgos.tail(5)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,cs_host,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,88.208.24.131,80,"/key=io8g3-zl3cM,end=1311337549/data=480312379...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,319,-
3,2011-07-22,20:34:51,716,f46e16fe0221b453,-,-,-,OBSERVED,unavailable,-,...,media2.lsops.net,80,/idle/1894876757/425,-,-,Shockwave Flash,82.137.200.42,182,160,-
4,2011-07-22,20:34:51,615,96ba5993c403a175,-,-,-,OBSERVED,unavailable,http://ifa.camads.net/dif/?cid=xvideos-shared-...,...,feeds.videosz.com,80,/custom/xvideos/464x244.php,-,php,Mozilla/5.0 (Windows NT 5.1; rv:2.0) Gecko/201...,82.137.200.42,378,474,-


In [5]:
sgos.shape

(1964138, 25)

In [14]:
bad_idx = [ k for k, row in sgos.iterrows() if ((sgos.loc[k, 'cs_host'] == '-') | (sgos.loc[k, 'cs_host'] == '') | (sgos.loc[k, 'cs_host'] == None)) ]

In [10]:
# Not much information to be gleaned also it is 10 rows out of 2M (or .0005%)
dash_host[(dash_host.x_exception_id == 'policy_denied') & (sgos.cs_host == '-')]

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,cs_host,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id


In [74]:
print len(bad_idx)
print sgos.loc[bad_idx[75852], 'cs_host']
print len(lines)

75853
-
2039991


In [75]:
for i in range(len(bad_idx)-1,-1,-1):
    del lines[bad_idx[i]]

In [89]:
print len(sgos)
print len(lines)
bad_idx = sgos[ (sgos.cs_host == '-') | (sgos.cs_host == '') | (sgos.cs_host == None) ]
print len(bad_idx)

1964138
1964138
0


In [78]:
# Use these two fields to enrich the data frame
urls = sgos[['csReferer', 'cs_host']]

In [79]:
len(urls)

1964138

In [80]:
urls.head(10)

Unnamed: 0,csReferer,cs_host
0,http://www.surfjunky.com/members/sj-a.php?r=44864,www.surfjunky.com
1,http://x31.iloveim.com/build_3.9.2.1/comet.html,x31.iloveim.com
2,http://static.xhamster.com/xplayer17.swf,88.208.24.131
3,-,media2.lsops.net
4,http://ifa.camads.net/dif/?cid=xvideos-shared-...,feeds.videosz.com
5,-,porn194.xvideos.com
6,http://videogayz.com/,99.192.176.43
7,http://x32.iloveim.com/servlets/ajax,x32.iloveim.com
8,http://www.sham-sat.net/vb/showthread.php?t=63170,www.sham-sat.net
9,http://static.xhamster.com/xplayer17.swf,88.208.24.132


In [90]:
print urls.loc[1, 'csReferer']
print urls.loc[1, 'cs_host']

http://x31.iloveim.com/build_3.9.2.1/comet.html
x31.iloveim.com


In [91]:
# Check to see if there are nulls in csReferer field or cs_host
dash_vals = urls[ (urls.cs_host == '-') | (urls.cs_host == '') | (urls.cs_host == None) ] 
#null_vals = [urls.loc[i] for i in range(0, len(sgos)) if (urls.loc[i, 'cs_host'] in (None, '')) ]       

In [92]:
len(dash_vals)

0

In [94]:
len(urls.cs_host.value_counts())

21146

In [95]:
len(urls.csReferer.value_counts())

201789

### Explore for duplicate rows in Pandas
* All duplicate rows in the dataframe were extracted
* Total of 26985 (out of 1964183 or 1.37%)
* Grouped by client ip, destination host, date, time, user agent
* Total of 18058 groups with average of 1.5 requests (likely the duplicates) per group

In [22]:
duplicate_list = sgos.duplicated()
print sum(duplicate_list)

26985


In [81]:
duplicates_sgos = sgos[duplicate_list]

In [85]:
print 'Volume of duplicates: %4.2f%%' %(sum(duplicate_list) / float(len(sgos)) * 100)

Volume of duplicates: 1.37%


In [106]:
dup_grp = sgos[duplicate_list].groupby(['c_ip','csUserAgent','tm','dt','cs_host'])
print '# of approximate groups: %d' %(len(dup_grp))
print 'average count per group: %4.2f' %(float(sum(duplicate_list))/len(dup_grp))

# of approximate groups: 18058
average count per group: 1.49


In [102]:
dup_df = pd.DataFrame({'count':duplicates_sgos.groupby( [ 'c_ip','csUserAgent','tm','dt','cs_host'] ).size()}).reset_index()

In [103]:
dup_df['count'].describe()

count    18058.000000
mean         1.494352
std          0.988496
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         11.000000
Name: count, dtype: float64

In [105]:
dup_df.sort_values(by=['count'], ascending=False)[0:10]

Unnamed: 0,c_ip,csUserAgent,tm,dt,cs_host,count
5806,46d31ba110116e03,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,08:18:26,2011-07-23,static.xhamster.com,11
5458,2ca713b0c348513a,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,15:11:20,2011-07-23,74.63.219.34,11
17008,c5f2237b2070c537,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,21:32:46,2011-07-22,adab-sy.com,10
16997,c5f2237b2070c537,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,21:31:20,2011-07-22,adab-sy.com,10
5798,43c1da8e756361eb,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,21:05:44,2011-07-22,www.aboluowang.com,10
5830,48450c6a5189f134,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,23:19:50,2011-07-22,static.xhamster.com,10
18031,fceb541eda093260,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,22:24:01,2011-07-22,sy.travian.com,10
5833,48450c6a5189f134,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,23:39:20,2011-07-22,cdn-2.dvdcdn.com,9
16987,c5f2237b2070c537,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,21:30:05,2011-07-22,adab-sy.com,9
17047,c5f2237b2070c537,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,22:22:41,2011-07-22,adab-sy.com,9


### c) Pre-processing Stage 3: Enrich data
* Obtain categorization by using OpenDNS 
* The script below was run on a separate VM on an isolated machine 
* Please note that a few of the domains were identified as malware and botnet, execute at your own risk!
* For approximately 2M get requests took several hours (as a python script outside of Notebook)


 #### Use OpenDNS FetchStats to get Domain Categories

In [7]:
opendns_col = [
    'Rank','Domain','Total','Blacklisted','Blocked_by_Category','Blocked_as_Botnet',
    'Blocked_as_Malware','Blocked_as_Phishing','Resolved_by_SmartCache']

opendns_col.extend(categories_only)
print len(opendns_col)
webcat = pd.read_csv(openDNS_path + '2016-01-26_openDNS_fetchstats.csv', sep = ',', names= opendns_col, header = 0)
print webcat.shape

67
(21809, 67)


In [8]:
webcat.head(3)

Unnamed: 0,Rank,Domain,Total,Blacklisted,Blocked_by_Category,Blocked_as_Botnet,Blocked_as_Malware,Blocked_as_Phishing,Resolved_by_SmartCache,Academic_Fraud,...,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail
0,1,www.google.com,1919,0,1919,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,*.youtube.com,1590,0,1590,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,caldav.calendar.yahoo.com,852,0,852,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
del webcat['Rank']
del webcat['Total']
del webcat['Blocked_by_Category']
del webcat['Blocked_as_Botnet']
del webcat['Blocked_as_Malware']
del webcat['Blocked_as_Phishing']
del webcat['Resolved_by_SmartCache']
del webcat['Blacklisted']

In [10]:
webcat.head(3)

Unnamed: 0,Domain,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,...,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail
0,www.google.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,*.youtube.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,caldav.calendar.yahoo.com,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Re-read in sgos dataframe and re-name cs_host to Domain
cols25 = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain','cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id']
sgos = pd.read_csv(path_name + 'sgos422_sampled_clean.csv', names=cols25, sep= ',')

In [12]:
print sgos.shape
sgos.head(3)

(1964138, 25)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,88.208.24.131,80,"/key=io8g3-zl3cM,end=1311337549/data=480312379...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,319,-


In [13]:
sgos_cat = sgos.merge(webcat, how='left', on='Domain')

In [14]:
sgos_cat.head(3)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,,,,,,,,,,


In [15]:
# should be 1964138 x 83
sgos_cat.shape

(1964138, 83)

In [16]:
cats = sgos_cat[categories_only]

In [17]:
import math
no_webcat = sgos_cat['Academic_Fraud'].apply(math.isnan)

In [18]:
sgos[no_webcat].Domain.value_counts()

88.208.24.131                                                    85613
88.208.24.132                                                    84316
88.208.24.194                                                    19527
46.229.160.7                                                     10698
88.208.24.196                                                     7596
88.208.24.138                                                     7508
216.245.211.226                                                   6281
194.187.98.230                                                    4882
194.187.98.229                                                    4834
194.187.98.231                                                    4659
173.193.242.218                                                   3433
174.140.154.22                                                    3274
69.162.125.244                                                    2508
46.229.160.162                                                    1988
46.229

In [19]:
print "Total number of unique domains:", len(sgos.Domain.value_counts())
print "Total number of requests:", len(sgos)

Total number of unique domains: 21146
Total number of requests: 1964138


In [20]:
print "Number of unique domains not matched: %d" %( len(sgos[no_webcat].Domain.value_counts()) ) 
a = len(sgos[no_webcat].Domain.value_counts()) * 100 / float(len(sgos.Domain.value_counts()))
print "\t or %4.2f%%" %( a )
print "Total number of requests not matched: %d" %( len(sgos[no_webcat].Domain) )
a = len(sgos[no_webcat].Domain) * 100 / float(len(sgos)) 
print "\t or %4.2f%%" %( a )
                                            

Number of unique domains not matched: 1687
	 or 7.98%
Total number of requests not matched: 318205
	 or 16.20%


In [21]:
vals_nocat = sgos[no_webcat].Domain.value_counts()
ip_addr = pd.Series(vals_nocat.index)
is_ip = ip_addr.apply(check_ip)
a = len(sgos[no_webcat].Domain.value_counts())
b = is_ip.sum() *100 / float(a)
print '%d out of %d with no category matches are IP addresses (%4.2f%%)' %( is_ip.sum(), a, b)

rows_nocat = sgos[no_webcat].Domain
is_ip = rows_nocat.apply(check_ip)
a = len(sgos[no_webcat].Domain)
b = 100*is_ip.sum() /float(len(sgos[no_webcat].Domain))
print '%d out of %d are IP addresses (%4.2f%%)' %( is_ip.sum(), a, b )
c = a - is_ip.sum()
d = c *100 / float(len(sgos))
print '%d requests uncategorized (%4.2f%%)' %( c, d )

1019 out of 1687 with no category matches are IP addresses (60.40%)
297793 out of 318205 are IP addresses (93.59%)
20412 requests uncategorized (1.04%)


In [22]:
cols83 = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain', 'cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id', 'Academic_Fraud', 'Adult_Themes', 'Adware', 'Alcohol', 'Anime_Manga_Webcomic', 'Auctions', 'Automotive', 'Blogs', 'Business_Services', 'Chat', 'Classifieds', 'Dating', 'Drugs', 'Ecommerce_Shopping', 'Educational_Institutions', 'File_Storage', 'Financial_Institutions', 'Forums_Message_boards', 'Gambling', 'Games', 'German_Youth_Protection', 'Government', 'Hate_Discrimination', 'Health_and_Fitness', 'Humor', 'Instant_Messaging', 'Jobs_Employment', 'Lingerie_Bikini', 'Movies', 'Music', 'News_Media', 'Non-Profits', 'Nudity', 'P2P_File_sharing', 'Parked_Domains', 'Photo_Sharing', 'Podcasts', 'Politics', 'Pornography', 'Portals', 'Proxy_Anonymizer', 'Radio', 'Religious', 'Research_Reference', 'Search_Engines', 'Sexuality', 'Social_Networking', 'Software_Technology', 'Sports', 'Tasteless', 'Television', 'Tobacco', 'Travel', 'Video_Sharing', 'Visual_Search_Engines', 'Weapons', 'Web_Spam', 'Webmail']
sgos_matrix = pd.read_csv(path_name + 'sgos422_clean_cats.csv', names = cols83, sep = ',', header=0)

In [23]:
# Should be 1964138 x 83 cols (orig 25 + 58 categories)
print sgos_matrix.shape
print list(sgos_matrix.columns)
sgos_matrix.head(2)

(1964138, 83)
['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain', 'cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id', 'Academic_Fraud', 'Adult_Themes', 'Adware', 'Alcohol', 'Anime_Manga_Webcomic', 'Auctions', 'Automotive', 'Blogs', 'Business_Services', 'Chat', 'Classifieds', 'Dating', 'Drugs', 'Ecommerce_Shopping', 'Educational_Institutions', 'File_Storage', 'Financial_Institutions', 'Forums_Message_boards', 'Gambling', 'Games', 'German_Youth_Protection', 'Government', 'Hate_Discrimination', 'Health_and_Fitness', 'Humor', 'Instant_Messaging', 'Jobs_Employment', 'Lingerie_Bikini', 'Movies', 'Music', 'News_Media', 'Non-Profits', 'Nudity', 'P2P_File_sharing', 'Parked_Domains', 'Photo_Sharing', 'Podcasts', 'Politics', 'Pornography', 'Portals', 'Proxy_Ano

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0,0,0,0,0,0,0,0,0,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0,0,0,0,0,0,0,0,0,0


### d) Preprocessing Stage 4: Convert Fields to Numeric 
**Categorical:**
* **dt:** convert to weekday or weekend (unordered) **[Add 1+1 field]**
* **tm:** convert to business hours, after hours, weekend hours (ordered) **[Add 1 field]**
* **sc_filter_result:** observed, proxied, denied (unordered) **[Add 2 fields]**
* **cs_uri_scheme:** http, few https, and other junk (unordered) **[Add 2 fields]**
* **sc_status:** 200, and others (ordered) **[Add 0 fields]**
* **s_action:** action taken to process request: allowed, denied,
  failed, server_error (unordered) **[Add 13 fields]**
* **cs_method:** GETs, PUTs, etc **[Add 9 fields]**
* **rsContentType:** text/html, jpeg, etc
* **cs_port:** (ordered) **[Add 0 fields]**
* **csUserAgent:** what browser or client **[Add many fields!]**
* **c_ip:** ipv6 address in octets **[Add 8 fields]**
* **s_ip:** this is the sgos device ip (always the same)
* **x_virus_id:** not sure if this is ever populated
* **cs_host:** mapped to openDNS web cats **[Add 58 fields + 1]**

**Continuous:**
* **time_taken:** ms 
* **sc_bytes:** server to client bytes
* **cs_bytes:** client to server bytes 
* **cs_host/Domain:** length **[Add 1 field]**
* **cs_uri_path:** just get length or skip **[Add 1 field]**
* **cs_uri_query:** see above **[Add 1 field]**
* **cs_uri_extension:** see above **[Add 1 field]**
* **cs_uri_port:** see above **[Add 1 field]**
* **url_len:** total length **[Add 1 field]**

Adding 102 enriched or converted columns for total of 25 + 102 = 127 features in the full matrix.  This will be filtered to include only numeric columns for sklearn


In [24]:
from datetime import date, datetime
import time
import pandas as pd


In [25]:
#cols = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain','cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id']
cols83 = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain', 'cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id', 'Academic_Fraud', 'Adult_Themes', 'Adware', 'Alcohol', 'Anime_Manga_Webcomic', 'Auctions', 'Automotive', 'Blogs', 'Business_Services', 'Chat', 'Classifieds', 'Dating', 'Drugs', 'Ecommerce_Shopping', 'Educational_Institutions', 'File_Storage', 'Financial_Institutions', 'Forums_Message_boards', 'Gambling', 'Games', 'German_Youth_Protection', 'Government', 'Hate_Discrimination', 'Health_and_Fitness', 'Humor', 'Instant_Messaging', 'Jobs_Employment', 'Lingerie_Bikini', 'Movies', 'Music', 'News_Media', 'Non-Profits', 'Nudity', 'P2P_File_sharing', 'Parked_Domains', 'Photo_Sharing', 'Podcasts', 'Politics', 'Pornography', 'Portals', 'Proxy_Anonymizer', 'Radio', 'Religious', 'Research_Reference', 'Search_Engines', 'Sexuality', 'Social_Networking', 'Software_Technology', 'Sports', 'Tasteless', 'Television', 'Tobacco', 'Travel', 'Video_Sharing', 'Visual_Search_Engines', 'Weapons', 'Web_Spam', 'Webmail']
sg_matrix = pd.read_csv(path_name + 'sgos422_clean_cats.csv', names=cols83, sep= ',', header=0)
# should be 1964138 x 83
print sg_matrix.shape

(1964138, 83)


#### DT/TM field conversion

In [5]:
# Combine date time fields into one for easier processing
sg_matrix['dtm'] = sg_matrix['dt'] + " " + sg_matrix['tm']

In [6]:
# python isoweekday() defines Mon=1, Tue=2, ..., Sun=7
def detmWeekday(dtm):
    dt = time.strptime(dtm, '%Y-%m-%d %H:%M:%S')
    #print dt.tm_wday
    if dt.tm_wday in range(0,5):
        return 1
    else:
        return 0

In [7]:
def detmHours(dtm):
    dt = time.strptime(dtm, '%Y-%m-%d %H:%M:%S')
    #print dt, dt.tm_wday, dt.tm_hour
    return dt.tm_hour
    

In [8]:
detmWeekday('2011-07-22 12:00:00')

1

In [9]:
detmHours('2016-01-31 09:00:00')

9

In [10]:
sg_matrix['weekday'] = sg_matrix['dtm'].apply(detmWeekday)
sg_matrix['hrs'] = sg_matrix['dtm'].apply(detmHours)

In [11]:
# should be 1964138 x 86
print sg_matrix.shape
sg_matrix.tail(5)
sg_matrix.head(5)

(1964138, 86)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,,,,,,,,2011-07-22 20:34:51,1,20
3,2011-07-22,20:34:51,716,f46e16fe0221b453,-,-,-,OBSERVED,unavailable,-,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20
4,2011-07-22,20:34:51,615,96ba5993c403a175,-,-,-,OBSERVED,unavailable,http://ifa.camads.net/dif/?cid=xvideos-shared-...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20


### sc_filter_result field conversion

In [12]:
print sg_matrix.sc_filter_result.value_counts()

OBSERVED    1932188
DENIED        23512
PROXIED        8438
Name: sc_filter_result, dtype: int64


In [13]:
sc_filter_dummies = pd.get_dummies(sg_matrix.sc_filter_result, prefix = 'sc_filter_result')
sc_filter_dummies.head(2)

Unnamed: 0,sc_filter_result_DENIED,sc_filter_result_OBSERVED,sc_filter_result_PROXIED
0,0,1,0
1,0,1,0


In [14]:
sc_filter_dummies.drop('sc_filter_result_PROXIED', axis=1, inplace=True)

In [15]:
sc_filter_dummies.head(2)

Unnamed: 0,sc_filter_result_DENIED,sc_filter_result_OBSERVED
0,0,1
1,0,1


In [16]:
if sg_matrix.shape[1] == 86:
    sg_matrix = pd.concat([sg_matrix, sc_filter_dummies], axis=1)
    print "should be (1964138, 88):", sg_matrix.shape
    print 'concatenated successfully'
else:
    print 'did not concatenate'
    print sg_matrix.shape

should be (1964138, 88): (1964138, 88)
concatenated successfully


In [17]:
sg_matrix.head(2)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1


In [18]:
# should be 1964138 x 88
sg_matrix.shape

(1964138, 88)

### sc_status field (no conversion needed)

In [19]:
print "Value_counts for sc_status:", len(sg_matrix.sc_status.value_counts())
print "All records have an sc_status:", sg_matrix.sc_status.value_counts().sum()
print "\n", sg_matrix.sc_status.value_counts()

Value_counts for sc_status: 31
All records have an sc_status: 1964138

200    1165966
403     265051
304     181508
400     103404
302      81001
206      60876
404      33145
0        22472
503      21791
301      19039
204       6510
416       1306
500       1250
410        316
401        236
509         73
307         42
303         42
406         35
100         25
502         17
504         11
408          6
405          5
501          3
203          3
411          1
300          1
412          1
202          1
201          1
Name: sc_status, dtype: int64


### cs_uri_scheme field conversion

In [20]:
sg_matrix.cs_uri_scheme.value_counts()

http    1962254
tcp        1864
rtsp         20
Name: cs_uri_scheme, dtype: int64

In [21]:
uri_scheme_dummies = pd.get_dummies(sg_matrix.cs_uri_scheme, prefix = 'cs_uri_scheme')
uri_scheme_dummies.head(3)

Unnamed: 0,cs_uri_scheme_http,cs_uri_scheme_rtsp,cs_uri_scheme_tcp
0,1,0,0
1,1,0,0
2,1,0,0


In [22]:
uri_scheme_dummies.drop('cs_uri_scheme_rtsp', axis=1, inplace=True)
uri_scheme_dummies.head(2)

Unnamed: 0,cs_uri_scheme_http,cs_uri_scheme_tcp
0,1,0
1,1,0


In [23]:
if sg_matrix.shape[1] == 88:
    sg_matrix = pd.concat([sg_matrix, uri_scheme_dummies], axis=1)
    print 'concatenated successfully'
    # should be 1964138 x 90
    print 'should be 1964138 x 90: ', sg_matrix.shape
else:
    print 'did not concatenate: ', sg_matrix.shape
    

concatenated successfully
should be 1964138 x 90:  (1964138, 90)


In [24]:
sg_matrix.head(3)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0.0,0.0,0.0,2011-07-22 20:34:51,1,20,0,1,1,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0.0,0.0,0.0,2011-07-22 20:34:51,1,20,0,1,1,0
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,,,,2011-07-22 20:34:51,1,20,0,1,1,0


### cs_uri_port field (no conversion)

In [25]:
sg_matrix.cs_uri_port.value_counts()

80      1962235
443        1845
8080         38
2041          9
81            5
9001          2
1990          1
1935          1
9998          1
84            1
Name: cs_uri_port, dtype: int64

In [26]:
# All rows have port populated with an int64
print sg_matrix.cs_uri_port.value_counts().sum()

1964138


### s_action field (conversion)

In [27]:
print len(sg_matrix.s_action.value_counts())
sg_matrix.s_action.value_counts()

14


TCP_NC_MISS           1180178
TCP_HIT                560251
TCP_MISS               168445
TCP_ERR_MISS            29186
TCP_CLIENT_REFRESH      12565
TCP_DENIED               4840
TCP_REFRESH_MISS         2819
TCP_PARTIAL_MISS         2719
TCP_TUNNELED             1760
TCP_NC_MISS_RST          1273
TCP_AUTH_HIT               50
-                          35
TCP_MISS_RST               12
TCP_AUTH_MISS               5
Name: s_action, dtype: int64

In [28]:
sg_matrix[(sg_matrix.sc_filter_result == 'OBSERVED')].s_action.value_counts()

TCP_NC_MISS           1175403
TCP_HIT                557765
TCP_MISS               167507
TCP_CLIENT_REFRESH      12522
TCP_ERR_MISS            10330
TCP_REFRESH_MISS         2811
TCP_PARTIAL_MISS         2719
TCP_TUNNELED             1760
TCP_NC_MISS_RST          1271
TCP_AUTH_HIT               50
-                          33
TCP_MISS_RST               12
TCP_AUTH_MISS               5
Name: s_action, dtype: int64

In [29]:
sg_matrix[(sg_matrix.sc_filter_result == 'DENIED')].s_action.value_counts()

TCP_ERR_MISS    18603
TCP_DENIED       4840
TCP_NC_MISS        67
-                   2
Name: s_action, dtype: int64

In [30]:
sg_matrix[(sg_matrix.sc_filter_result == 'PROXIED')].s_action.value_counts()

TCP_NC_MISS           4708
TCP_HIT               2486
TCP_MISS               938
TCP_ERR_MISS           253
TCP_CLIENT_REFRESH      43
TCP_REFRESH_MISS         8
TCP_NC_MISS_RST          2
Name: s_action, dtype: int64

In [31]:
s_action_dummies = pd.get_dummies(sg_matrix.s_action, prefix = 's_action')
print s_action_dummies.shape
s_action_dummies.head(2)

(1964138, 14)


Unnamed: 0,s_action_-,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [32]:
s_action_dummies.drop('s_action_-', axis=1, inplace=True)
print s_action_dummies.shape

(1964138, 13)


In [33]:
s_action_dummies.head(2)

Unnamed: 0,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED
0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [34]:
if sg_matrix.shape[1] == 90:
    sg_matrix = pd.concat([sg_matrix, s_action_dummies], axis=1)
    print 'concatenated successfully'
    # should be 1964138 x 103
    print 'should be (1964138 x 103):', sg_matrix.shape
else:
    print 'did not concatenate columns should be (1964138 x 103)'
    print sg_matrix.shape
    
sg_matrix.head(3)


concatenated successfully
should be (1964138 x 103): (1964138, 103)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0,0,0,0,0,1,0,0,0,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0,0,0,0,0,1,0,0,0,0
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,0,0,0,0,0,1,0,0,0,0


In [35]:
# should be 1964138 x 103
print sg_matrix.shape

(1964138, 103)


### cs_method field (conversion)

In [36]:
sg_matrix.cs_method.value_counts()

GET                                                              1814559
POST                                                              145724
CONNECT                                                             1864
HEAD                                                                1325
OPTIONS                                                              414
get                                                                  111
PUT                                                                   88
DESCRIBE                                                              20
PROPFIND                                                              12
l%17���%133���2%10�w�7%1Eۖ��6M|W�%0FD%03%04%05��                       1
!GET                                                                   1
��oN�%08ƶ�aBC���+�VX%0B�s`�@�%14�y��i�%18%0ET_H�'�7x                   1
�E��D%18                                                               1
�����                                              

In [37]:
def clean_csmethod(csm):
    if csm not in ('GET', 'POST', 'CONNECT', 'HEAD', 'OPTIONS', 'get', 'PUT', 'DESCRIBE', 'PROPFIND'):
        return True
    else:
        return False

In [38]:
# Had to do this as a loop, could not do this via pandas approach
g = sg_matrix[sg_matrix.cs_method.apply(clean_csmethod)].iterrows()
for k, row in g:
    sg_matrix.loc[k, 'cs_method'] = '-'

In [39]:
sg_matrix['cs_method'].value_counts()

GET         1814559
POST         145724
CONNECT        1864
HEAD           1325
OPTIONS         414
get             111
PUT              88
-                21
DESCRIBE         20
PROPFIND         12
Name: cs_method, dtype: int64

In [40]:
cs_method_dummies = pd.get_dummies(sg_matrix.cs_method, prefix = 'cs_method')
print cs_method_dummies.shape
cs_method_dummies.head(3)

(1964138, 10)


Unnamed: 0,cs_method_-,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get
0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0


In [41]:
cs_method_dummies.drop('cs_method_-', axis=1, inplace=True)
print cs_method_dummies.shape
cs_method_dummies.head(3)

(1964138, 9)


Unnamed: 0,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get
0,0,0,1,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0


In [42]:
if sg_matrix.shape[1] == 103:
    sg_matrix = pd.concat([sg_matrix, cs_method_dummies], axis=1)
    print 'concatenated successfully'
    # should be 1964138 x 112
    print 'should be (1964138 x 112): ', sg_matrix.shape
else:
    print 'did not concatenate columns'
    print sg_matrix.shape

sg_matrix.head(3)

concatenated successfully
should be (1964138 x 112):  (1964138, 112)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0,0,0,1,0,0,0,0,0,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0,0,0,1,0,0,0,0,0,0
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,0,0,0,1,0,0,0,0,0,0


### c_ip addresses (conversion to octets)

In [43]:
sg_matrix.c_ip.value_counts()
print len(sg_matrix.c_ip.value_counts())

38017


In [44]:
# function that returns 16 integers representing sixteen octets of IPv6 address
def convert_cip(ip):
    rep = []
    rep.append(ip)
    if len(ip) != 16:
        exit

    for i in range(0,16,2):
        c_int = int(ip[i:i+2], 16)
        rep.append(c_int)
    return rep

In [45]:
print convert_cip('48fd5c69d3cfc0af')

['48fd5c69d3cfc0af', 72, 253, 92, 105, 211, 207, 192, 175]


In [46]:
cip_list = list(sg_matrix.c_ip.value_counts().index)
tup_list = []
for cip in cip_list:
    tup_list.append(convert_cip(cip))

In [47]:
columns = ['c_ip', 'c_ip_1', 'c_ip_2', 'c_ip_3', 'c_ip_4', 'c_ip_5', 'c_ip_6', 'c_ip_7', 'c_ip_8']
cip_df = pd.DataFrame(tup_list, columns = columns)

In [48]:
# should be (38017 x 9)
cip_df.shape

(38017, 9)

In [49]:
cip_df.head(3)

Unnamed: 0,c_ip,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8
0,48fd5c69d3cfc0af,72,253,92,105,211,207,192,175
1,195e047d75339fd1,25,94,4,125,117,51,159,209
2,0e8fe56c260eb807,14,143,229,108,38,14,184,7


In [50]:
if sg_matrix.shape[1] == 112:
    sg_matrix = sg_matrix.merge(cip_df,how = 'left', on='c_ip')
    print 'successfully joined to c_ip dummmy columns'
    print 'should be  (1964138 x 120)', sg_matrix.shape
else:
    print 'did not join to c_ip dummy columns'
    print sg_matrix.shape

sg_matrix.head(3)

successfully joined to c_ip dummmy columns
should be  (1964138 x 120) (1964138, 120)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,0,0,206,109,225,74,246,140,225,152
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,0,0,97,84,217,25,248,213,102,144
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,0,0,72,253,92,105,211,207,192,175


In [51]:
# should be 1964138 x 120 cols
sg_matrix.shape

(1964138, 120)

In [52]:
pd.options.display.max_columns = 200
sg_matrix.head(2)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,200,TCP_NC_MISS,GET,text/html,http,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,206,109,225,74,246,140,225,152
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,97,84,217,25,248,213,102,144


### s_ip field (should be removed)

In [53]:
sg_matrix.s_ip.value_counts()

82.137.200.42    1964138
Name: s_ip, dtype: int64

### x_virus_id field (should be removed)

In [54]:
sg_matrix.x_virus_id.value_counts()

-    1964138
Name: x_virus_id, dtype: int64

### URL Length (fields):
* cs_uri_path
* cs_uri_port
* cs_uri_query
* cs_uri_extension


In [55]:
sg_matrix['domain_len'] = sg_matrix.Domain.apply(len)
sg_matrix['cs_uri_path_len'] = sg_matrix.cs_uri_path.apply(len)
sg_matrix['cs_uri_port_len'] = sg_matrix.cs_uri_port.astype('str').apply(len)
sg_matrix['cs_uri_query_len'] = sg_matrix.cs_uri_query.apply(len)
sg_matrix['cs_uri_ext_len'] = sg_matrix.cs_uri_extension.apply(len)

In [56]:
# should be (1964138 x 125 cols)
print sg_matrix.shape
sg_matrix.head(2)

(1964138, 125)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,200,TCP_NC_MISS,GET,text/html,http,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,206,109,225,74,246,140,225,152,17,17,2,8,3
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,97,84,217,25,248,213,102,144,15,16,2,14,1


In [57]:
sg_matrix['url_len'] = sg_matrix.domain_len +\
    sg_matrix.cs_uri_path_len + sg_matrix.cs_uri_port_len +\
    sg_matrix.cs_uri_query_len + sg_matrix.cs_uri_ext_len

In [58]:
# should be 1964138 x 126 cols
print sg_matrix.shape
sg_matrix.head(2)

(1964138, 126)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,200,TCP_NC_MISS,GET,text/html,http,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,206,109,225,74,246,140,225,152,17,17,2,8,3,47
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,97,84,217,25,248,213,102,144,15,16,2,14,1,48


### Web Category Summary:


In [61]:
len(categories_only)
print categories_only

['Academic_Fraud', 'Adult_Themes', 'Adware', 'Alcohol', 'Anime_Manga_Webcomic', 'Auctions', 'Automotive', 'Blogs', 'Business_Services', 'Chat', 'Classifieds', 'Dating', 'Drugs', 'Ecommerce_Shopping', 'Educational_Institutions', 'File_Storage', 'Financial_Institutions', 'Forums_Message_boards', 'Gambling', 'Games', 'German_Youth_Protection', 'Government', 'Hate_Discrimination', 'Health_and_Fitness', 'Humor', 'Instant_Messaging', 'Jobs_Employment', 'Lingerie_Bikini', 'Movies', 'Music', 'News_Media', 'Non-Profits', 'Nudity', 'P2P_File_sharing', 'Parked_Domains', 'Photo_Sharing', 'Podcasts', 'Politics', 'Pornography', 'Portals', 'Proxy_Anonymizer', 'Radio', 'Religious', 'Research_Reference', 'Search_Engines', 'Sexuality', 'Social_Networking', 'Software_Technology', 'Sports', 'Tasteless', 'Television', 'Tobacco', 'Travel', 'Video_Sharing', 'Visual_Search_Engines', 'Weapons', 'Web_Spam', 'Webmail']


In [62]:
print sg_matrix.sc_filter_result.value_counts()

OBSERVED    1932188
DENIED        23512
PROXIED        8438
Name: sc_filter_result, dtype: int64


In [63]:
# Had some problems with converting this
d_replace = {'OBSERVED':'1', 'DENIED':'2', 'PROXIED':'0'}
sg_matrix.replace('sc_filter_result', d_replace)

Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,200,TCP_NC_MISS,GET,text/html,http,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,206,109,225,74,246,140,225,152,17,17,2,8,3,47
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,97,84,217,25,248,213,102,144,15,16,2,14,1,48
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,403,TCP_NC_MISS,GET,text/html,http,88.208.24.131,80,"/key=io8g3-zl3cM,end=1311337549/data=480312379...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,319,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,72,253,92,105,211,207,192,175,13,119,2,1,3,138
3,2011-07-22,20:34:51,716,f46e16fe0221b453,-,-,-,OBSERVED,unavailable,-,200,TCP_NC_MISS,POST,application/x-fcs,http,media2.lsops.net,80,/idle/1894876757/425,-,-,Shockwave Flash,82.137.200.42,182,160,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,244,110,22,254,2,33,180,83,16,20,2,1,1,40
4,2011-07-22,20:34:51,615,96ba5993c403a175,-,-,-,OBSERVED,unavailable,http://ifa.camads.net/dif/?cid=xvideos-shared-...,302,TCP_NC_MISS,GET,text/html,http,feeds.videosz.com,80,/custom/xvideos/464x244.php,-,php,Mozilla/5.0 (Windows NT 5.1; rv:2.0) Gecko/201...,82.137.200.42,378,474,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,150,186,89,147,196,3,161,117,17,27,2,1,3,50
5,2011-07-22,20:34:51,110575,000d0313eaffc545,-,-,-,OBSERVED,unavailable,-,200,TCP_MISS,GET,video/x-flv,http,porn194.xvideos.com,80,/videos/flv/c/4/3/xvideos.com_c43e187db2d95310...,?e=1311377578&ri=1024&rs=85&h=530f4148268756f1...,flv,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,10849429,522,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,13,3,19,234,255,197,69,19,66,2,62,3,152
6,2011-07-22,20:34:51,687,dee58fa2188103d6,-,-,-,OBSERVED,unavailable,http://videogayz.com/,200,TCP_HIT,GET,image/jpeg,http,99.192.176.43,80,/videogayz.com/rt/content/16509_763.jpg,-,jpg,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,82.137.200.42,13120,311,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,222,229,143,162,24,129,3,214,13,39,2,1,3,58
7,2011-07-22,20:34:51,301,1bd3f55462a00bb8,-,-,-,OBSERVED,unavailable,http://x32.iloveim.com/servlets/ajax,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x32.iloveim.com,80,/servlets/typingEventNotification,?value=TVNOX3dhbGFhXzk5MEBob3RtYWlsLmNvbQ==,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,224,1132,-,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,27,211,245,84,98,160,11,184,15,33,2,43,1,94
8,2011-07-22,20:34:51,288,7934e484899376c0,-,-,-,OBSERVED,unavailable,http://www.sham-sat.net/vb/showthread.php?t=63170,200,TCP_HIT,GET,image/gif,http,www.sham-sat.net,80,/vb/images/buttons/sendtofriend.gif,-,gif,Opera/9.80 (Windows NT 6.1; U; en) Presto/2.9....,82.137.200.42,1433,656,-,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,121,52,228,132,137,147,118,192,16,35,2,1,3,57
9,2011-07-22,20:34:51,147,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,403,TCP_NC_MISS,GET,text/html,http,88.208.24.132,80,"/key=GUnBrc.cmPI,end=1311337882/data=480312684...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,317,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,72,253,92,105,211,207,192,175,13,117,2,1,3,136


In [64]:
sg_matrix['sum_cat'] = sg_matrix[categories_only].sum(axis=1)

In [91]:
# should be 1964138 x 127 cols
print sg_matrix.shape
sg_matrix.head(3)

(1964138, 127)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sum_cat
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,200,TCP_NC_MISS,GET,text/html,http,www.surfjunky.com,80,/members/sj-a.php,?r=66556,php,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,82.137.200.42,1395,663,-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,206,109,225,74,246,140,225,152,17,17,2,8,3,47,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,200,TCP_NC_MISS,GET,text/html;charset=UTF-8,http,x31.iloveim.com,80,/servlets/events,?1122064400327,-,Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US...,82.137.200.42,473,1129,-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,97,84,217,25,248,213,102,144,15,16,2,14,1,48,3
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,403,TCP_NC_MISS,GET,text/html,http,88.208.24.131,80,"/key=io8g3-zl3cM,end=1311337549/data=480312379...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,319,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,72,253,92,105,211,207,192,175,13,119,2,1,3,138,0


In [67]:
ip_rows = sg_matrix[sg_matrix.Domain.apply(check_ip)].shape[0]
dn_rows = sg_matrix.shape[0] - ip_rows
print 'Percent of domains OpenDNS detected and categorized: %4.1f%%' %(sg_matrix[sg_matrix.sum_cat > 0].shape[0] * 100 / float(dn_rows))

Percent of domains OpenDNS detected and categorized: 71.6%


In [68]:
# Validating individual rows for sum_cats field
pd.options.display.max_rows = 200
sg_matrix.loc[101]

dt                                                                    2011-07-22
tm                                                                      20:34:52
time_taken                                                                   368
c_ip                                                            feb1081dd67e0141
cs_username                                                                    -
cs_auth_group                                                                  -
x_exception_id                                                                 -
sc_filter_result                                                        OBSERVED
cs_categories                                                        unavailable
csReferer                                             http://www.gayteacher.net/
sc_status                                                                    200
s_action                                                                TCP_MISS
cs_method                   

## Write scikit-learn matrix to file
All data files reside in path_name set at top of notebook.  
For files containing all records in the data set:
* (1) sgos422_sampled_clean.csv:  Contains clean records, comma-separated records, minimal psg  (1964138 x 25)
* (2) 2016-01-26_openDNS_fetchstats.csv: (21809 x 67) All of the web categories
  detected per domain on January 26th after running through all 2M domains
  from BCP logs.  Maps web cat(s) to each domain requested.
* (3) sgos422_clean_cats.csv: Contains the above, plus web categories found from OpenDNS (1964138 x 83)
* (4) sgos422_enrich_127.csv: Contains all the above, plus enriched fields that convert text fields to numeric (1964138 x 127)
* (5) sgos422_skfull_106.csv: Removes all text columns from sgos422_enrich_127.csv (1964138 x 105)

For smaller matrix containing only IP addresses in Domain (or cs_host): 
* (6) sgos422_ipinfo_loc.csv: Domain to geo mapping obtained from ipinfo.io api (1019 x 7)
* (7) sgos422_ipgeo_155.csv: Contains all of the columns, including enrichment of lat/long/country from ipinfo.io (297793 x 155)
* (8) sgos422_ipskl_123.csv: Removes all text columns from sgos422_ipgeo_155.csv (297793 x 123)

In [27]:
#cols127 = list(sg_matrix.columns.values)
cols127 = ['dt','tm','time_taken','c_ip','cs_username','cs_auth_group','x_exception_id','sc_filter_result','cs_categories','csReferer','sc_status','s_action','cs_method','rsContentType','cs_uri_scheme','Domain','cs_uri_port','cs_uri_path','cs_uri_query','cs_uri_extension','csUserAgent','s_ip','sc_bytes','cs_bytes','x_virus_id','Academic_Fraud','Adult_Themes','Adware','Alcohol','Anime_Manga_Webcomic','Auctions','Automotive','Blogs','Business_Services','Chat','Classifieds','Dating','Drugs','Ecommerce_Shopping','Educational_Institutions','File_Storage','Financial_Institutions','Forums_Message_boards','Gambling','Games','German_Youth_Protection','Government','Hate_Discrimination','Health_and_Fitness','Humor','Instant_Messaging','Jobs_Employment','Lingerie_Bikini','Movies','Music','News_Media','Non-Profits','Nudity','P2P_File_sharing','Parked_Domains','Photo_Sharing','Podcasts','Politics','Pornography','Portals','Proxy_Anonymizer',
 'Radio','Religious','Research_Reference','Search_Engines','Sexuality','Social_Networking','Software_Technology','Sports','Tasteless','Television','Tobacco','Travel','Video_Sharing','Visual_Search_Engines','Weapons','Web_Spam','Webmail','dtm','weekday','hrs','sc_filter_result_DENIED','sc_filter_result_OBSERVED','cs_uri_scheme_http','cs_uri_scheme_tcp','s_action_TCP_AUTH_HIT','s_action_TCP_AUTH_MISS','s_action_TCP_CLIENT_REFRESH','s_action_TCP_DENIED','s_action_TCP_ERR_MISS','s_action_TCP_HIT','s_action_TCP_MISS','s_action_TCP_MISS_RST','s_action_TCP_NC_MISS','s_action_TCP_NC_MISS_RST','s_action_TCP_PARTIAL_MISS','s_action_TCP_REFRESH_MISS','s_action_TCP_TUNNELED','cs_method_CONNECT','cs_method_DESCRIBE','cs_method_GET','cs_method_HEAD','cs_method_OPTIONS','cs_method_POST','cs_method_PROPFIND','cs_method_PUT','cs_method_get','c_ip_1','c_ip_2','c_ip_3','c_ip_4','c_ip_5','c_ip_6','c_ip_7','c_ip_8','domain_len','cs_uri_path_len','cs_uri_port_len','cs_uri_query_len',
 'cs_uri_ext_len','url_len','sum_cat']
print len(cols127)

127


In [28]:
sg127_test = pd.read_csv(path_name + 'sgos422_enrich_127.csv', names = cols127, sep =',', header=0)

In [29]:
# Should be 1964138 x 127 cols
print sg127_test.shape
sg127_test.tail(3)
sg127_test.head(3)

(1964138, 127)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sum_cat
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,140,225,152,17,17,2,8,3,47,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,213,102,144,15,16,2,14,1,48,3
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,207,192,175,13,119,2,1,3,138,0


### CONSTRUCT scikit-learn matrix
* Find columns where NaN or Null's exist
* Replace NaN and Nulls
* Remove transformed text-based columns

In [23]:
# should be (1964138, 127) after converting all categoricals above, if those steps were skipped cols == 83
list(sg_matrix.columns)
print sg_matrix.shape

(1964138, 83)


In [31]:
print len(numeric_cols)

106


In [32]:
sg_scikit_mat = sg127_test[numeric_cols]

In [33]:
# should be (1964138, 106)
print sg_scikit_mat.shape
sg_scikit_mat.head(3)

(1964138, 106)


Unnamed: 0,weekday,hrs,time_taken,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,...,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,sum_cat
0,1,20,282,206,109,225,74,246,140,225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1,20,216,97,84,217,25,248,213,102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
2,1,20,102,72,253,92,105,211,207,192,...,,,,,,,,,,0


In [13]:
# nan_idx = [i for i in range(0, len(sg_scikit_mat)) if (sg_scikit_mat.loc[i,'cs_uri_port'] in [np.nan, None, '']) == True]

In [86]:
nan_idx

[]

In [85]:
# identify only those columns that have NaN values
# added csi_uri_port --> Need to figure out why it doesn't like isnan()

# null_cols = [(col, sum(sg_scikit_mat[col].apply(np.isnan))) for col in cols if sum(sg_scikit_mat[col].apply(np.isnan)) > 0]

In [88]:
# null_cols is all of the Opendns web categories
# empty_sum = [(col, sum(pd.isnull(sg_scikit_mat[col]))) for col in sg_scikit_mat.columns if sum(pd.isnull(sg_scikit_mat[col])) > 0]

In [34]:
print sg_scikit_mat.apply(np.isnan).sum().sum()

18455890


In [17]:
print sg_scikit_mat.isnull().sum().sum()

18455890


In [35]:
for col in sg_scikit_mat.columns:
    sg_scikit_mat[col].fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [36]:
print sg_scikit_mat.apply(np.isnan).sum().sum()
print sg_scikit_mat.isnull().sum().sum()

0
0


In [37]:
# Spot-checking
print sum(pd.isnull(sg_scikit_mat['Automotive'])), sum(sg_scikit_mat['Automotive'])
print sum(pd.isnull(sg_scikit_mat['Chat'])), sum(sg_scikit_mat['Chat'])
print sum(pd.isnull(sg_scikit_mat['Webmail'])), sum(sg_scikit_mat['Webmail'])

0 658.0
0 186738.0
0 37654.0


In [38]:
#empty_sum = [(col, sum(pd.isnull(sg_scikit_mat[col]))) for col in sg_scikit_mat.columns if sum(pd.isnull(sg_scikit_mat[col])) > 0]
#print len(empty_sum)

In [38]:
# should be (1964138, 106) last added cs_uri_port 
print sg_scikit_mat.shape

(1964138, 106)


In [39]:
sg106_test = pd.read_csv(path_name + 'sgos422_skfull_106.csv', names=numeric_cols, sep=',',header=0)

In [40]:
# should be (1964138, 106)
print sg106_test.shape

(1964138, 106)


In [41]:
sg106_test.head(3)

Unnamed: 0,weekday,hrs,time_taken,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,...,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,sum_cat
0,1,20,282,206,109,225,74,246,140,225,...,0,0,0,0,0,0,0,0,0,0
1,1,20,216,97,84,217,25,248,213,102,...,0,0,0,0,0,0,0,0,0,3
2,1,20,102,72,253,92,105,211,207,192,...,0,0,0,0,0,0,0,0,0,0


In [42]:
#sg106_test.isnull().values.any()
sg106_test.isnull().sum().sum()
sg106_test.apply(np.isnan).sum().sum()

0

### cs_host: geolocate IP Addresses (ipinfo.io api)
Use IPInfo api:
* Latitude
* Longitude
* Country

Caveats:
* If Lat/Long is null, replace with (-99, -189)
* If Country is null and for other uncommon countries set to OTHER

In [43]:
sg_matrix_cols = ['dt', 'tm', 'time_taken', 'c_ip', 'cs_username', 'cs_auth_group', 'x_exception_id', 'sc_filter_result', 'cs_categories', 'csReferer', 'sc_status', 's_action', 'cs_method', 'rsContentType', 'cs_uri_scheme', 'Domain', 'cs_uri_port', 'cs_uri_path', 'cs_uri_query', 'cs_uri_extension', 'csUserAgent', 's_ip', 'sc_bytes', 'cs_bytes', 'x_virus_id', 'Academic_Fraud', 'Adult_Themes', 'Adware', 'Alcohol', 'Anime_Manga_Webcomic', 'Auctions', 'Automotive', 'Blogs', 'Business_Services', 'Chat', 'Classifieds', 'Dating', 'Drugs', 'Ecommerce_Shopping', 'Educational_Institutions', 'File_Storage', 'Financial_Institutions', 'Forums_Message_boards', 'Gambling', 'Games', 'German_Youth_Protection', 'Government', 'Hate_Discrimination', 'Health_and_Fitness', 'Humor', 'Instant_Messaging', 'Jobs_Employment', 'Lingerie_Bikini', 'Movies', 'Music', 'News_Media', 'Non-Profits', 'Nudity', 'P2P_File_sharing', 'Parked_Domains', 'Photo_Sharing', 'Podcasts', 'Politics', 'Pornography', 'Portals', 'Proxy_Anonymizer', 'Radio', 'Religious', 'Research_Reference', 'Search_Engines', 'Sexuality', 'Social_Networking', 'Software_Technology', 'Sports', 'Tasteless', 'Television', 'Tobacco', 'Travel', 'Video_Sharing', 'Visual_Search_Engines', 'Weapons', 'Web_Spam', 'Webmail', 'dtm', 'weekday', 'hrs', 'sc_filter_result_DENIED', 'sc_filter_result_OBSERVED', 'cs_uri_scheme_http', 'cs_uri_scheme_tcp', 's_action_TCP_AUTH_HIT', 's_action_TCP_AUTH_MISS', 's_action_TCP_CLIENT_REFRESH', 's_action_TCP_DENIED', 's_action_TCP_ERR_MISS', 's_action_TCP_HIT', 's_action_TCP_MISS', 's_action_TCP_MISS_RST', 's_action_TCP_NC_MISS', 's_action_TCP_NC_MISS_RST', 's_action_TCP_PARTIAL_MISS', 's_action_TCP_REFRESH_MISS', 's_action_TCP_TUNNELED', 'cs_method_CONNECT', 'cs_method_DESCRIBE', 'cs_method_GET', 'cs_method_HEAD', 'cs_method_OPTIONS', 'cs_method_POST', 'cs_method_PROPFIND', 'cs_method_PUT', 'cs_method_get', 'c_ip_1', 'c_ip_2', 'c_ip_3', 'c_ip_4', 'c_ip_5', 'c_ip_6', 'c_ip_7', 'c_ip_8', 'domain_len', 'cs_uri_path_len', 'cs_uri_port_len', 'cs_uri_query_len', 'cs_uri_ext_len', 'url_len', 'sum_cat']
print '# cols:', len(sg_matrix_cols)
sg_matrix = pd.read_csv(path_name + 'sgos422_enrich_127.csv', sep =',', names = sg_matrix_cols, header=0)

# cols: 127


In [44]:
# should be (1964138, 127)
print sg_matrix.shape
sg_matrix.head(3)

(1964138, 127)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sum_cat
0,2011-07-22,20:34:51,282,ce6de14af68ce198,-,-,-,OBSERVED,unavailable,http://www.surfjunky.com/members/sj-a.php?r=44864,...,140,225,152,17,17,2,8,3,47,0
1,2011-07-22,20:34:51,216,6154d919f8d56690,-,-,-,OBSERVED,unavailable,http://x31.iloveim.com/build_3.9.2.1/comet.html,...,213,102,144,15,16,2,14,1,48,3
2,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,207,192,175,13,119,2,1,3,138,0


In [45]:
ip_domain_df = sg_matrix[sg_matrix.Domain.apply(check_ip)]
# should be 297793 x 127 cols
ip_domain_df.shape

(297793, 127)

In [31]:
len(ip_domain_df[ip_domain_df.Domain.str.contains('192.168.')])

189

In [32]:
set(ip_domain_df[ip_domain_df.Domain.str.contains('192.168.')].Domain)

{'192.168.0.10',
 '192.168.0.100',
 '192.168.0.2',
 '192.168.1.100',
 '192.168.1.101',
 '192.168.1.103',
 '192.168.1.104',
 '192.168.1.108',
 '192.168.1.111',
 '192.168.1.2',
 '192.168.1.3',
 '192.168.1.33',
 '192.168.1.4',
 '192.168.1.56',
 '192.168.1.70',
 '192.168.16.70',
 '192.168.2.100',
 '192.168.2.101'}

In [33]:
unique_ips = set(ip_domain_df.Domain.values)
print "Number of unique IPs:", len(unique_ips)

Number of unique IPs: 1019


In [118]:
import requests
import json
url = 'http://ipinfo.io/'
unique_ip_attr_list = []
for ip in unique_ips:
    print ip
    r = requests.get(url + ip)
    if r.ok:
        tmp_dict = json.loads(r.text)
        location = tmp_dict['loc'] if 'loc' in tmp_dict.keys() else ''
        country = tmp_dict['country'] if 'country' in tmp_dict.keys() else ''
        region = tmp_dict['region'] if 'region' in tmp_dict.keys() else ''
        city = tmp_dict['city'] if 'city' in tmp_dict.keys() else ''
        org = tmp_dict['org'] if 'org' in tmp_dict.keys() else ''
        postal = tmp_dict['postal'] if 'postal' in tmp_dict.keys() else ''
            
    unique_ip_attr_list.append((ip, location, country, region, city, org, postal))
    

178.254.237.37
222.216.29.212
91.121.30.50
177.156.102.154
83.167.226.236
83.167.226.237
83.167.226.234
68.67.185.215
92.122.219.13
201.205.202.98
77.47.179.140
83.248.163.29
46.163.124.46
110.2.86.209
220.181.66.105
212.96.161.238
85.183.110.136
98.127.135.133
94.100.186.201
94.100.186.200
59.172.11.166
82.249.64.34
67.18.217.141
94.100.189.106
149.13.32.75
38.229.70.34
87.248.223.167
129.2.126.215
83.169.61.21
46.4.116.16
94.100.189.108
81.170.205.81
83.128.84.144
109.71.162.194
174.120.185.50
85.17.254.150
61.206.114.94
2.33.11.136
223.165.24.159
78.140.144.131
96.31.119.99
46.163.124.72
78.140.136.173
91.207.4.250
188.40.33.217
114.94.170.22
211.232.229.111
83.138.135.250
78.140.144.10
95.168.172.158
117.213.39.50
65.49.14.76
131.216.132.20
208.123.237.208
91.188.34.220
174.120.185.26
78.108.178.195
94.100.187.193
94.100.187.191
94.100.187.197
213.142.129.69
195.13.160.15
72.241.185.197
94.30.235.56
50.23.83.87
218.26.235.38
74.125.43.99
207.158.39.75
173.212.249.20
46.229.160.163


In [127]:
len(unique_ip_attr_list)

NameError: name 'unique_ip_attr_list' is not defined

In [120]:
columns = ['Domain', 'location', 'country', 'region', 'city', 'org', 'postal']
unique_ip_df = pd.DataFrame(unique_ip_attr_list, columns = columns)

In [121]:
unique_ip_df.head(3)

Unnamed: 0,Domain,location,country,region,city,org,postal
0,178.254.237.37,"43.4125,23.2250",BG,Oblast Montana,Montana,AS20911 Net-Surf.net Ltd.,3400.0
1,222.216.29.212,"22.8167,108.3167",CN,Guangxi Zhuangzu Zizhiqu,Nanning,"AS4134 No.31,Jin-rong Street",
2,91.121.30.50,"48.8600,2.3500",FR,,,AS16276 OVH SAS,


In [122]:
unique_ip_df.shape

(1019, 7)

In [46]:
ipgeo_cols = ['Domain', 'location', 'country', 'region', 'city', 'org', 'postal']
ipgeo_mat = pd.read_csv(path_name+'sgos422_ipinfo_loc.csv', sep=',', names=ipgeo_cols, header=0)

In [47]:
# should be 1019 x 7
print ipgeo_mat.shape
pd.options.display.max_rows = 2000
ipgeo_mat.head(3)

(1019, 7)


Unnamed: 0,Domain,location,country,region,city,org,postal
0,178.254.237.37,"43.4125,23.2250",BG,Oblast Montana,Montana,AS20911 Net-Surf.net Ltd.,3400.0
1,222.216.29.212,"22.8167,108.3167",CN,Guangxi Zhuangzu Zizhiqu,Nanning,"AS4134 No.31,Jin-rong Street",
2,91.121.30.50,"48.8600,2.3500",FR,,,AS16276 OVH SAS,


In [57]:
ipgeo_mat[ipgeo_mat['country'].apply(pd.isnull) == True]

Unnamed: 0,Domain,location,country,region,city,org,postal
8,92.122.219.13,"47.0000,8.0000",,,,AS20940 Akamai International B.V.,
130,192.168.0.2,,,,,,
175,192.168.0.10,,,,,,
188,192.168.0.100,,,,,,
215,192.168.1.3,,,,,,
216,192.168.1.2,,,,,,
254,192.168.1.111,,,,,,
422,192.168.1.33,,,,,,
495,192.168.2.101,,,,,,
496,192.168.2.100,,,,,,


In [150]:
for i, row in unique_ip_df.iterrows():
    if unique_ip_df.loc[i, 'Domain'].startswith('192.168.'):
        print 'location = %r' % (unique_ip_df.loc[i, 'location'])

NameError: name 'unique_ip_df' is not defined

In [48]:
import numpy as np
null_latlon = []
lat_long = []
for i, row in ipgeo_mat.iterrows():
    if (ipgeo_mat.loc[i, 'location'] not in ['', np.nan, None]):
        location = ipgeo_mat.loc[i, 'location'].split(',')
        country = ipgeo_mat.loc[i, 'country']

        lat_long.append(
            (ipgeo_mat.loc[i, 'Domain'], 
            float(location[0]), 
            float(location[1]),
            country))
    else:
        lat_long.append(
            (ipgeo_mat.loc[i, 'Domain'],
            -99.0,
            -189.0,
            country))
        null_latlon.append(ipgeo_mat.loc[i, 'Domain'])
    

In [49]:
print len(null_latlon), len(lat_long)

19 1019


In [50]:
latlon_df = pd.DataFrame(lat_long, columns = ['Domain', 'lat', 'lon','country'])
# should be 1019 x 4
latlon_df.shape

(1019, 4)

In [63]:
print latlon_df.loc[0, 'lat'], latlon_df.loc[0, 'lon'], latlon_df.loc[0, 'country']
latlon_df

43.4125 23.225 BG


Unnamed: 0,Domain,lat,lon,country
0,178.254.237.37,43.4125,23.225,BG
1,222.216.29.212,22.8167,108.3167,CN
2,91.121.30.50,48.86,2.35,FR
3,177.156.102.154,-23.5477,-46.6358,BR
4,83.167.226.236,50.0833,14.4167,CZ
5,83.167.226.237,50.0833,14.4167,CZ
6,83.167.226.234,50.0833,14.4167,CZ
7,68.67.185.215,40.7391,-73.9826,US
8,92.122.219.13,47.0,8.0,
9,201.205.202.98,10.0162,-84.2116,CR


In [64]:
ipgeo_df = ip_domain_df.merge(latlon_df, how='left', on='Domain')

In [65]:
pd.options.display.max_columns = 100
# should be 297793 x 130 cols
print ipgeo_df.shape
ipgeo_df.head(3)

(297793, 130)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,sc_status,s_action,cs_method,rsContentType,cs_uri_scheme,Domain,cs_uri_port,cs_uri_path,cs_uri_query,cs_uri_extension,csUserAgent,s_ip,sc_bytes,cs_bytes,x_virus_id,Academic_Fraud,Adult_Themes,Adware,Alcohol,Anime_Manga_Webcomic,Auctions,Automotive,Blogs,Business_Services,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,...,Weapons,Web_Spam,Webmail,dtm,weekday,hrs,sc_filter_result_DENIED,sc_filter_result_OBSERVED,cs_uri_scheme_http,cs_uri_scheme_tcp,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sum_cat,lat,lon,country
0,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,403,TCP_NC_MISS,GET,text/html,http,88.208.24.131,80,"/key=io8g3-zl3cM,end=1311337549/data=480312379...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,319,-,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,72,253,92,105,211,207,192,175,13,119,2,1,3,138,0,52.3667,4.9,NL
1,2011-07-22,20:34:51,687,dee58fa2188103d6,-,-,-,OBSERVED,unavailable,http://videogayz.com/,200,TCP_HIT,GET,image/jpeg,http,99.192.176.43,80,/videogayz.com/rt/content/16509_763.jpg,-,jpg,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...,82.137.200.42,13120,311,-,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,222,229,143,162,24,129,3,214,13,39,2,1,3,58,0,42.5181,-83.263,US
2,2011-07-22,20:34:51,147,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,403,TCP_NC_MISS,GET,text/html,http,88.208.24.132,80,"/key=GUnBrc.cmPI,end=1311337882/data=480312684...",-,flv,Mozilla/4.0 (compatible; MSIE 6.0; Windows NT ...,82.137.200.42,724,317,-,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,2011-07-22 20:34:51,1,20,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,72,253,92,105,211,207,192,175,13,117,2,1,3,136,0,52.3667,4.9,NL


In [66]:
ipgeo_df.country.value_counts()

NL    225876
US     62562
GB      2881
CN      1755
SY      1035
DE       885
IE       602
CZ       357
UA       305
CA       294
RU       292
IL       185
BG       184
BR       113
IT        92
AR        54
FR        39
HK        29
UY        24
JP        22
VG        19
LV        18
TW        15
IN        11
NZ         9
PL         9
EG         7
AT         7
MY         6
TH         5
LU         5
GR         5
OM         5
ZA         4
BE         4
SE         3
KR         3
ES         3
GI         2
LT         2
MK         2
SI         2
HU         2
TR         2
FI         2
HR         1
CH         1
ID         1
BY         1
PH         1
GH         1
RO         1
AU         1
SD         1
DK         1
CR         1
Name: country, dtype: int64

In [67]:
# Take top 24 countries based on value_counts above and allow these to remain in the dataset, other values 
# will be set as "OTHER"
#country_list = ['NL','US','GB','CN','SY','DE','IE','CZ','UA','CA','RU','IL','BG','BR','IT','AR']
country_list  = ['NL','US','GB','CN','SY','DE','IE','CZ','UA','CA','RU','IL','BG','BR','IT','AR',
                 'FR','HK','UY','JP','VG','LV','TW','IN']

ipgeo_df['country_neo'] = ipgeo_df.country.apply(lambda x: x if x in country_list else 'OTHER')

In [68]:
print len(ipgeo_df.country_neo.value_counts())
ipgeo_df.country_neo.value_counts()

25


NL       225876
US        62562
GB         2881
CN         1755
SY         1035
DE          885
IE          602
CZ          357
UA          305
CA          294
RU          292
IL          185
BG          184
OTHER       144
BR          113
IT           92
AR           54
FR           39
HK           29
UY           24
JP           22
VG           19
LV           18
TW           15
IN           11
Name: country_neo, dtype: int64

In [150]:
ipgeo_df.country.value_counts()

NL    225876
US     62562
GB      2881
CN      1755
SY      1035
DE       885
IE       602
CZ       357
UA       305
CA       294
RU       292
IL       185
BG       184
BR       113
IT        92
AR        54
FR        39
HK        29
UY        24
JP        22
VG        19
LV        18
TW        15
IN        11
NZ         9
PL         9
EG         7
AT         7
MY         6
TH         5
LU         5
GR         5
OM         5
ZA         4
BE         4
SE         3
KR         3
ES         3
GI         2
LT         2
MK         2
SI         2
HU         2
TR         2
FI         2
HR         1
CH         1
ID         1
BY         1
PH         1
GH         1
RO         1
AU         1
SD         1
DK         1
CR         1
Name: country, dtype: int64

In [69]:
country_dummies = pd.get_dummies(ipgeo_df.country_neo, prefix = 'country')
print country_dummies.shape
country_dummies.head(3)

(297793, 25)


Unnamed: 0,country_AR,country_BG,country_BR,country_CA,country_CN,country_CZ,country_DE,country_FR,country_GB,country_HK,country_IE,country_IL,country_IN,country_IT,country_JP,country_LV,country_NL,country_OTHER,country_RU,country_SY,country_TW,country_UA,country_US,country_UY,country_VG
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [70]:
country_dummies.drop('country_OTHER', axis=1, inplace=True)

In [71]:
print country_dummies.columns

Index([u'country_AR', u'country_BG', u'country_BR', u'country_CA',
       u'country_CN', u'country_CZ', u'country_DE', u'country_FR',
       u'country_GB', u'country_HK', u'country_IE', u'country_IL',
       u'country_IN', u'country_IT', u'country_JP', u'country_LV',
       u'country_NL', u'country_RU', u'country_SY', u'country_TW',
       u'country_UA', u'country_US', u'country_UY', u'country_VG'],
      dtype='object')


In [72]:
if ipgeo_df.shape[1] == 131:
    ipgeo_df = pd.concat([ipgeo_df, country_dummies], axis=1)
    print "should be (297793, 155):", ipgeo_df.shape
    print 'concatenated successfully'
else:
    print 'did not concatenate, should be (297793 x 155)'
    print ipgeo_df.shape

should be (297793, 155): (297793, 155)
concatenated successfully


In [73]:
# should be (297793, 155)
ipgeo_df.shape

(297793, 155)

In [52]:
-

127 28
155


In [53]:
# should be 297793 x 155
print full_ip_matrix.shape
full_ip_matrix.head(3)

(297793, 155)


Unnamed: 0,dt,tm,time_taken,c_ip,cs_username,cs_auth_group,x_exception_id,sc_filter_result,cs_categories,csReferer,...,country_JP,country_LV,country_NL,country_RU,country_SY,country_TW,country_UA,country_US,country_UY,country_VG
0,2011-07-22,20:34:51,102,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,0,0,1,0,0,0,0,0,0,0
1,2011-07-22,20:34:51,687,dee58fa2188103d6,-,-,-,OBSERVED,unavailable,http://videogayz.com/,...,0,0,0,0,0,0,0,1,0,0
2,2011-07-22,20:34:51,147,48fd5c69d3cfc0af,-,-,-,OBSERVED,unavailable,http://static.xhamster.com/xplayer17.swf,...,0,0,1,0,0,0,0,0,0,0


In [54]:
# should be 106 and 26
print len(ipgeo_numeric_cols), len(loc_field)

106 26


In [58]:
ipgeo_numeric_cols == numeric_cols

True

In [59]:
ipgeo_numeric_cols.extend(loc_field)
# should be 132
len(ipgeo_numeric_cols)

132

In [74]:
ipgeo_skl_matrix = ipgeo_df[ipgeo_numeric_cols]

In [75]:
# should be 297793 x 132
print ipgeo_skl_matrix.shape

(297793, 132)


In [76]:
# Remove web categories (they are all null)
for col in categories_only:
    del ipgeo_skl_matrix[col]

In [77]:
#should be (297793 x 74)
ipgeo_skl_matrix.shape

(297793, 74)

In [78]:
ipgeo_skl_matrix.head(3)

Unnamed: 0,weekday,hrs,time_taken,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,sc_filter_result_DENIED,sc_filter_result_OBSERVED,sc_status,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,cs_uri_scheme_http,cs_uri_scheme_tcp,cs_uri_port,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sc_bytes,cs_bytes,sum_cat,lat,lon,country_AR,country_BG,country_BR,country_CA,country_CN,country_CZ,country_DE,country_FR,country_GB,country_HK,country_IE,country_IL,country_IN,country_IT,country_JP,country_LV,country_NL,country_RU,country_SY,country_TW,country_UA,country_US,country_UY,country_VG
0,1,20,102,72,253,92,105,211,207,192,175,0,1,403,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,119,2,1,3,138,724,319,0,52.3667,4.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,20,687,222,229,143,162,24,129,3,214,0,1,200,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,39,2,1,3,58,13120,311,0,42.5181,-83.263,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,20,147,72,253,92,105,211,207,192,175,0,1,403,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,117,2,1,3,136,724,317,0,52.3667,4.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [79]:
ipgeo_cols = ['weekday','hrs','time_taken','c_ip_1','c_ip_2','c_ip_3','c_ip_4','c_ip_5','c_ip_6','c_ip_7','c_ip_8','sc_filter_result_DENIED','sc_filter_result_OBSERVED','sc_status','s_action_TCP_AUTH_HIT','s_action_TCP_AUTH_MISS','s_action_TCP_CLIENT_REFRESH','s_action_TCP_DENIED','s_action_TCP_ERR_MISS','s_action_TCP_HIT','s_action_TCP_MISS','s_action_TCP_MISS_RST','s_action_TCP_NC_MISS','s_action_TCP_NC_MISS_RST','s_action_TCP_PARTIAL_MISS','s_action_TCP_REFRESH_MISS',
's_action_TCP_TUNNELED','cs_method_CONNECT','cs_method_DESCRIBE','cs_method_GET','cs_method_HEAD','cs_method_OPTIONS','cs_method_POST','cs_method_PROPFIND','cs_method_PUT','cs_method_get','cs_uri_scheme_http','cs_uri_scheme_tcp','cs_uri_port','domain_len','cs_uri_path_len','cs_uri_port_len','cs_uri_query_len','cs_uri_ext_len','url_len','sc_bytes','cs_bytes','sum_cat','lat','lon','country_AR','country_BG','country_BR','country_CA','country_CN','country_CZ',
'country_DE','country_FR','country_GB','country_HK','country_IE','country_IL','country_IN','country_IT','country_JP','country_LV','country_NL','country_RU','country_SY','country_TW','country_UA','country_US','country_UY','country_VG']
print len(ipgeo_cols)

74


In [80]:
ip74_test = pd.read_csv(path_name + 'sgos422_ipskl_74.csv', names = ipgeo_cols, header=0, sep=',')

In [81]:
# should be 297793x74
ip74_test.shape

(297793, 74)

In [82]:
ip74_test.head(3)

Unnamed: 0,weekday,hrs,time_taken,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,sc_filter_result_DENIED,sc_filter_result_OBSERVED,sc_status,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,cs_uri_scheme_http,cs_uri_scheme_tcp,cs_uri_port,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sc_bytes,cs_bytes,sum_cat,lat,lon,country_AR,country_BG,country_BR,country_CA,country_CN,country_CZ,country_DE,country_FR,country_GB,country_HK,country_IE,country_IL,country_IN,country_IT,country_JP,country_LV,country_NL,country_RU,country_SY,country_TW,country_UA,country_US,country_UY,country_VG
0,1,20,102,72,253,92,105,211,207,192,175,0,1,403,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,119,2,1,3,138,724,319,0,52.3667,4.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,20,687,222,229,143,162,24,129,3,214,0,1,200,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,39,2,1,3,58,13120,311,0,42.5181,-83.263,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,20,147,72,253,92,105,211,207,192,175,0,1,403,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,13,117,2,1,3,136,724,317,0,52.3667,4.9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [50]:
#test.isnull().values.any()
print ip74_test.isnull().sum().sum()
print ip74_test.apply(np.isnan).sum().sum()

0
0


### cs(UserAgent) field conversion

In [186]:
from user_agents import parse

In [187]:
len(sg_matrix.csUserAgent.value_counts())
# https://developer.mozilla.org/en-US/docs/Browser_detection_using_the_user_agent

20472

In [188]:
# The observation here is that even at the tail end, when we use the whole string, there are common components
# with the most popular UserAgent strings
ua_series = sg_matrix.csUserAgent.value_counts()
total = ua_series.values.sum()
sum = 0
for i in range(0, total):
    sum += ua_series[i]
    if (ua_series[i] == 2):
        print i, sum
        print ua_series.index[i]
        break

12349 1953198
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; AskTbCLM/5.8.0.12304)


#### Parsing csUserAgent String to get the Browser/App + version and O/S + version

In [190]:
list_ua = list(sg_matrix.csUserAgent.value_counts().index)

In [191]:
ua_series.index[0]

'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'

In [192]:
import httpagentparser

def parseUserAgent(s):
    ua = httpagentparser.detect(s)
    if 'browser' in ua.keys():
        browser = ua['browser']['name']
        # Further splitting by '/'
        browser = browser.split('/')[0]
        if 'version' in ua['browser'].keys():
            browser_v = ua['browser']['version']
        else:
            browser_v = 0
    else:
        browser = s.split(' ')[0]
        # Further splitting by '/'
        browser = browser.split('/')[0]
        browser_v = 0
    
    if 'os' in ua.keys():
        os = ua['os']['name']
        if 'version' in ua['os'].keys():
            os_v = ua['os']['version']
        else:
            os_v = 0
    else:
        os = s
        os_v = 0
        
    tup = (s, browser, browser_v, os, os_v)
    return tup

In [193]:
parsed_ua = []
for i in range(0, len(list_ua)):
    tup = parseUserAgent(list_ua[i]) 
    parsed_ua.append(tup)

In [203]:
columns = ['user_agent', 'browser', 'browser_v', 'os', 'os_v']
ua_df = pd.DataFrame(data=parsed_ua, columns = columns)
ua_df.sort_values(by = ['browser'], inplace=True)

In [211]:
ua_df.head(3)

Unnamed: 0,user_agent,browser,browser_v,os,os_v
4804,%7BPRODUCT_NAME%7D/1.1 CFNetwork/485.13.9 Darwin/11.0.0,%7BPRODUCT_NAME%7D,0,%7BPRODUCT_NAME%7D/1.1 CFNetwork/485.13.9 Darwin/11.0.0,0
10636,%7BPRODUCT_NAME%7D/1.2.1 CFNetwork/485.2 Darwin/10.3.1,%7BPRODUCT_NAME%7D,0,%7BPRODUCT_NAME%7D/1.2.1 CFNetwork/485.2 Darwin/10.3.1,0
13337,%7BPRODUCT_NAME%7D/1.2.3 CFNetwork/485.10.2 Darwin/10.3.1,%7BPRODUCT_NAME%7D,0,%7BPRODUCT_NAME%7D/1.2.3 CFNetwork/485.10.2 Darwin/10.3.1,0


In [204]:
pd.options.display.max_colwidth = 200

In [205]:
print len(ua_df.browser.value_counts())
ua_df.browser.value_counts()

781


Microsoft Internet Explorer                                    12630
Firefox                                                         2086
Safari                                                           684
Chrome                                                           552
AVGINET10-WV7XX86                                                413
AVGINET10-WXPPX86                                                336
AndroidBrowser                                                   245
Mozilla                                                          243
AVGINET9-WXPPX86                                                 205
Opera                                                            190
AVGINET9-WV7XX86                                                 136
KUKU                                                             128
AVGINET10-WV7XX64                                                 86
BrowserNG                                                         77
AVGINET10-WV7HX64                 

In [227]:
nbrowsers = ua_df.browser.value_counts().sum()
print 'Total # of different parsed browsers:', nbrowsers
running = 0
top_browsers = []
for browser,num in ua_df.browser.value_counts().iteritems():
    running += num
    if float(running)/float(nbrowsers) <= .95:
        top_browsers.append((browser,num))
    else:
        break
print 'Cut-off for # of Browsers:', len(top_browsers)

Total # of different parsed browsers: 20472
Cut-off for # of Browsers: 74


### Result for different thresholds:  How to choose?
* .90 threshold: 21 browsers
* .95 threshold: 74 browsers
* .99 threshold: 576 browsers
* 1.0 threshold: 781

In [186]:
a = list(tmp.browser.value_counts().index)
a.sort()
a = set(a)
len(a)
a

{'%7BPRODUCT_NAME%7D',
 '%D8%A7%D9%84%D8%AA%D9%84%D9%81%D8%B2%D9%8A%D9%88%D9%86',
 '%D8%B7%D9%8A%D9%88%D8%B1%20%D8%A7%D9%84%D8%AC%D9%86%D8%A9',
 '%D9%82%D8%B1%D8%A7%D9%86%D9%8A',
 '%E7%94%B5%E8%A7%86%E7%9B%B4%E6%92%AD',
 "'A*1'6J",
 '(WSLib',
 '(null)',
 '-',
 '010-151',
 '119.161.218.27',
 '119.161.218.28',
 '1pBy8D1pBwjsizI0sxqwg_yq3xwwgmtxt29q-rkJAARMTEuMC4xLjQwMA==',
 '2.99.29W7',
 '2WirePKI',
 '3.0.23.1000',
 '360%20Browser',
 '360WestHD',
 '3gpp-gba',
 '3pSim2yGxqPlxR1zn4eCrnDVEcYQA8qTgAAAAALUE',
 '4P_WallpaperBoxHD',
 '4T+FLUfCRmGdFqqc3R4kRLB8P9wwH4qTgAAAAA',
 '4music',
 '4shared',
 '9P_RetinaWallpapers',
 'A1',
 'AAM',
 'ABCNews',
 'AFDM',
 'AHTTPConnection',
 'AOLSearchGadget',
 'APClient',
 'ASF4PW4hzbJoEAZOq1O2wTIN3nkD4YqTgAAAAA',
 'ASTUUpdate',
 'AVGADMINSERVER',
 'AVGADMINSERVER64',
 'AVGDM-WV7HX64',
 'AVGDM-WV7XX64',
 'AVGDM-WV7XX86',
 'AVGDM-WXPPX86',
 'AVGINET-VS6',
 'AVGINET10-2K3EDX86',
 'AVGINET10-2K8XDX64',
 'AVGINET10-8R2EDX64',
 'AVGINET10-ASWINX64',
 'AVGINET10-A

In [148]:
slice_by_slash = [s.split('/')[0] for s in a]
slice_by_slash = set(slice_by_slash)
print len(slice_by_slash)
slice_by_slash

781


{'%7BPRODUCT_NAME%7D',
 '%D8%A7%D9%84%D8%AA%D9%84%D9%81%D8%B2%D9%8A%D9%88%D9%86',
 '%D8%B7%D9%8A%D9%88%D8%B1%20%D8%A7%D9%84%D8%AC%D9%86%D8%A9',
 '%D9%82%D8%B1%D8%A7%D9%86%D9%8A',
 '%E7%94%B5%E8%A7%86%E7%9B%B4%E6%92%AD',
 "'A*1'6J",
 '(WSLib',
 '(null)',
 '-',
 '010-151',
 '119.161.218.27',
 '119.161.218.28',
 '1pBy8D1pBwjsizI0sxqwg_yq3xwwgmtxt29q-rkJAARMTEuMC4xLjQwMA==',
 '2.99.29W7',
 '2WirePKI',
 '3.0.23.1000',
 '360%20Browser',
 '360WestHD',
 '3gpp-gba',
 '3pSim2yGxqPlxR1zn4eCrnDVEcYQA8qTgAAAAALUE',
 '4P_WallpaperBoxHD',
 '4T+FLUfCRmGdFqqc3R4kRLB8P9wwH4qTgAAAAA',
 '4music',
 '4shared',
 '9P_RetinaWallpapers',
 'A1',
 'AAM',
 'ABCNews',
 'AFDM',
 'AHTTPConnection',
 'AOLSearchGadget',
 'APClient',
 'ASF4PW4hzbJoEAZOq1O2wTIN3nkD4YqTgAAAAA',
 'ASTUUpdate',
 'AVGADMINSERVER',
 'AVGADMINSERVER64',
 'AVGDM-WV7HX64',
 'AVGDM-WV7XX64',
 'AVGDM-WV7XX86',
 'AVGDM-WXPPX86',
 'AVGINET-VS6',
 'AVGINET10-2K3EDX86',
 'AVGINET10-2K8XDX64',
 'AVGINET10-8R2EDX64',
 'AVGINET10-ASWINX64',
 'AVGINET10-A

In [149]:
slice_by_dash = [s.split('-')[0] for s in slice_by_slash]
slice_by_dash = set(slice_by_dash)
print len(slice_by_dash)
slice_by_dash

692


{'',
 '%7BPRODUCT_NAME%7D',
 '%D8%A7%D9%84%D8%AA%D9%84%D9%81%D8%B2%D9%8A%D9%88%D9%86',
 '%D8%B7%D9%8A%D9%88%D8%B1%20%D8%A7%D9%84%D8%AC%D9%86%D8%A9',
 '%D9%82%D8%B1%D8%A7%D9%86%D9%8A',
 '%E7%94%B5%E8%A7%86%E7%9B%B4%E6%92%AD',
 "'A*1'6J",
 '(WSLib',
 '(null)',
 '010',
 '119.161.218.27',
 '119.161.218.28',
 '1pBy8D1pBwjsizI0sxqwg_yq3xwwgmtxt29q',
 '2.99.29W7',
 '2WirePKI',
 '3.0.23.1000',
 '360%20Browser',
 '360WestHD',
 '3gpp',
 '3pSim2yGxqPlxR1zn4eCrnDVEcYQA8qTgAAAAALUE',
 '4P_WallpaperBoxHD',
 '4T+FLUfCRmGdFqqc3R4kRLB8P9wwH4qTgAAAAA',
 '4music',
 '4shared',
 '9P_RetinaWallpapers',
 'A1',
 'AAM',
 'ABCNews',
 'AFDM',
 'AHTTPConnection',
 'AOLSearchGadget',
 'APClient',
 'ASF4PW4hzbJoEAZOq1O2wTIN3nkD4YqTgAAAAA',
 'ASTUUpdate',
 'AVGADMINSERVER',
 'AVGADMINSERVER64',
 'AVGDM',
 'AVGINET',
 'AVGINET10',
 'AVGINET8',
 'AVGINET9',
 'AVWebProt',
 'Adobe',
 'AdobeStockPhotos',
 'Agent7759161',
 'Agent9264773',
 'Air%20Penguin',
 'Akamai',
 'Akregator',
 'AlarmClock',
 'AlarmClockFree',
 'AmU5U

### rsContentType

In [198]:
print len(sg_matrix.rsContentType.value_counts())
sg_matrix.rsContentType.value_counts()

550


text/html                                                                                                      487752
image/gif                                                                                                      256756
image/jpeg                                                                                                     246128
text/html;charset=UTF-8                                                                                        146013
-                                                                                                              103056
image/png                                                                                                       75996
text/javascript                                                                                                 70482
text/html;%20charset=utf-8                                                                                      62463
application/x-fcs                                       

In [167]:
tmp_df = pd.DataFrame(list_ips.index, columns = ['ip'])
tmp_df = tmp_df[is_ip]
len(tmp_df)
tmp_df

Unnamed: 0,ip
0,88.208.24.131
1,88.208.24.132
2,88.208.24.194
3,46.229.160.7
4,88.208.24.196
5,88.208.24.138
6,216.245.211.226
7,194.187.98.230
8,194.187.98.229
9,194.187.98.231


In [182]:
len(tmp_df)

1019

### e) Processing Stage 5:  Further segment data set
* Separate out IP addresses from Domain names
* Separate Internal vs. external IP addr

In [86]:
cols127 = ['dt','tm','time_taken','c_ip','cs_username','cs_auth_group','x_exception_id','sc_filter_result','cs_categories','csReferer','sc_status','s_action','cs_method','rsContentType','cs_uri_scheme','Domain','cs_uri_port','cs_uri_path','cs_uri_query','cs_uri_extension','csUserAgent','s_ip','sc_bytes','cs_bytes','x_virus_id','Academic_Fraud','Adult_Themes','Adware','Alcohol','Anime_Manga_Webcomic','Auctions','Automotive','Blogs','Business_Services','Chat','Classifieds','Dating','Drugs','Ecommerce_Shopping','Educational_Institutions','File_Storage','Financial_Institutions','Forums_Message_boards','Gambling','Games','German_Youth_Protection','Government','Hate_Discrimination','Health_and_Fitness','Humor','Instant_Messaging','Jobs_Employment','Lingerie_Bikini','Movies','Music','News_Media','Non-Profits','Nudity','P2P_File_sharing','Parked_Domains','Photo_Sharing','Podcasts','Politics','Pornography','Portals','Proxy_Anonymizer',
 'Radio','Religious','Research_Reference','Search_Engines','Sexuality','Social_Networking','Software_Technology','Sports','Tasteless','Television','Tobacco','Travel','Video_Sharing','Visual_Search_Engines','Weapons','Web_Spam','Webmail','dtm','weekday','hrs','sc_filter_result_DENIED','sc_filter_result_OBSERVED','cs_uri_scheme_http','cs_uri_scheme_tcp','s_action_TCP_AUTH_HIT','s_action_TCP_AUTH_MISS','s_action_TCP_CLIENT_REFRESH','s_action_TCP_DENIED','s_action_TCP_ERR_MISS','s_action_TCP_HIT','s_action_TCP_MISS','s_action_TCP_MISS_RST','s_action_TCP_NC_MISS','s_action_TCP_NC_MISS_RST','s_action_TCP_PARTIAL_MISS','s_action_TCP_REFRESH_MISS','s_action_TCP_TUNNELED','cs_method_CONNECT','cs_method_DESCRIBE','cs_method_GET','cs_method_HEAD','cs_method_OPTIONS','cs_method_POST','cs_method_PROPFIND','cs_method_PUT','cs_method_get','c_ip_1','c_ip_2','c_ip_3','c_ip_4','c_ip_5','c_ip_6','c_ip_7','c_ip_8','domain_len','cs_uri_path_len','cs_uri_port_len','cs_uri_query_len',
 'cs_uri_ext_len','url_len','sum_cat']
print len(cols127)
sg127_test = pd.read_csv(path_name + 'sgos422_enrich_127.csv', names = cols127, sep =',', header=0)
print sg127_test.shape

127
(1964138, 127)


In [99]:
#ip_domain_df = sg_matrix[sg_matrix.Domain.apply(check_ip)]
domain_only = -sg127_test.Domain.apply(check_ip)

In [100]:
sum(domain_only)

1666345

In [101]:
ip_only = sg127_test.Domain.apply(check_ip)

In [102]:
sum(ip_only)

297793

In [103]:
sg127_dom = sg127_test[domain_only]

In [104]:
print sg127_dom.shape

(1666345, 127)


In [110]:
sumcatNot0 = sg127_dom.sum_cat != 0

In [111]:
sum(sumcatNot0)

1193226

In [112]:
sg127_domcat = sg127_dom[sumcatNot0]

In [118]:
sg127_domcat.shape

(1193226, 127)

In [124]:
sum(sg127_domcat.Domain.apply(check_ip))

0

In [115]:
numeric_cols = ['weekday','hrs','time_taken','c_ip_1','c_ip_2','c_ip_3','c_ip_4','c_ip_5','c_ip_6','c_ip_7','c_ip_8','sc_filter_result_DENIED','sc_filter_result_OBSERVED','sc_status','s_action_TCP_AUTH_HIT','s_action_TCP_AUTH_MISS','s_action_TCP_CLIENT_REFRESH','s_action_TCP_DENIED','s_action_TCP_ERR_MISS','s_action_TCP_HIT','s_action_TCP_MISS','s_action_TCP_MISS_RST','s_action_TCP_NC_MISS','s_action_TCP_NC_MISS_RST','s_action_TCP_PARTIAL_MISS','s_action_TCP_REFRESH_MISS',
's_action_TCP_TUNNELED','cs_method_CONNECT','cs_method_DESCRIBE','cs_method_GET','cs_method_HEAD','cs_method_OPTIONS','cs_method_POST','cs_method_PROPFIND','cs_method_PUT','cs_method_get','cs_uri_scheme_http','cs_uri_scheme_tcp','cs_uri_port','domain_len','cs_uri_path_len','cs_uri_port_len','cs_uri_query_len','cs_uri_ext_len','url_len','sc_bytes','cs_bytes','Academic_Fraud','Adult_Themes','Adware','Alcohol','Anime_Manga_Webcomic','Auctions','Automotive','Blogs','Business_Services','Chat',
'Classifieds','Dating','Drugs','Ecommerce_Shopping','Educational_Institutions','File_Storage','Financial_Institutions','Forums_Message_boards','Gambling','Games','German_Youth_Protection','Government','Hate_Discrimination','Health_and_Fitness',   'Humor','Instant_Messaging','Jobs_Employment','Lingerie_Bikini','Movies','Music','News_Media','Non-Profits','Nudity','P2P_File_sharing','Parked_Domains','Photo_Sharing','Podcasts','Politics','Pornography','Portals','Proxy_Anonymizer',
'Radio','Religious','Research_Reference','Search_Engines','Sexuality','Social_Networking','Software_Technology','Sports','Tasteless','Television','Tobacco','Travel','Video_Sharing','Visual_Search_Engines','Weapons','Web_Spam','Webmail','sum_cat'
]
print len(numeric_cols)

106


In [119]:
sgos_domcat_106 = sg127_domcat[numeric_cols]

In [121]:
print sgos_domcat_106.shape
sgos_domcat_106.head(2)

(1193226, 106)


Unnamed: 0,weekday,hrs,time_taken,c_ip_1,c_ip_2,c_ip_3,c_ip_4,c_ip_5,c_ip_6,c_ip_7,c_ip_8,sc_filter_result_DENIED,sc_filter_result_OBSERVED,sc_status,s_action_TCP_AUTH_HIT,s_action_TCP_AUTH_MISS,s_action_TCP_CLIENT_REFRESH,s_action_TCP_DENIED,s_action_TCP_ERR_MISS,s_action_TCP_HIT,s_action_TCP_MISS,s_action_TCP_MISS_RST,s_action_TCP_NC_MISS,s_action_TCP_NC_MISS_RST,s_action_TCP_PARTIAL_MISS,s_action_TCP_REFRESH_MISS,s_action_TCP_TUNNELED,cs_method_CONNECT,cs_method_DESCRIBE,cs_method_GET,cs_method_HEAD,cs_method_OPTIONS,cs_method_POST,cs_method_PROPFIND,cs_method_PUT,cs_method_get,cs_uri_scheme_http,cs_uri_scheme_tcp,cs_uri_port,domain_len,cs_uri_path_len,cs_uri_port_len,cs_uri_query_len,cs_uri_ext_len,url_len,sc_bytes,cs_bytes,Academic_Fraud,Adult_Themes,Adware,...,Chat,Classifieds,Dating,Drugs,Ecommerce_Shopping,Educational_Institutions,File_Storage,Financial_Institutions,Forums_Message_boards,Gambling,Games,German_Youth_Protection,Government,Hate_Discrimination,Health_and_Fitness,Humor,Instant_Messaging,Jobs_Employment,Lingerie_Bikini,Movies,Music,News_Media,Non-Profits,Nudity,P2P_File_sharing,Parked_Domains,Photo_Sharing,Podcasts,Politics,Pornography,Portals,Proxy_Anonymizer,Radio,Religious,Research_Reference,Search_Engines,Sexuality,Social_Networking,Software_Technology,Sports,Tasteless,Television,Tobacco,Travel,Video_Sharing,Visual_Search_Engines,Weapons,Web_Spam,Webmail,sum_cat
1,1,20,216,97,84,217,25,248,213,102,144,0,1,200,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,15,16,2,14,1,48,473,1129,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,3
4,1,20,615,150,186,89,147,196,3,161,117,0,1,302,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,80,17,27,2,1,3,50,378,474,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2


In [125]:
print sgos_domcat_106.apply(np.isnan).sum().sum()
print sgos_domcat_106.isnull().sum().sum()

0
0


In [122]:
with open(path_name + 'sgos422_domcat_106.csv', 'w') as fwrite:
   sgos_domcat_106.to_csv(path_name + 'sgos422_domcat_106.csv', sep=',', names = numeric_cols)