<a href="https://colab.research.google.com/github/YG15/DataHack2019/blob/master/Copy_of_The_whale_and_the_petunias_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import basic packages
import pandas as pd
import numpy as np
import json
from urllib import request
import random
from sklearn.ensemble import IsolationForest

from google.colab import files
from google.colab import drive
# Mount Google Drive under /content/drive (Colab only); a later cell copies a CSV into it
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Functions for categorical to dummy variables transformation
def diff_list(li1, li2):
    """Return the distinct items of ``li1`` that do not occur in ``li2``.

    Items come back in the order they first appear in ``li1``, so the result
    is reproducible from run to run (a plain set difference yields an
    arbitrary order).

    Parameters
    ----------
    li1 : iterable of hashable items to filter.
    li2 : iterable of hashable items to exclude.

    Returns
    -------
    list
        Unique items of ``li1`` missing from ``li2``, in first-seen order.
    """
    excluded = set(li2)
    # dict.fromkeys de-duplicates while keeping insertion order
    return [item for item in dict.fromkeys(li1) if item not in excluded]

def get_categ_col_dummy(df):
    """Replace every non-numeric column of ``df`` with one-hot dummy columns.

    Non-numeric values are lower-cased first so that values differing only in
    case end up in the same category. The input frame is left untouched; a new
    frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose non-numeric columns hold strings (or missing values).

    Returns
    -------
    pandas.DataFrame
        The numeric columns of ``df`` followed by the dummy columns produced
        by ``pd.get_dummies``.
    """
    numerical_columns = set(df.select_dtypes(include='number').columns)
    # Keep the original column order so the output layout is deterministic
    categorical_cols = [col for col in df.columns if col not in numerical_columns]
    if not categorical_cols:
        # Nothing to encode; still hand back a new object like the normal path does
        return df.copy()

    # to prevent case-sensitive separation to different categories
    # (column-wise: each column is lower-cased in one vectorised call)
    categorial = df[categorical_cols].apply(lambda col: col.str.lower())
    categorial_dummies = pd.get_dummies(categorial)
    # Non-inplace drop: the caller's DataFrame must not be modified
    return df.drop(columns=categorical_cols).join(categorial_dummies)

In [0]:
# Load device csv from s3
# Reads the device table straight from the S3 URL; the first CSV column becomes the row index.
url = 'https://armis-datahack.s3.amazonaws.com/all_devices.csv'
devices_df = pd.read_csv(url, index_col = 0)
# Preview the first rows
devices_df.head()

In [0]:
# Remove categorical features with very high number of classes and turn to dummy the rest
keep_os = ['Android', 'iOS', 'Mac OS X', 'Windows', 'watchOS', 'Watch OS', 'Tizen', 'Symbian', 'Miui OS', 'Linux',
       'Axis Firmware', 'Firefox OS', 'Cisco IOS-XE', 'Cisco IOS', 'Debian', 'Link-OS', 'Cisco NX-OS']

# Any operating system outside the keep-list (missing values included) is collapsed into 'OTHER'
is_kept_os = devices_df['operating_system'].isin(keep_os)
devices_df['operating_system'] = devices_df['operating_system'].where(is_kept_os, 'OTHER')

# These three columns are discarded rather than one-hot encoded
devices_df = devices_df.drop(columns=['operating_system_version', 'model', 'manufacturer'])
devices_df = get_categ_col_dummy(devices_df)
devices_df.head()

In [0]:
# Load sessions csv from s3
url = 'https://armis-datahack.s3.amazonaws.com/all_sessions.csv'
sessions_df = pd.read_csv(url)

# Composite key "<device_id>.<network_id>" parsed as a float.
# NOTE(review): this encoding is not collision-free (device 1 on network 1 and
# device 1 on network 10 both become 1.1); the aggregation below keys on the
# separate network_id / device_id columns instead.
sessions_df['real_device'] = [float(str(dev)+'.'+str(net))
                              for dev, net in zip(sessions_df.device_id, sessions_df.network_id)]

# device_id is deliberately kept: the per-device groupby, the merge with the
# device table and the submission frame all read it, so dropping it here makes
# a fresh top-to-bottom run fail with a KeyError.
sessions_df.head()

In [0]:
# log transformation for relevant numerical features
numeric_col = [
    'packets_count', 'outbound_bytes_count', 'inbound_bytes_count',
    'packet_loss', 'retransmit_count', 'latency', 'session_count',
    'outbound_packets_count', 'inbound_packets_count',
    'outbound_bytes_max', 'outbound_bytes_min', 'outbound_bytes_mean',
    'outbound_bytes_median', 'outbound_bytes_stddev',
    'inbound_bytes_max', 'inbound_bytes_min', 'inbound_bytes_mean',
    'inbound_bytes_median', 'inbound_bytes_stddev',
    'outbound_packet_size_max', 'outbound_packet_size_min',
    'outbound_packet_size_mean', 'outbound_packet_size_median',
    'outbound_packet_size_stddev',
    'inbound_packet_size_max', 'inbound_packet_size_min',
    'inbound_packet_size_mean', 'inbound_packet_size_median',
    'inbound_packet_size_stddev',
]

# Replace each listed column by log10(value + 1); the +1 keeps zeros finite.
# Columns that were read as strings (object dtype) are skipped.
# Note: this rewrites sessions_df in place, so running the cell twice applies the log twice.
for feature in numeric_col:
    column = sessions_df[feature]
    if column.dtype == 'object':
        continue
    sessions_df[feature] = np.log10(column + 1)
sessions_df.head()

In [0]:
# Group sessions by network_id/device_id, using different metrics/stats for the various features

def range_list(val_list):
    """Return the spread of ``val_list``: its largest value minus its smallest."""
    lowest, highest = min(val_list), max(val_list)
    return highest - lowest
  
# Summary statistics computed for every numeric session feature
classic_functions = [np.nanmin,np.nanmax,np.nanmean,np.nanstd, np.nanmedian]

# Identifier-like columns: number of distinct values per device.
# timestamp: distinct values, number of rows, and max-min span (named 'range').
aggregations = {'host':'nunique',
                'host_ip':'nunique',
                'port_dst':'nunique',
                'timestamp':['nunique','count',('range', range_list)],
                'transport_protocol':'nunique'}
# Every column listed in numeric_col gets the full set of summary statistics
# (same columns, same order as the log-transform list, so the two cannot drift apart).
aggregations.update({feature: classic_functions for feature in numeric_col})

df_gb = sessions_df.groupby(['network_id','device_id'],as_index=False).agg(aggregations)

# Flatten the two-level column index: ('latency', 'nanmin') -> 'latencynanmin';
# the group keys have an empty second level and keep their plain names.
# Iterating a MultiIndex yields the tuples directly, so no ravel() is needed.
df_gb.columns = ["".join(level_names) for level_names in df_gb.columns]

In [0]:
# Examine most varied features
# Build one summary record per aggregated column and create the frame in a
# single call: DataFrame.append no longer exists in pandas 2.x and growing a
# frame row by row is quadratic.
variety_records = [{'feature': c,
                    'max': df_gb[c].max(),
                    'min': df_gb[c].min(),
                    'std': df_gb[c].std()}
                   for c in df_gb.columns]
variety_df = pd.DataFrame(variety_records, columns=['feature', 'max', 'min', 'std'])

# Feature names ordered from the largest to the smallest standard deviation
variety_df.sort_values('std', ascending=False)['feature'].values

In [0]:
# merge device and groupby data frames
# Right join: every aggregated (network_id, device_id) row is kept, with device
# attributes attached where the device table has a matching row.
join_keys = ['network_id', 'device_id']
final_df = devices_df.merge(df_gb, how='right', on=join_keys)
print(devices_df.shape, df_gb.shape, final_df.shape)

(76261, 27) (76038, 154) (76038, 179)


In [0]:
# save to folder
# Write the aggregated per-device session features to a CSV in the working
# directory (to_csv also writes the row index as the first, unnamed column).
# NOTE(review): the file name says "no_log", but sessions_df was log10-transformed
# before the aggregation that produced df_gb -- confirm the name is intended.
df_gb.to_csv('only_session_Features_no_log.csv')
# Copy the file into the mounted Google Drive folder (path is relative to the working directory)
!cp only_session_Features_no_log.csv drive/My\ Drive/

In [0]:
# Isolation forest model function
def run_IF(df):
    """Fit an Isolation Forest on ``df`` and score those same rows.

    Requires a scikit-learn release in which ``behaviour='new'`` is the only
    behaviour (0.22 or later).

    Parameters
    ----------
    df : feature table used both to fit the model and to be scored.

    Returns
    -------
    numpy.ndarray
        One score per row of ``df``. The most anomalous row scores 0 and
        larger values mean more normal, so callers negate the result to get
        an "anomaly confidence".
    """
    # The `behaviour` argument has been removed from scikit-learn and
    # contamination=0 is no longer accepted; 'auto' is valid and does not
    # influence score_samples.
    model = IsolationForest(max_samples=100, random_state=1,
                            contamination='auto')
    model.fit(df)
    raw_scores = model.score_samples(df)
    # With contamination=0 the old decision_function subtracted the 0th
    # percentile (i.e. the minimum) of the training scores; do the same
    # explicitly so the returned values keep their previous scale.
    anomaly_scores = raw_scores - raw_scores.min()

    return anomaly_scores

In [0]:
# Train model on selected features
selected_columns = ['device_id', 'network_id', 'outbound_packets_countnanmean',
                    'packets_countnanmin', 'inbound_packets_countnanstd',
                    'timestampcount', 'inbound_packets_countnanmin',
                    'outbound_packet_size_maxnanmax', 'retransmit_countnanmean',
                    'latencynanmin', 'retransmit_countnanmedian', 'host_ipnunique',
                    'latencynanmax', 'hostnunique']
# Any selected column that contains a missing value is removed entirely
df_features = df_gb[selected_columns].dropna(axis=1)

# Model input: everything except the device identifier (network_id stays in as a feature)
X = df_features.drop(columns=['device_id'])

# Run model; the sign is flipped so that larger values mean "more anomalous"
anomaly_scores = -run_IF(X)

# Save results
results = pd.DataFrame({'network_id': df_features['network_id'],
                        'device_id': df_features['device_id'],
                        'confidence': anomaly_scores})

In [0]:
# Show the first rows of the submission table, then its (rows, columns) shape
preview_rows = results.head()
display(preview_rows)
print(results.shape)

Unnamed: 0,network_id,device_id,confidence
0,0,33,-0.429968
1,0,35,-0.266061
2,0,40,-0.191984
3,0,41,-0.42976
4,0,53,-0.434053


(76038, 3)


In [0]:
# Submission

# Serialise the results row by row (a list of per-row value lists); going
# through to_json/json.loads turns the values into plain JSON-compatible
# Python types.
arr_to_submit = results.to_json(orient='values')

leaderboard_name = "armis"
host = "leaderboard.datahack.org.il"

# Name of the user
submitter = "The Whale and the Petunias"

predictions = json.loads(arr_to_submit)

jsonStr = json.dumps({'submitter': submitter, 'predictions': predictions})
data = jsonStr.encode('utf-8')
# Supplying `data` makes urllib send this as a POST request
req = request.Request(f"https://{host}/{leaderboard_name}/api/",
                      headers={'Content-Type': 'application/json'},
                      data=data)
# Context manager guarantees the HTTP response (and its socket) is closed,
# even if decoding the reply fails.
with request.urlopen(req) as resp:
    print(json.load(resp))

{'member': 'The Whale and the Petunias_y2', 'rank': 15, 'score': 0.7992275174894156}
