# AddTracking Fraud Detection: Exploratory Data Analysis

In [61]:
import numpy as np         # linear algebra
import sklearn as sk       # machine learning
import pandas as pd        # reading in data files, data cleaning
import matplotlib.pyplot as plt   # for plotting
import seaborn as sns      # visualization tool
import tensorflow as tf
import keras
import pickle



In [49]:

data = pd.read_csv("data/train.csv", nrows=round(10e6))

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0
5,18787,3,1,16,379,2017-11-06 14:36:26,,0
6,103022,3,1,23,379,2017-11-06 14:37:44,,0
7,114221,3,1,19,379,2017-11-06 14:37:59,,0
8,165970,3,1,13,379,2017-11-06 14:38:10,,0
9,74544,64,1,22,459,2017-11-06 14:38:23,,0


In [50]:
data.head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0
5,18787,3,1,16,379,2017-11-06 14:36:26,,0
6,103022,3,1,23,379,2017-11-06 14:37:44,,0
7,114221,3,1,19,379,2017-11-06 14:37:59,,0
8,165970,3,1,13,379,2017-11-06 14:38:10,,0
9,74544,64,1,22,459,2017-11-06 14:38:23,,0


In [3]:
# Let's see the data types for all our data
data.dtypes

ip                  int64
app                 int64
device              int64
os                  int64
channel             int64
click_time         object
attributed_time    object
is_attributed       int64
dtype: object

In [52]:
# We can see that most off the 'id' types are integer data types when really they should be categorical.  
# Changing them to categorical will let pandas give us statistics such as `unique, top, freq`

features = ["ip","app","device","os","channel","is_attributed"]
for feature in features:
    data[feature] = data[feature].astype("category")

data.dtypes


ip                 category
app                category
device             category
os                 category
channel            category
click_time           object
attributed_time      object
is_attributed      category
dtype: object

In [53]:
# Let's convert the click_time field into a timestamp object that Pandas knows how to manipulate (i.e order)
# Then pandas will give us stastics such as "first" and "last" click_time values 
data["click_time"] = pd.to_datetime(data['click_time'])
data["attributed_time"] = pd.to_datetime(data['attributed_time'])


In [6]:
data.describe()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
count,10000000.0,10000000.0,10000000.0,10000000.0,10000000.0,10000000,18717,10000000.0
unique,68740.0,332.0,940.0,292.0,170.0,29943,15698,2.0
top,73516.0,12.0,1.0,19.0,245.0,2017-11-06 16:05:10,2017-11-06 23:36:23,0.0
freq,51711.0,1291185.0,9381146.0,2410148.0,793105.0,1261,6,9981283.0
first,,,,,,2017-11-06 14:32:21,2017-11-06 16:00:47,
last,,,,,,2017-11-07 00:12:03,2017-11-07 15:59:53,


Let's look at the distribution of '1's and '0's for is_attributed column

### Analysis

`click_time`: Our 10 million clicks were obtained on 2017, December 6th to December 7th.  So the following data spans around 9 hours.  Are there patterns between click count and click time ?  

`attributed_time`: Wow, so it looks like there are only 18717 non-null values for `attributed_time`. Which means only 18717 values have `1` for `is_attributed` and the rest clicks are `0` 

`ip`: If there are only 68740 unique IP addresses out of 10 million clicks.  Also, the IP with the most clicks had over 50,000 clicks!  Either that IP was a bot or IP addresses are network IP's rather than device level IPs

What I'm curious is the click counts for each `app`, `device`, `os`, and `channel` features.  Are they distributed evenly or do most clicks get aggregated to only a few id's for each feature. 


In [21]:
# Let's count the number of clicks with `1` as `is_attributed` to verify that it is only 18717
data[data["is_attributed"]==1]['is_attributed'].count()

18717

So we were correct, there are only 18,717 clicks that are attributed (non fraudalent) the rest are all fraudalent

Let's try to see patterns between click count and click time

In [None]:
# Just for practice, let's plot a bar plot on the frequencies of the most freqeunt app



## Experementing with the Random Forest Model

In [None]:
# Feature engineering for click time

s_sec = data['click_time'].dt.second
m_sec = data['click_time'].dt.minute*60
hr_sec = data['click_time'].dt.hour*3600

data['click_time_secs'] = s_sec + m_sec + hr_sec

In [58]:

X = data[['app', 'device', 'os', 'channel', 'click_time_secs']]
y = data['is_attributed']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

In [59]:
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier()
RandomForest.fit(X_train,y_train)
rf_score = RandomForest.score(X_test,y_test)
print("Random Forest accuracy:", rf_score)



Random Forest accuracy: 0.997813


In [64]:
# Saving Model
model = RandomForest
model_file = "RF.sav"
with open(model_file,'wb') as out:
    pickle.dump(model, out)


In [65]:
# Testing loading Model
with open(model_file, 'rb') as inp:
    model = pickle.load(inp)
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [67]:
rf_score = RandomForest.score(X_test,y_test)
rf_score

0.997813

We got over 99% accuracy on our validation set!!! However, it turns out that this model is overfitting teribly on the test set.  In the kaggle given training sets, 99% of clicks are `0` for is_attribute.  However in the kaggle test set, we figured out that is_attribute values are evenly split between `1` and `0`.

This unbalance between the training and test sets explains the overfitting. 

Thus we decided to create an `equalized_training` set with even distrubution of is_attribute values.  