In [11]:
import pandas as pd
import numpy as np
import csv as csv
import math
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
import time
import time
from datetime import date
import datetime
from sklearn.decomposition import PCA, IncrementalPCA

%matplotlib inline

#### Concatenate the two training DataFrames

In [2]:
# Read both halves of the training set; row 0 of each CSV holds the column names.
train_df1 = pd.read_csv(
    '../hackerrank-predict-email-opens-dataset/training_dataset.csv/training_dataset.csv',
    header=0)
train_df2 = pd.read_csv(
    '../hackerrank-predict-email-opens-dataset/training_dataset.csv/training_dataset_complement.csv',
    header=0)

# Stack the complement rows under the main rows (row labels are kept as read).
train_df = pd.concat([train_df1, train_df2], axis=0)
train_df.shape

(486048, 54)

#### Dropping a few columns

In [3]:
# Columns that are removed before modelling.
dropped_columns = [
    'user_id', 'mail_id', 'mail_type', 'clicked', 'hacker_timezone',
    'mail_category', 'sent_time', 'unsubscribed',
    'open_time', 'click_time', 'unsubscribe_time', 'hacker_created_at',
]
train_df = train_df.drop(dropped_columns, axis=1)
train_df.head()

Unnamed: 0,last_online,contest_login_count,contest_login_count_1_days,contest_login_count_30_days,contest_login_count_365_days,contest_login_count_7_days,contest_participation_count,contest_participation_count_1_days,contest_participation_count_30_days,contest_participation_count_365_days,...,submissions_count_contest,submissions_count_contest_1_days,submissions_count_contest_30_days,submissions_count_contest_365_days,submissions_count_contest_7_days,submissions_count_master,submissions_count_master_1_days,submissions_count_master_30_days,submissions_count_master_365_days,submissions_count_master_7_days
0,1459520000.0,1,0,0,1,0,1,0,0,1,...,0,0,0,0,0,13,0,0,13,0
1,1461210000.0,3,0,1,3,0,3,0,1,3,...,16,0,3,16,0,83,0,43,83,4
2,1463411000.0,3,0,0,3,0,3,0,0,3,...,0,0,0,0,0,16,0,3,16,0
3,1462768000.0,3,0,0,3,0,3,0,0,3,...,16,0,0,16,0,85,0,9,85,0
4,1461248000.0,5,0,0,5,0,13,0,0,13,...,17,0,1,17,0,43,0,0,43,0


#### Converting Boolean to Int

In [4]:
# Encode the two boolean columns as integers: True -> 1, False -> 0.
bool_to_int = {True: 1, False: 0}
for flag_col in ['opened', 'hacker_confirmation']:
    train_df[flag_col] = train_df[flag_col].map(bool_to_int).astype(int)
train_df.shape


(486048, 42)

#### Dropping rows with NA or Null Values (last_online is the only attribute with NA or Null values)

In [5]:
# Keep only the rows whose `last_online` value is present.
train_df = train_df.dropna(axis=0, how='any', subset=['last_online'])
train_df.shape

(485471, 42)

#### Sorting columns for data consistency

In [6]:
# Order the columns alphabetically so the column layout is deterministic.
# Plain column selection is used instead of DataFrame.reindex_axis, which
# newer pandas releases deprecated and later removed; the result is the same.
train_df = train_df[sorted(train_df.columns)]
train_df.shape

(485471, 42)

#### Moving the `opened` attribute to the first column of the DataFrame

In [7]:
# Put the `opened` label in column 0 and keep every other column in its
# current order. Column selection replaces the deprecated/removed
# DataFrame.reindex_axis call and behaves the same here.
train_df = train_df[['opened'] + [col for col in train_df.columns if col != 'opened']]
train_df.head()

Unnamed: 0,opened,contest_login_count,contest_login_count_1_days,contest_login_count_30_days,contest_login_count_365_days,contest_login_count_7_days,contest_participation_count,contest_participation_count_1_days,contest_participation_count_30_days,contest_participation_count_365_days,...,submissions_count_contest,submissions_count_contest_1_days,submissions_count_contest_30_days,submissions_count_contest_365_days,submissions_count_contest_7_days,submissions_count_master,submissions_count_master_1_days,submissions_count_master_30_days,submissions_count_master_365_days,submissions_count_master_7_days
0,1,1,0,0,1,0,1,0,0,1,...,0,0,0,0,0,13,0,0,13,0
1,0,3,0,1,3,0,3,0,1,3,...,16,0,3,16,0,83,0,43,83,4
2,0,3,0,0,3,0,3,0,0,3,...,0,0,0,0,0,16,0,3,16,0
3,0,3,0,0,3,0,3,0,0,3,...,16,0,0,16,0,85,0,9,85,0
4,0,5,0,0,5,0,13,0,0,13,...,17,0,1,17,0,43,0,0,43,0


#### Check for NA values

In [8]:
# Report every column that still contains missing values together with its
# NA count, or a single confirmation line when no column has any.
# print() is called with one pre-formatted string so the output is the same
# under Python 2 and Python 3 (the original print statements only parse on 2).
hasany = False
for cl in train_df.columns.values:
    n_missing = train_df[cl].isnull().sum()  # computed once per column
    hasNa = n_missing > 0
    if hasNa:
        hasany = True
        print('%s %s' % (cl, n_missing))
if not hasany:
    print('No NA found')
    

No NA found


#### Check data types for your model

In [9]:
# List each column name followed by its dtype, one per line.
# A single formatted string keeps the output identical on Python 2 and 3
# (the original print statement is a syntax error on Python 3).
for cl in train_df.columns.values:
    print('%s %s' % (cl, train_df[cl].dtype))

opened int32
contest_login_count int64
contest_login_count_1_days int64
contest_login_count_30_days int64
contest_login_count_365_days int64
contest_login_count_7_days int64
contest_participation_count int64
contest_participation_count_1_days int64
contest_participation_count_30_days int64
contest_participation_count_365_days int64
contest_participation_count_7_days int64
forum_comments_count int64
forum_count int64
forum_expert_count int64
forum_questions_count int64
hacker_confirmation int32
ipn_count int64
ipn_count_1_days int64
ipn_count_30_days int64
ipn_count_365_days int64
ipn_count_7_days int64
ipn_read int64
ipn_read_1_days int64
ipn_read_30_days int64
ipn_read_365_days int64
ipn_read_7_days int64
last_online float64
submissions_count int64
submissions_count_1_days int64
submissions_count_30_days int64
submissions_count_365_days int64
submissions_count_7_days int64
submissions_count_contest int64
submissions_count_contest_1_days int64
submissions_count_contest_30_days int64
su

In [13]:
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
from sklearn import ensemble 
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Column 0 of the matrix is used as the label; all remaining columns are features.
train_data = train_df.values
features = train_data[:, 1:]
labels = train_data[:, 0]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=123)

# Fit a 2-component PCA on the training split only, then project both splits.
n_components = 2
reduction_model = PCA(n_components=n_components).fit(x_train)
x_train_transformed = reduction_model.transform(x_train)
x_test_transformed = reduction_model.transform(x_test)

# Candidate estimators, left disabled in this cell:
# hipotese = linear_model.LogisticRegression(C=1e5)
# hipotese = tree.DecisionTreeClassifier(random_state=1234)
# hipotese = ensemble.RandomForestClassifier(random_state=1234)
# hipotese = svm.SVC()



Over-sampling
---------------

In [14]:
from imblearn.over_sampling import ADASYN

n_components = 2

# Refit the PCA projection on the training split only.
reduction_model = PCA(n_components=n_components)
reduction_model = reduction_model.fit(x_train)
x_train_transformed = reduction_model.transform(x_train)
# Project the test split with the model that was just refitted, so both
# splits come from the same fit. Reusing a projection left over from an
# earlier fit depends on hidden kernel state, and a refit is not guaranteed
# to reproduce it (NOTE(review): PCA may select a randomized solver for large
# inputs -- confirm against the installed scikit-learn version).
x_test_transformed = reduction_model.transform(x_test)

# Over-sample the PCA-projected training split with a default ADASYN sampler.
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(x_train_transformed, y_train)

hipotese = tree.DecisionTreeClassifier(random_state=1234)

# Train on the resampled data; evaluate on the test split, which is never resampled.
hipotese.fit(X_resampled, y_resampled)
y_true, y_pred = y_test, hipotese.predict(x_test_transformed)  # test-set predictions
print(classification_report(y_true, y_pred))  # per-class precision / recall / F1



             precision    recall  f1-score   support

        0.0       0.80      0.77      0.78     64915
        1.0       0.57      0.61      0.59     32180

avg / total       0.72      0.72      0.72     97095



In [15]:
from imblearn.over_sampling import RandomOverSampler

# Resample the PCA-projected training split with a default RandomOverSampler.
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))


             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80     64915
        1.0       0.59      0.60      0.59     32180

avg / total       0.73      0.73      0.73     97095



In [16]:
from imblearn.over_sampling import SMOTE

# Resample the PCA-projected training split with SMOTE, kind='regular'.
sm = SMOTE(kind='regular')
X_resampled, y_resampled = sm.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



             precision    recall  f1-score   support

        0.0       0.80      0.79      0.79     64915
        1.0       0.58      0.60      0.59     32180

avg / total       0.73      0.73      0.73     97095



In [17]:
from imblearn.over_sampling import SMOTE

# Resample the PCA-projected training split with SMOTE, kind='borderline1'.
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



             precision    recall  f1-score   support

        0.0       0.80      0.78      0.79     64915
        1.0       0.58      0.61      0.59     32180

avg / total       0.73      0.72      0.73     97095



In [None]:
from imblearn.over_sampling import SMOTE

# Resample the PCA-projected training split with SMOTE, kind='borderline2'.
sm = SMOTE(kind='borderline2')
X_resampled, y_resampled = sm.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



             precision    recall  f1-score   support

        0.0       0.80      0.77      0.78     64915
        1.0       0.57      0.61      0.59     32180

avg / total       0.72      0.72      0.72     97095



In [None]:
from imblearn.over_sampling import SMOTE

# Resample the PCA-projected training split with SMOTE, kind='svm'.
sm = SMOTE(kind='svm')
X_resampled, y_resampled = sm.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



Ensemble
---------

In [None]:
from imblearn.ensemble import BalanceCascade

# Resample the PCA-projected training split with a default BalanceCascade.
# NOTE(review): the imbalanced-learn ensemble samplers are documented as
# returning one resampled set per subset -- confirm the arrays are 2-D
# before fitting a single tree on them.
bc = BalanceCascade()
X_resampled, y_resampled = bc.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.ensemble import EasyEnsemble

# Resample the PCA-projected training split with a default EasyEnsemble.
# NOTE(review): the imbalanced-learn ensemble samplers are documented as
# returning one resampled set per subset -- confirm the arrays are 2-D
# before fitting a single tree on them.
ee = EasyEnsemble()
X_resampled, y_resampled = ee.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



Under-sampling
---------

In [None]:

from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import AllKNN

# Three under-samplers, all with default settings, evaluated in this order.
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()
allknn = AllKNN()

for label, sampler in [('ENN', enn), ('RENN', renn), ('AllKNN', allknn)]:
    print(label)
    # Resample the PCA-projected training split with the current sampler.
    X_resampled, y_resampled = sampler.fit_sample(x_train_transformed, y_train)

    # Train a fresh decision tree and report per-class precision / recall / F1
    # on the test split (which is never resampled).
    hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
    y_true = y_test
    y_pred = hipotese.predict(x_test_transformed)
    print(classification_report(y_true, y_pred))




In [None]:
from imblearn.under_sampling import ClusterCentroids

# Resample the PCA-projected training split with a default ClusterCentroids.
cc = ClusterCentroids()
X_resampled, y_resampled = cc.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:

from imblearn.under_sampling import CondensedNearestNeighbour

# Resample the PCA-projected training split with a default CondensedNearestNeighbour.
cnn = CondensedNearestNeighbour()
X_resampled, y_resampled = cnn.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

# Resample the PCA-projected training split with a default EditedNearestNeighbours.
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import InstanceHardnessThreshold

# Evaluate InstanceHardnessThreshold under-sampling at several `ratio` settings.
for ratio in [0.0, 0.1, 0.3, 0.5]:
    iht = InstanceHardnessThreshold(ratio=ratio)
    X_res, y_res = iht.fit_sample(x_train_transformed, y_train)

    hipotese = tree.DecisionTreeClassifier(random_state=1234)

    # The original line `print("Ratio: "ratio)` was a syntax error.
    print("Ratio: %s" % ratio)
    # Fit on the data resampled for THIS ratio; the original fitted on
    # X_resampled / y_resampled left over from a previously run cell, so every
    # iteration trained on the same stale data.
    hipotese.fit(X_res, y_res)
    y_true, y_pred = y_test, hipotese.predict(x_test_transformed)  # test-set predictions
    print(classification_report(y_true, y_pred))  # per-class precision / recall / F1



In [None]:
from imblearn.under_sampling import NearMiss

# Resample the PCA-projected training split with NearMiss, version 1.
nm1 = NearMiss(version=1)
X_resampled, y_resampled = nm1.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import NearMiss

# Resample the PCA-projected training split with NearMiss, version 2.
nm2 = NearMiss(version=2)
X_resampled, y_resampled = nm2.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import NearMiss

# Resample the PCA-projected training split with NearMiss, version 3.
nm3 = NearMiss(version=3)
X_resampled, y_resampled = nm3.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Resample the PCA-projected training split with a default NeighbourhoodCleaningRule.
ncl = NeighbourhoodCleaningRule()
X_resampled, y_resampled = ncl.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import OneSidedSelection

# Resample the PCA-projected training split with a default OneSidedSelection.
oss = OneSidedSelection()
X_resampled, y_resampled = oss.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the PCA-projected training split with a default RandomUnderSampler.
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

# Two under-samplers, both with default settings, evaluated in this order.
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

for label, sampler in [('ENN', enn), ('RENN', renn)]:
    print(label)
    # Resample the PCA-projected training split with the current sampler.
    X_resampled, y_resampled = sampler.fit_sample(x_train_transformed, y_train)

    # Train a fresh decision tree and report per-class precision / recall / F1
    # on the test split (which is never resampled).
    hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
    y_true = y_test
    y_pred = hipotese.predict(x_test_transformed)
    print(classification_report(y_true, y_pred))



In [None]:
from imblearn.under_sampling import TomekLinks

# Resample the PCA-projected training split with a default TomekLinks.
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_sample(x_train_transformed, y_train)

# Train a decision tree on the resampled data, then report per-class
# precision / recall / F1 on the test split (which is never resampled).
hipotese = tree.DecisionTreeClassifier(random_state=1234).fit(X_resampled, y_resampled)
y_true = y_test
y_pred = hipotese.predict(x_test_transformed)
print(classification_report(y_true, y_pred))

