In [4]:
import pandas as pd
import numpy as np
import csv as csv
import math
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
import time
from datetime import date
import datetime
%matplotlib inline

def saveFileForSubmission(predicted_lables,custonFileName='submission.csv',customHeader=''):
    result = np.c_[predicted_lables]

    np.savetxt(custonFileName, 
           result.astype(int), 
           delimiter=',', 
           header = customHeader, 
           comments = '', 
           fmt='%u')

In [5]:
def norm(dataframe,variable):
    mean = dataframe[variable].mean()
    max_val = dataframe[variable].max()
    min_val = dataframe[variable].min()
    dataframe[variable] = dataframe[variable].apply(lambda x: (x - mean) / (max_val -min_val+0.0001))
    return dataframe


In [6]:
def preProcessData(dataframe,train=True):
    mailTypes = ['mail_type_1', 'mail_type_2', 'mail_type_3', 'mail_type_4']
    mailCategories = ['mail_category_1', 'mail_category_10', 'mail_category_11',
                    'mail_category_12', 'mail_category_13', 'mail_category_14',
                    'mail_category_15', 'mail_category_16', 'mail_category_17',
                    'mail_category_18', 'mail_category_2', 'mail_category_3',
                    'mail_category_4', 'mail_category_5', 'mail_category_6',
                    'mail_category_7', 'mail_category_8', 'mail_category_9']
    mail_idRange = range(0,11)
    user_idRange = range(0,11)
    
       
    #last_online
    dataframe.loc[ (dataframe.last_online.notnull()),'last_online']=dataframe['last_online'].dropna().map(lambda x:(datetime.datetime.today()-datetime.datetime.fromtimestamp(x)).days)
    mean_last_online_days = dataframe.last_online.mean()
    dataframe.loc[ (dataframe.last_online.isnull()), 'last_online'] = mean_last_online_days
    dataframe = norm(dataframe,'last_online') 
    
    #mail_type
    mode_mail_type = dataframe.mail_type.dropna().mode().values
    dataframe.loc[ (dataframe.mail_type.isnull()), 'mail_type'] = mode_mail_type
    dataFrameMailTypesDiff = np.setdiff1d(mailTypes, np.unique(dataframe.mail_type)) 
    dummiesMail_type =  pd.get_dummies(dataframe.mail_type,prefix='col')
    dataframe = pd.concat([dataframe, dummiesMail_type], axis=1)
    for mailType in map(lambda x:"col_"+x,dataFrameMailTypesDiff):
        dataframe = pd.concat([dataframe,pd.DataFrame({mailType: np.zeros(dataframe.shape[0])})],axis=1)
    
    #mail_category
    mode_mail_category = dataframe.mail_category.dropna().mode().values
    dataframe.loc[ (dataframe.mail_category.isnull()), 'mail_category'] = mode_mail_category
    dataFrameMailCategoryDiff = np.setdiff1d(mailCategories, np.unique(dataframe.mail_category)) 
    dummiesMail_category =  pd.get_dummies(dataframe.mail_category,prefix='col')
    dataframe = pd.concat([dataframe, dummiesMail_category], axis=1)
    for mailCategoryItem in map(lambda x:"col_"+x,dataFrameMailCategoryDiff):
        dataframe = pd.concat([dataframe,pd.DataFrame({mailCategoryItem: np.zeros(dataframe.shape[0])})],axis=1)
    
    #hacker_confirmation
    dataframe['hacker_confirmation'] = dataframe.hacker_confirmation.map( {False: 0, True: 1} ).astype(int)
    
    #mail_id
    dataTemp  = dataframe['mail_id'].value_counts()
    hist, edges = np.histogram(dataTemp, bins=[ 1.00000000e+00,2.62150000e+03,5.24200000e+03,
                                               7.86250000e+03,1.04830000e+04,1.31035000e+04,
                                               1.57240000e+04,1.83445000e+04,2.09650000e+04,
                                               2.35855000e+04,2.62060000e+04])
    dataTempDict = dataTemp.to_dict()
    dataframe['mail_id'] = dataframe['mail_id'].map(lambda x: [i for i,v in enumerate(edges) if v<=dataTempDict[x]][-1] )
    dummiesMail_id =  pd.get_dummies(dataframe.mail_id,prefix='mail_id')
    dataframe = pd.concat([dataframe, dummiesMail_id], axis=1)
    #mail_id complete
    dataFrameMailIdDiff = np.setdiff1d(mail_idRange, np.unique(dataframe.mail_id)) 
    for mailIdItem in map(lambda x:"mail_id_"+str(x),dataFrameMailIdDiff):
        dataframe = pd.concat([dataframe,pd.DataFrame({mailIdItem: np.zeros(dataframe.shape[0])})],axis=1)
    
    #user_id 
    dataTemp  = dataframe['user_id'].value_counts()
    hist, edges = np.histogram(dataTemp, bins=[1.,11.5,22.,32.5,43.,53.5,64.,74.5,85.,95.5,106.])
    dataTempDict = dataTemp.to_dict()
    dataframe['user_id'] = dataframe['user_id'].map(lambda x: [i for i,v in enumerate(edges) if v<=dataTempDict[x]][-1] )
    dummiesUser_id =  pd.get_dummies(dataframe.user_id,prefix='user_id')
    dataframe = pd.concat([dataframe, dummiesUser_id], axis=1)
    #user_id complete
    dataFrameUserIdDiff = np.setdiff1d(user_idRange, np.unique(dataframe.user_id)) 
    for userIdItem in map(lambda x:"user_id_"+str(x),dataFrameUserIdDiff):
        dataframe = pd.concat([dataframe,pd.DataFrame({userIdItem: np.zeros(dataframe.shape[0])})],axis=1)
    
    #drop attributes
    dataframe = dataframe.drop(['user_id',
                                'mail_id',
                                'hacker_created_at',
                                'sent_time',
                                'hacker_timezone',
                                'mail_category',
                                'mail_type',
                                #'last_online'
                               ], axis=1) 
    dataframe = dataframe.reindex_axis(sorted(dataframe.columns), axis=1)
    if(train):
        dataframe['opened'] = dataframe.clicked.map( {False: 0, True: 1} ).astype(int)
        dataframe = dataframe.drop(['clicked','unsubscribed','open_time','click_time','unsubscribe_time'], axis=1) 
        dataframe = dataframe.reindex_axis(['opened'] + list([a for a in dataframe.columns if a != 'opened']), axis=1)



#     dataframe = dataframe.drop([
#                                 'contest_login_count',
#                                 'contest_login_count_1_days',
#                                 'contest_login_count_30_days',
#                                 'contest_login_count_365_days',
#                                 'contest_login_count_7_days'
#                                 ], axis=1) 
    
    dataframe = norm(dataframe,'contest_login_count') 
    dataframe = norm(dataframe,'contest_login_count_1_days') 
    dataframe = norm(dataframe,'contest_login_count_30_days') 
    dataframe = norm(dataframe,'contest_login_count_365_days') 
    dataframe = norm(dataframe,'contest_login_count_7_days') 

#     dataframe = dataframe.drop([
#                                 'contest_participation_count',
#                                 'contest_participation_count_1_days',
#                                 'contest_participation_count_30_days',
#                                 'contest_participation_count_365_days',
#                                'contest_participation_count_7_days'
#                                 ], axis=1) 

    dataframe = norm(dataframe,'contest_participation_count') 
    dataframe = norm(dataframe,'contest_participation_count_1_days') 
    dataframe = norm(dataframe,'contest_participation_count_30_days') 
    dataframe = norm(dataframe,'contest_participation_count_365_days') 
    dataframe = norm(dataframe,'contest_participation_count_7_days') 

#     dataframe = dataframe.drop([
#                                 'submissions_count',
#                                 'submissions_count_1_days',
#                                 'submissions_count_30_days',
#                                 'submissions_count_365_days',
#                                 'submissions_count_7_days'
#                                 ], axis=1) 
    
    dataframe = norm(dataframe,'submissions_count') 
    dataframe = norm(dataframe,'submissions_count_1_days') 
    dataframe = norm(dataframe,'submissions_count_30_days') 
    dataframe = norm(dataframe,'submissions_count_365_days') 
    dataframe = norm(dataframe,'submissions_count_7_days') 

#     dataframe = dataframe.drop([
#                                 'submissions_count_contest',
#                                'submissions_count_contest_1_days',
#                                'submissions_count_contest_30_days',
#                                'submissions_count_contest_365_days',
#                                'submissions_count_contest_7_days'
#                                 ], axis=1) 

    dataframe = norm(dataframe,'submissions_count_contest') 
    dataframe = norm(dataframe,'submissions_count_contest_1_days') 
    dataframe = norm(dataframe,'submissions_count_contest_30_days') 
    dataframe = norm(dataframe,'submissions_count_contest_365_days') 
    dataframe = norm(dataframe,'submissions_count_contest_7_days') 

#     dataframe = dataframe.drop([
#                             'submissions_count_master',
#                             'submissions_count_master_1_days', 
#                             'submissions_count_master_30_days',
#                             'submissions_count_master_365_days',
#                             'submissions_count_master_7_days'
#                             ], axis=1) 

    dataframe = norm(dataframe,'submissions_count_master') 
    dataframe = norm(dataframe,'submissions_count_master_1_days') 
    dataframe = norm(dataframe,'submissions_count_master_30_days') 
    dataframe = norm(dataframe,'submissions_count_master_365_days') 
    dataframe = norm(dataframe,'submissions_count_master_7_days') 

#     dataframe = dataframe.drop([
#                             'forum_comments_count',
#                             'forum_count',
#                             'forum_expert_count',
#                             'forum_questions_count'
#                           ], axis=1) 

    dataframe = norm(dataframe,'forum_comments_count') 
    dataframe = norm(dataframe,'forum_count') 
    dataframe = norm(dataframe,'forum_expert_count') 
    dataframe = norm(dataframe,'forum_questions_count') 

#     dataframe = dataframe.drop([
#                             'ipn_count',
#                             'ipn_count_1_days',
#                             'ipn_count_30_days',
#                             'ipn_count_365_days',
#                             'ipn_count_7_days'
#                             ], axis=1) 
    dataframe = norm(dataframe,'ipn_count') 
    dataframe = norm(dataframe,'ipn_count_1_days') 
    dataframe = norm(dataframe,'ipn_count_30_days') 
    dataframe = norm(dataframe,'ipn_count_365_days') 
    dataframe = norm(dataframe,'ipn_count_7_days') 

#     dataframe = dataframe.drop([
#                             'ipn_read',
#                             'ipn_read_1_days',
#                             'ipn_read_30_days',
#                             'ipn_read_365_days',
#                             'ipn_read_7_days'
#                             ], axis=1) 

    dataframe = norm(dataframe,'ipn_read') 
    dataframe = norm(dataframe,'ipn_read_1_days') 
    dataframe = norm(dataframe,'ipn_read_30_days') 
    dataframe = norm(dataframe,'ipn_read_365_days') 
    dataframe = norm(dataframe,'ipn_read_7_days') 


    
    return dataframe
    

In [7]:
train_df = pd.read_csv('training_dataset.csv/training_dataset.csv', header=0)  
train_df.shape

(486048, 54)

In [8]:
train_df = preProcessData(train_df)
train_df.shape

(486048, 86)

In [9]:
from sklearn import linear_model
from sklearn import tree
from sklearn import svm
#from nolearn.dbn import DBN
from sklearn import ensemble 
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

train_data = train_df.values
x_train, x_test, y_train, y_test = train_test_split(train_data[0::,1::], train_data[0::,0], 
                            test_size = 0.2, random_state = 123) # Split training/test.
clf = ExtraTreesClassifier()
clf = clf.fit(x_train, y_train)
print(clf.feature_importances_)
model = SelectFromModel(clf, prefit=True)
x_train = model.transform(x_train)
x_test = model.transform(x_test)
print(x_train.shape)
# hipotese = DBN([x_train.shape[1], 300, 2],
#                 learn_rates = 0.01,
#                 learn_rate_decays = 0.9,
#                 epochs = 100,
#                 dropouts = 0, # Express the percentage of nodes that will be randomly dropped as a decimal.
#                 verbose = 1)
#hipotese = linear_model.LogisticRegression(C=1e5)
#hipotese = tree.DecisionTreeClassifier(random_state=1234)
#hipotese = ensemble.RandomForestClassifier(random_state=1234)

hipotese = svm.SVC()

hipotese.fit(x_train, y_train)


[  3.77072205e-03   3.71353814e-04   1.98180273e-04   1.80146950e-04
   7.61782866e-04   2.48278918e-04   1.58089649e-03   2.12166263e-04
   9.94859665e-05   1.58272930e-05   2.51441598e-04   7.04175771e-04
   8.86748393e-04   2.01070738e-04   3.67067638e-04   4.72360506e-04
   2.23181305e-04   3.04143819e-04   1.37541143e-03   2.51226130e-03
   1.99672152e-03   7.21444779e-05   2.22854923e-02   1.70746653e-03
   1.50077125e-02   2.36403382e-02   8.80359638e-03   2.70312148e-02
   3.13085586e-03   1.98066122e-02   2.82454065e-02   1.30996485e-02
   7.97463734e-03   9.04150731e-03   2.63412214e-03   3.52101446e-03
   4.74070999e-03   4.85191169e-02   1.21734577e-02   4.48563303e-02
   4.88919186e-02   2.57217576e-02   2.27838408e-02   3.33534436e-03
   1.47313563e-02   2.23433071e-02   8.66583844e-03   1.53509575e-01
   2.57340360e-03   2.79596478e-03   1.96168371e-03   2.30904352e-03
   3.31781313e-03   7.44730736e-04   2.02896068e-03   7.10025619e-04
   0.00000000e+00   0.00000000e+00

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
y_true, y_pred = y_test, hipotese.predict(x_test) # Get our predictions
print(classification_report(y_true, y_pred)) # Classification on each digit


             precision    recall  f1-score   support

        0.0       0.94      1.00      0.97     91840
        1.0       0.00      0.00      0.00      5370

avg / total       0.89      0.94      0.92     97210



  'precision', 'predicted', average, warn_for)


In [11]:
test_df = pd.read_csv('test_dataset.csv/test_dataset.csv', header=0)  
test_df.shape

(207424, 48)

In [12]:
test_df = preProcessData(test_df,False)
test_df.shape

(207424, 85)

In [13]:
test_df = model.transform(test_df)
test_df.shape

(207424L, 26L)

In [14]:
test_data = test_df

y_pred = hipotese.predict(test_data).astype(int)


In [15]:
saveFileForSubmission(y_pred,'submissionSVMtWithFeatureSelection.csv')