In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import random
from sklearn import  metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso, LogisticRegression)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
import time

# Read the three competition files from the working directory and time the load.
start_time = time.time()

train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

--- 48.45 seconds ---


In [2]:
#### Prepare the training frame: standardise features, drop the id column ####

start_time = time.time()

# Columns that are NOT model features.
target_col = ["target"]
id_dataset = ["ID_code"]

# Every remaining column is treated as a numerical feature.
num_cols = [col for col in train_df.columns if col not in target_col + id_dataset]

# Standardise the features; `std` keeps the fitted scaler around.
std = StandardScaler()
scaled = pd.DataFrame(std.fit_transform(train_df[num_cols]), columns=num_cols)

# Keep an untouched copy of the raw frame for reference.
df_data_og = train_df.copy()

# Swap the raw feature columns for their scaled versions (joined on the row
# index), then discard the identifier so only target + features remain.
data = (
    train_df
    .drop(columns=num_cols)
    .merge(scaled, left_index=True, right_index=True, how="left")
    .drop(columns=['ID_code'])
)

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

--- 1.54 seconds ---


In [3]:
#### ALL ABOUT PREPPING THE TEST DATA ####
start_time = time.time()
target_col = ["target"]
id_dataset = ["ID_code"]
# Feature columns: everything in the test frame except the id/target columns.
num_cols   = [x for x in test_df.columns if x not in target_col + id_dataset]

# Scale the test features with statistics learned from the TRAINING features.
# Fitting a separate scaler on the test set would standardise it with a
# different mean/variance than the data the models are trained on, so the
# models would see test features on a different scale.
# This requires every test feature column to exist in train_df.
std = StandardScaler().fit(train_df[num_cols])
scaled = pd.DataFrame(std.transform(test_df[num_cols]), columns=num_cols)

# Keep an untouched copy of the raw test frame.
df_test_og = test_df.copy()
# Replace the raw feature columns with the scaled ones (joined on row index)
# and drop the identifier so `test` holds features only.
test = test_df.drop(columns = num_cols,axis = 1)
test = test.merge(scaled,left_index=True,right_index=True,how = "left")
test = test.drop(columns = ['ID_code'],axis = 1)

print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 1.59 seconds ---


In [4]:
#### Check correlation and drop ####
start_time = time.time()
corr_matrix = pd.DataFrame(data)

# Absolute pairwise correlations. Keep only the strict upper triangle (k=1)
# so each pair appears once and the diagonal is excluded, then flatten to a
# Series indexed by (column_a, column_b) and rank from strongest to weakest.
# The builtin `bool` is used for the mask dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
corr = corr_matrix.corr().abs()
corr = (corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
                 .stack()
                 .sort_values(ascending=False))
print(corr.head(25))

#### Also check correlation to target variable ####
# Absolute correlation of every column with the target, strongest first.
cor_t = pd.DataFrame(data).apply(lambda x: x.corr(data.target)).abs().sort_values(ascending=False)
#print(cor_t.head(5))
# Persist the ranked pairwise correlations for offline inspection.
corr.to_csv('corr.csv')

print("--- %s seconds ---" % round((time.time() - start_time),2))

target  var_81     0.080917
        var_139    0.074080
        var_12     0.069489
        var_6      0.066731
        var_110    0.064275
        var_146    0.063644
        var_53     0.063399
        var_26     0.062422
        var_76     0.061917
        var_174    0.061669
        var_22     0.060558
        var_21     0.058483
        var_99     0.058367
        var_166    0.057773
        var_80     0.057609
        var_190    0.055973
        var_2      0.055870
        var_165    0.055734
        var_13     0.055156
        var_148    0.055011
        var_133    0.054548
        var_198    0.053000
        var_34     0.052692
        var_0      0.052390
        var_1      0.050343
dtype: float64
--- 19.51 seconds ---


In [5]:
#### Build the feature matrix / label vector and make an 80/20 split ####
start_time = time.time()

# Fixed seed so the split is reproducible.
seed = 420

x = data.drop(columns='target')
y = data['target']

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)

# Optional experiment (disabled): oversample the training split with SMOTE.
#from imblearn.over_sampling import SMOTE
#smt = SMOTE()
#x_train, y_train = smt.fit_sample(x_train, y_train)

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

--- 0.58 seconds ---


In [6]:
#### Logistic Regression ####
start_time = time.time()
# LogisticRegression is already imported in the notebook's first cell, so no
# in-cell import is needed here.
logreg = LogisticRegression(class_weight='balanced', penalty='l2',
                            C=1, solver='liblinear').fit(x_train, y_train)

# 10-fold cross-validation on the training split using the estimator's
# default scorer.
model_score = cross_val_score(logreg, x_train, y_train, cv=10, )
print('avg of the cv scores: ',model_score.mean())

# Positive-class probabilities on the training split (column 0 holds the
# negative-class probability and is dropped).
prediction = pd.DataFrame(logreg.predict_proba(x_train))
prediction = prediction.drop(columns = 0, axis = 1)
# NOTE: this AUC is measured on the same rows the model was fitted on, so it
# is an optimistic estimate.
print('ROC_AUC is: ',roc_auc_score(y_train, prediction))
# AUC on the held-out 20% gives an estimate on rows the model has not seen.
print('ROC_AUC on held-out split is: ',
      roc_auc_score(y_test, logreg.predict_proba(x_test)[:, 1]))
#print(logreg.coef_)

# Collect per-model test predictions in a COPY of the submission template.
# Binding `gathering_df` directly to `submission_df` would make both names
# refer to one frame, so every per-model column would also appear in the
# submission frame.
gathering_df = submission_df.copy()
gathering_df['logreg'] = logreg.predict_proba(test)[:, 1]

print("--- %s seconds ---" % round((time.time() - start_time),2))

avg of the cv scores:  0.7819875318608888
ROC_AUC is:  0.8619911875349053
--- 29.74 seconds ---


In [7]:
#### Gaussian Naive Bayes: fit, cross-validate, and score on the training split ####
start_time = time.time()
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(x_train, y_train)

# 10-fold cross-validation with the estimator's default scorer.
NB_score = cross_val_score(NB, x_train, y_train, cv=10)
print('avg of the cv scores: ', NB_score.mean())

# Keep only the positive-class probability column (label 1).
NB_prediction = pd.DataFrame(NB.predict_proba(x_train)).drop(columns=0)
print('ROC_AUC is: ', roc_auc_score(y_train, NB_prediction))
#gathering_df['NB']= pd.DataFrame(NB.predict_proba(test)[:,1])

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

avg of the cv scores:  0.9215500022841798
ROC_AUC is:  0.8903752339812052
--- 12.35 seconds ---


In [8]:
#### Naive Bayes: per-fold scores, then accuracy and AUC under plain K-fold ####
start_time = time.time()
# Assign and print the SAME name. Assigning to `NB_Score` while printing
# `NB_score` would throw away this run and show a value left over from an
# earlier cell.
NB_score = cross_val_score(NB, x_train, y_train, cv = 10)
print(NB_score)

from sklearn import model_selection
seed = 420
# Consecutive (unshuffled) 10-fold split. `random_state` is not passed:
# it only has an effect together with `shuffle=True`, and scikit-learn
# >= 0.24 raises ValueError when it is supplied without shuffling.
kfold = model_selection.KFold(n_splits=10)
model = GaussianNB()

# Mean and standard deviation of fold accuracy.
scoring = 'accuracy'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('Accuracy: ',results.mean(), results.std())

# Mean and standard deviation of fold ROC AUC.
scoring = 'roc_auc'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('AUC: ',results.mean(), results.std())

print("--- %s seconds ---" % round((time.time() - start_time),2))

[0.92356728 0.921375   0.9226875  0.917125   0.921625   0.923125
 0.917375   0.9220625  0.922625   0.92393275]
Accuracy:  0.9215375 0.003435931460317568
AUC:  0.8884878927049712 0.004406832968379841
--- 29.1 seconds ---


In [9]:
#### Refit `model` on the training split and inspect it on the held-out split ####
start_time = time.time()

# Rebuild features / labels and repeat the seeded 80/20 split.
x = data.drop(columns='target')
y = data['target']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=seed
)

model.fit(x_train, y_train)

# Positive-class probabilities for the held-out rows (column 0 is dropped).
pred = pd.DataFrame(model.predict_proba(x_test)).drop(columns=0)
prediction = pred

# Rounding the probabilities turns them into 0/1 labels for the matrix.
cnf = confusion_matrix(y_test, prediction.round())
print(cnf)

print(prediction.head())

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

[[35318   623]
 [ 2562  1497]]
          1
0  0.013287
1  0.010677
2  0.061877
3  0.003353
4  0.852162
--- 1.7 seconds ---


In [12]:
#### AdaBoost built on top of `model`: fit, cross-validate, score, predict ####
start_time = time.time()
from sklearn.ensemble import AdaBoostClassifier

ABC = AdaBoostClassifier(model)
ABC.fit(x_train, y_train)

# 10-fold cross-validation with the estimator's default scorer.
ABC_score = cross_val_score(ABC, x_train, y_train, cv=10)
print('avg of the cv scores: ', ABC_score.mean())

# Positive-class probabilities on the training split (column 0 is dropped).
ABC_prediction = pd.DataFrame(ABC.predict_proba(x_train)).drop(columns=0)
print('ROC_AUC is: ', roc_auc_score(y_train, ABC_prediction))

# Store the positive-class probabilities for the test set next to the other
# per-model predictions.
gathering_df['AdaBoost'] = pd.DataFrame(ABC.predict_proba(test)[:, 1])

elapsed = round(time.time() - start_time, 2)
print("--- %s seconds ---" % elapsed)

avg of the cv scores:  0.9215875061901366
ROC_AUC is:  0.8923248695647825
--- 1359.47 seconds ---


In [15]:
#### Output the test results to csv ####
start_time = time.time()
# Final submission uses the AdaBoost positive-class probability.
submission_df['target']= pd.DataFrame(ABC.predict_proba(test)[:,1])

# The submission frame can carry the per-model columns ('logreg', 'AdaBoost')
# whenever `gathering_df` refers to the same object as `submission_df`.
# Strip them so the submission file does not pick up extra columns;
# errors='ignore' makes this a no-op when they are absent.
submission_out = submission_df.drop(columns=['logreg', 'AdaBoost'], errors='ignore')
submission_out.to_csv('submission_seed.csv', index=False)

# Side-by-side per-model predictions, written as-is.
gathering_df.to_csv('gathering_df.csv',index = False)

print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 66.55 seconds ---
