In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import random
from sklearn import  metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, RandomizedLasso, LogisticRegression)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
import time

start_time = time.time()
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv',)
submission_df = pd.read_csv('sample_submission.csv')

print('train_df shape: ',train_df.shape)
print('test_df shape: ',test_df.shape)

print("--- %s seconds ---" % round((time.time() - start_time),2))

train_df shape:  (200000, 202)
test_df shape:  (200000, 201)
--- 59.55 seconds ---


In [2]:
#### ALL ABOUT PREPPING THE TRAIN DATA ####
start_time = time.time()


target_col = ["target"]
id_dataset = ["ID_code"]
#numerical columns
num_cols   = [x for x in train_df.columns if x not in target_col + id_dataset]

#Scaling Numerical columns
#std = StandardScaler()
std = MinMaxScaler()
scaled = std.fit_transform(train_df[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_data_og = train_df.copy()
data = train_df.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")
data = data.drop(columns = ['ID_code'],axis = 1)

#data['139+81'] = data['var_81'] + data['var_139']
#data.drop(['var_81'], axis = 1)
#data.drop(['var_139'], axis = 1)

print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 0.95 seconds ---


In [3]:
#### ALL ABOUT PREPPING THE TEST DATA ####
start_time = time.time()

#test_df['new_col_139+81'] = test_df['var_81'] + test_df['var_139']
#test_df.drop(['var_81'], axis = 1)
#test_df.drop(['var_139'], axis = 1)

target_col = ["target"]
id_dataset = ["ID_code"]
#numerical columns
num_cols   = [x for x in test_df.columns if x not in target_col + id_dataset]

#Scaling Numerical columns
#std = StandardScaler()
std = MinMaxScaler()
scaled = std.fit_transform(test_df[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

#dropping original values merging scaled values for numerical columns
df_test_og = test_df.copy()
test = test_df.drop(columns = num_cols,axis = 1)
test = test.merge(scaled,left_index=True,right_index=True,how = "left")
test = test.drop(columns = ['ID_code'],axis = 1)

#test['139+81'] = test['var_81'] + test['var_139']
#test.drop(['var_81'], axis = 1)
#test.drop(['var_139'], axis = 1)


print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 1.28 seconds ---


In [4]:
#### Check correlation and drop ####
start_time = time.time()
corr_matrix = pd.DataFrame(data)

#corr = corr_matrix.corr().abs()
#corr = (corr.where(np.triu(np.ones(corr.shape), k=1).astype(np.bool))
#                 .stack()
#                 .sort_values(ascending=False))
#print(corr.tail(25))

#### Also check correlation to target variable ####
cor_t = pd.DataFrame(data).apply(lambda x: x.corr(data.target)).abs().sort_values(ascending=False)
print(cor_t.tail(50))
#corr.to_csv('corr.csv')

print("--- %s seconds ---" % round((time.time() - start_time),2))

var_72     0.013005
var_84     0.012363
var_68     0.011957
var_19     0.011291
var_65     0.011214
var_143    0.011202
var_3      0.011055
var_4      0.010915
var_120    0.010895
var_152    0.010773
var_59     0.010448
var_189    0.009212
var_101    0.009138
var_47     0.008983
var_42     0.008365
var_69     0.008283
var_16     0.008117
var_37     0.007685
var_79     0.007591
var_176    0.007469
var_61     0.007407
var_182    0.007198
var_153    0.007103
var_73     0.006460
var_14     0.006332
var_60     0.006265
var_129    0.005880
var_46     0.005690
var_183    0.005467
var_160    0.005135
var_29     0.004682
var_124    0.004218
var_161    0.004168
var_39     0.004090
var_98     0.004074
var_158    0.003817
var_136    0.003554
var_96     0.003037
var_7      0.003025
var_117    0.002591
var_100    0.002215
var_10     0.002213
var_103    0.001395
var_126    0.001393
var_41     0.001298
var_38     0.000970
var_17     0.000864
var_30     0.000638
var_27     0.000582
var_185    0.000053


In [5]:
#### Select columns to use ####
start_time = time.time()
y = data.target
x = data.drop(['target'], axis = 1)

#for i in range(0,1000): 
seed = 420
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=seed)

#from imblearn.over_sampling import SMOTE
#smt = SMOTE()
#x_train, y_train = smt.fit_sample(x_train, y_train)

print("--- %s seconds ---" % round((time.time() - start_time),2))

--- 0.64 seconds ---


In [6]:
#### Logistic Regression ####
start_time = time.time()
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(class_weight='balanced', penalty='l2',
                            C=1, solver='liblinear').fit(x_train, y_train)
seed = 420
kfold = KFold(n_splits=10, random_state=seed)
scoring = 'roc_auc'
results = cross_val_score(logreg, x_train, y_train,
                          cv=kfold, scoring=scoring)
print('avg of the cv scores: ',results.mean())

prediction = pd.DataFrame(logreg.predict_proba(x_train))
prediction = prediction.drop(columns = 0, axis = 1)
print('ROC_AUC is: ',roc_auc_score(y_train, prediction))
#print(logreg.coef_)

#gathering_df = submission_df
#gathering_df['logreg']= pd.DataFrame(logreg.predict_proba(test)[:,1])

print("--- %s seconds ---" % round((time.time() - start_time),2))

avg of the cv scores:  0.8592665854851216
ROC_AUC is:  0.8619880168851922
--- 44.55 seconds ---


In [7]:
parameters = logreg.coef_

param_df =pd.DataFrame(parameters).transpose().abs()
col_df = pd.DataFrame(x_train.columns)

col_df.columns = ['Columns']
param_df.columns = ['Coefficients']
coef_df = param_df.join(col_df)

#coef_df.sort_values('Coefficients')
coef_df.sort_values(by='Coefficients',kind="quicksort",ascending = False)

Unnamed: 0,Coefficients,Columns
139,1.759204,var_139
146,1.627036,var_146
6,1.609851,var_6
81,1.594317,var_81
110,1.484647,var_110
12,1.426073,var_12
76,1.387969,var_76
22,1.364923,var_22
99,1.337574,var_99
53,1.304405,var_53


In [8]:
#### Naive Bayes #### 
start_time = time.time()
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB().fit(x_train,y_train)
seed = 420
kfold = KFold(n_splits=10, random_state=seed)
scoring = 'roc_auc'
results = cross_val_score(NB, x_train, y_train,
                          cv=kfold, scoring=scoring)
print('avg of the cv scores: ',results.mean())

NB_prediction = pd.DataFrame(NB.predict_proba(x_train))
NB_prediction = NB_prediction.drop(columns = 0, axis = 1)
print('ROC_AUC is: ',roc_auc_score(y_train, NB_prediction))
#gathering_df['NB']= pd.DataFrame(NB.predict_proba(test)[:,1])

print("--- %s seconds ---" % round((time.time() - start_time),2))

avg of the cv scores:  0.8884878927049712
ROC_AUC is:  0.8903752339812052
--- 11.43 seconds ---


In [9]:
start_time = time.time()
from sklearn.model_selection import KFold
NB_Score = cross_val_score(NB, x_train, y_train, cv = 10)
print(NB_Score)

from sklearn import model_selection
seed = 420
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = GaussianNB()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('Accuracy: ',round(results.mean(),4), round(results.std(),4))

scoring = 'roc_auc'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('AUC: ',round(results.mean(),4), round(results.std(),4))

print("--- %s seconds ---" % round((time.time() - start_time),2))

[0.92356728 0.921375   0.9226875  0.917125   0.921625   0.923125
 0.917375   0.9220625  0.922625   0.92393275]
Accuracy:  0.9215 0.0034
AUC:  0.8885 0.0044
--- 36.85 seconds ---


In [10]:
start_time = time.time()
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

seed = 420
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('Accuracy: ',round(results.mean(),4), round(results.std(),4))

scoring = 'roc_auc'
results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
print('AUC: ',round(results.mean(),4), round(results.std(),4))

print("--- %s seconds ---" % round((time.time() - start_time),2))

Accuracy:  0.9004 0.0022
AUC:  0.6988 0.0071
--- 1366.08 seconds ---


In [11]:
#### Select columns to use ####
start_time = time.time()
y = data.target
x = data.drop(['target'], axis = 1)

#for i in range(0,1000): 
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=seed)

model.fit(x_train, y_train)

pred = pd.DataFrame(model.predict_proba(x_test))
pred = pred.drop(columns = 0, axis = 1)
prediction = pred
cnf = confusion_matrix(y_test, prediction.round())
print(cnf)

print(prediction.head())

print("--- %s seconds ---" % round((time.time() - start_time),2))

[[35901    40]
 [ 4003    56]]
     1
0  0.0
1  0.1
2  0.1
3  0.0
4  0.2
--- 78.05 seconds ---


In [12]:
#### Ada Boost Classifer ####
#start_time = time.time()
#from sklearn.ensemble import AdaBoostClassifier
#ABC = AdaBoostClassifier(logreg).fit(x_train,y_train)
#ABC_score = cross_val_score(ABC, x_train, y_train, cv=10)
#print('avg of the cv scores: ',ABC_score.mean())

#ABC_prediction = pd.DataFrame(ABC.predict_proba(x_train))
#ABC_prediction = ABC_prediction.drop(columns = 0, axis = 1)
#print('ROC_AUC is: ',roc_auc_score(y_train, ABC_prediction))
#gathering_df['AdaBoost']= pd.DataFrame(ABC.predict_proba(test)[:,1])

#print("--- %s seconds ---" % round((time.time() - start_time),2))

In [13]:
#### Output the test results to csv ####
start_time = time.time()
submission_df['target']= pd.DataFrame(ABC.predict_proba(test)[:,1])
submission_df.to_csv('submission_seed.csv', index=False)
#gathering_df.drop(['target'])
gathering_df.to_csv('gathering_df.csv',index = False)

print("--- %s seconds ---" % round((time.time() - start_time),2))

NameError: name 'ABC' is not defined