In [12]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import itertools

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm

In [13]:
sub = Path('/content/submissions')
m_path = Path('/content/models')

In [25]:
def encoding(X, categorical, enc):
    X_cat = X[:,categorical]
    X_ncat = np.delete(X, categorical, axis=1)
    X_cat_enc = enc.transform(X_cat)
    X_encoded = np.concatenate((X_ncat,X_cat_enc), axis=1)
    return X_encoded

def evaluate(model, X_train, y_train, X_test, y_test, model_name='', print_=True):
    y_tr_pred = model.predict(X_train)
    y_tr_prob = model.predict_proba(X_train)[:,1]
    train_accuracy = metrics.accuracy_score(y_train, y_tr_pred)
    train_auc = metrics.roc_auc_score(y_train, y_tr_prob)

    y_te_pred = model.predict(X_test)
    y_te_prob = model.predict_proba(X_test)[:,1]
    test_accuracy = metrics.accuracy_score(y_test, y_te_pred)
    test_auc = metrics.roc_auc_score(y_test, y_te_prob)

    if print_:
        print("Train accuracy: ",train_accuracy)
        print("Train AUC: ",train_auc)
        print("Test accuracy: ",test_accuracy)
        print("Test AUC: ",test_auc) 
    return train_accuracy, train_auc, test_accuracy, test_auc

def model_call(filen):
    with open(m_path / '{0}'.format(filen+'.pkl'),'rb') as f:
        return pickle.load(f)

def output(model, X_fi_test, final,model_name):
    y_final = model.predict_proba(X_fi_test)[:,1]
    test_fi_pred = pd.DataFrame({'ID':final, 'Prediction':y_final})
    filen = 'submission '+model_name
    test_fi_pred.to_csv(sub / '{0}'.format(filen+'.csv'), index=False)
    with open(m_path / '{0}'.format(filen+'.pkl'),'wb') as f:
        pickle.dump(model,f)
    return filen

In [15]:

train_data = pd.read_csv('/content/train_final.csv')
test_data = pd.read_csv('/content/test_final.csv')
X_f_train = train_data.iloc[:,0:-1].to_numpy()
y_f_train = train_data.iloc[:,-1].to_numpy()
X_fi_test = test_data.iloc[:,1:].to_numpy()
main = test_data.iloc[:,0].to_numpy()

In [16]:
# Preprocessing the data
categorical_data = (train_data.dtypes == 'object')
categorical_cols = list(categorical_data[categorical_data].index)
categorical_ = [train_data.columns.get_loc(c) for c in categorical_cols if c in train_data]
enc = preprocessing.OneHotEncoder(sparse=False, dtype=int)
enc.fit(np.concatenate((X_f_train[:,categorical_],X_fi_test[:,categorical_]), axis=0))

# One hot encoding 
X_train_enc = encoding(X_f_train, categorical_, enc)
X_test_enc = encoding(X_fi_test, categorical_, enc)

# Making zero mean and unit variance 
scaler = preprocessing.StandardScaler()

# Modification to fit the training data
X_train_enc_ = scaler.fit_transform(X_train_enc)
X_test_enc_ = scaler.transform(X_test_enc)

#Split the data
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_train_enc_, y_f_train, test_size=0.20)

In [28]:

model = linear_model.LogisticRegression(solver='lbfgs', max_iter=500) 
model_name = 'Logistic Regression'
model.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [29]:

model_eval = evaluate(model, X_train, y_train, X_test, y_test, model_name=model_name)

Train accuracy:  0.85355
Train AUC:  0.9096326042704526
Test accuracy:  0.8562
Test AUC:  0.9070031778394801


In [22]:

output(model, X_test_enc_, main,model_name)

'submission Logistic Regression'

In [6]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.4 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [23]:
# CAT
from catboost import CatBoostClassifier
model = CatBoostClassifier() 
model_name = 'CATBoost Classification'
model.fit(X_train, y_train)

Learning rate set to 0.037023
0:	learn: 0.6519543	total: 12.3ms	remaining: 12.3s
1:	learn: 0.6181600	total: 20.3ms	remaining: 10.1s
2:	learn: 0.5883896	total: 28.9ms	remaining: 9.6s
3:	learn: 0.5639812	total: 36.5ms	remaining: 9.09s
4:	learn: 0.5397953	total: 44ms	remaining: 8.76s
5:	learn: 0.5201849	total: 51.1ms	remaining: 8.47s
6:	learn: 0.5027545	total: 59.8ms	remaining: 8.48s
7:	learn: 0.4853692	total: 67.4ms	remaining: 8.36s
8:	learn: 0.4686415	total: 74.9ms	remaining: 8.25s
9:	learn: 0.4542183	total: 82.6ms	remaining: 8.17s
10:	learn: 0.4413134	total: 90.5ms	remaining: 8.14s
11:	learn: 0.4312098	total: 98.1ms	remaining: 8.08s
12:	learn: 0.4205597	total: 108ms	remaining: 8.22s
13:	learn: 0.4121991	total: 127ms	remaining: 8.91s
14:	learn: 0.4045659	total: 134ms	remaining: 8.78s
15:	learn: 0.3982225	total: 141ms	remaining: 8.67s
16:	learn: 0.3911552	total: 148ms	remaining: 8.58s
17:	learn: 0.3845969	total: 156ms	remaining: 8.53s
18:	learn: 0.3790867	total: 164ms	remaining: 8.47s
19

<catboost.core.CatBoostClassifier at 0x7fb7bf8e6490>

In [26]:
model_eval = evaluate(model, X_train, y_train, X_test, y_test, model_name=model_name)

Train accuracy:  0.90065
Train AUC:  0.9564051131383993
Test accuracy:  0.8774
Test AUC:  0.9288061659321379


In [27]:
output(model, X_test_enc_, main,model_name)

'submission CATBoost Classification'

In [30]:

model = svm.SVC(kernel='rbf', C=10.0, verbose=True, probability=True)
model.fit(X_train, y_train)
model_name = 'Support Vector Machine'

[LibSVM]

In [31]:
model_eval = evaluate(model, X_train, y_train, X_test, y_test, model_name=model_name)

Train accuracy:  0.88775
Train AUC:  0.9371334250031452
Test accuracy:  0.853
Test AUC:  0.8864679671880284


In [32]:
output(model, X_test_enc_, main,model_name)

'submission Support Vector Machine'