In [33]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
import itertools

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn import linear_model
from sklearn import svm

In [36]:
# Artifact locations: `sub` receives the submission CSVs and `m_path` the
# pickled models (both are written by `output`; `model_call` reads `m_path`).
sub, m_path = (Path(p) for p in ('/content/submissions', '/content/models'))

In [37]:
def encoding(X, categorical, enc):
    """Swap the categorical columns of ``X`` for their encoded counterpart.

    Args:
        X: 2-D array, indexed here as ``X[:, categorical]``.
        categorical: Column positions that are handed to the encoder.
        enc: Object whose ``transform`` method is applied to the selected
            columns. It is not fitted by this function.

    Returns:
        A new array made of the untouched columns of ``X`` (in their original
        order) followed by the columns returned by ``enc.transform``.
    """
    to_encode = X[:, categorical]
    passthrough = np.delete(X, categorical, axis=1)
    encoded = enc.transform(to_encode)
    # Non-categorical columns come first, encoded columns are appended.
    return np.concatenate([passthrough, encoded], axis=1)

def evaluate(model, X_train, y_train, X_test, y_test, model_name='', print_=True):
    """Compute the accuracy of ``model`` on two labelled splits.

    Args:
        model: Object exposing ``predict``.
        X_train, y_train: Features and labels of the first split.
        X_test, y_test: Features and labels of the second split.
        model_name: Accepted by the signature but not used in the body.
        print_: When true, both accuracies are printed.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    train_accuracy = metrics.accuracy_score(y_train, model.predict(X_train))
    test_accuracy = metrics.accuracy_score(y_test, model.predict(X_test))

    scores = (train_accuracy, test_accuracy)
    if print_:
        for label, value in zip(("Train accuracy: ", "Test accuracy: "), scores):
            print(label, value)
    return scores

def model_call(filen):
    """Load a pickled object from the models directory.

    Args:
        filen: File stem; ``'.pkl'`` is appended and the file is looked up
            under ``m_path``.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that were produced by a trusted run (e.g. by `output`).
    pickle_file = m_path / (filen + '.pkl')
    with pickle_file.open('rb') as f:
        return pickle.load(f)

def output(model, X_fi_test, final,model_name):
    """Write a submission CSV for ``model`` and pickle the model itself.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_fi_test: Feature matrix to score.
        final: Values written to the ``ID`` column, one per row of
            ``X_fi_test``.
        model_name: Label used to build the file stem
            ``'submission ' + model_name``.

    Returns:
        The file stem (without extension) shared by the CSV in ``sub`` and
        the pickle in ``m_path``.
    """
    # Column 1 of predict_proba is used as the prediction
    # (presumably the positive class of a binary target -- confirm).
    y_final = model.predict_proba(X_fi_test)[:, 1]
    test_fi_pred = pd.DataFrame({'ID': final, 'Prediction': y_final})
    filen = 'submission ' + model_name

    # Create the target directories if they are missing so that a fresh
    # environment does not fail on the first write.
    sub.mkdir(parents=True, exist_ok=True)
    m_path.mkdir(parents=True, exist_ok=True)

    test_fi_pred.to_csv(sub / (filen + '.csv'), index=False)
    with open(m_path / (filen + '.pkl'), 'wb') as f:
        pickle.dump(model, f)
    return filen

In [38]:

# Read the raw train / test tables.
train_data = pd.read_csv('/content/train_final.csv')
test_data = pd.read_csv('/content/test_final.csv')

# Training table: every column except the last is a feature, the last is the target.
X_f_train = train_data.iloc[:, :-1].to_numpy()
y_f_train = train_data.iloc[:, -1].to_numpy()

# Test table: the first column is kept aside (it is later written out as 'ID'),
# the remaining columns are the features.
main = test_data.iloc[:, 0].to_numpy()
X_fi_test = test_data.iloc[:, 1:].to_numpy()

In [39]:
# Preprocessing the data
# Object-dtype columns are treated as categorical. Only feature columns are
# considered: X_f_train drops the last (target) column, so a target position
# would be out of bounds when indexing the feature arrays.
# NOTE(review): the same positions are applied to X_fi_test, which assumes the
# test columns after the first line up with the training features -- confirm.
feature_cols = train_data.columns[:-1]
categorical_data = (train_data.dtypes == 'object')
categorical_cols = [c for c in categorical_data[categorical_data].index if c in feature_cols]
categorical_ = [train_data.columns.get_loc(c) for c in categorical_cols]

# Dense integer one-hot output. scikit-learn 1.2 renamed `sparse` to
# `sparse_output` and later removed the old keyword, so try the new name
# first and fall back for older installations.
try:
    enc = preprocessing.OneHotEncoder(sparse_output=False, dtype=int)
except TypeError:
    enc = preprocessing.OneHotEncoder(sparse=False, dtype=int)
# Fit on train and test rows together so that categories occurring only in
# the test file are known to the encoder.
enc.fit(np.concatenate((X_f_train[:,categorical_],X_fi_test[:,categorical_]), axis=0))

# One hot encoding
X_train_enc = encoding(X_f_train, categorical_, enc)
X_test_enc = encoding(X_fi_test, categorical_, enc)

# Making zero mean and unit variance
scaler = preprocessing.StandardScaler()

# Scaling statistics are learned from the training rows only and then
# re-used unchanged for the test rows.
X_train_enc_ = scaler.fit_transform(X_train_enc)
X_test_enc_ = scaler.transform(X_test_enc)

# Split the data: 20% is held out for validation. A fixed random_state makes
# the split (and the accuracies reported below) reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_train_enc_, y_f_train, test_size=0.20, random_state=42)

In [40]:
# Logistic regression (lbfgs solver, at most 500 iterations) fitted on the training split.
model_name = 'Logistic Regression'
model = linear_model.LogisticRegression(max_iter=500, solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [41]:
# Accuracy of the current model on the training split and on the held-out split.
model_eval = evaluate(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    model_name=model_name,
)

Train accuracy:  0.85335
Test accuracy:  0.855


In [43]:
# Write the submission CSV and pickle the fitted model; the returned file stem is displayed.
output(model=model, X_fi_test=X_test_enc_, final=main, model_name=model_name)

'submission Logistic Regression'

In [44]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
# CatBoost classifier with every hyper-parameter left at the library default.
# The import lives in this cell because the package is installed in the cell just above.
from catboost import CatBoostClassifier
model_name = 'CATBoost Classification'
model = CatBoostClassifier()
model.fit(X_train, y_train)

Learning rate set to 0.037023
0:	learn: 0.6549022	total: 37.9ms	remaining: 37.8s
1:	learn: 0.6198371	total: 85ms	remaining: 42.4s
2:	learn: 0.5886843	total: 105ms	remaining: 34.7s
3:	learn: 0.5633719	total: 115ms	remaining: 28.6s
4:	learn: 0.5390769	total: 125ms	remaining: 25s
5:	learn: 0.5201368	total: 158ms	remaining: 26.2s
6:	learn: 0.5022284	total: 174ms	remaining: 24.6s
7:	learn: 0.4851718	total: 200ms	remaining: 24.8s
8:	learn: 0.4689312	total: 210ms	remaining: 23.2s
9:	learn: 0.4536627	total: 221ms	remaining: 21.8s
10:	learn: 0.4414151	total: 232ms	remaining: 20.9s
11:	learn: 0.4294729	total: 263ms	remaining: 21.7s
12:	learn: 0.4186329	total: 303ms	remaining: 23s
13:	learn: 0.4116242	total: 349ms	remaining: 24.6s
14:	learn: 0.4039753	total: 388ms	remaining: 25.5s
15:	learn: 0.3975376	total: 418ms	remaining: 25.7s
16:	learn: 0.3907002	total: 455ms	remaining: 26.3s
17:	learn: 0.3839987	total: 486ms	remaining: 26.5s
18:	learn: 0.3786185	total: 528ms	remaining: 27.3s
19:	learn: 0.37

<catboost.core.CatBoostClassifier at 0x7fb7bf8d6890>

In [46]:
# Accuracy of the current model on the training split and on the held-out split.
model_eval = evaluate(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    model_name=model_name,
)

Train accuracy:  0.90235
Test accuracy:  0.8778


In [47]:
# Write the submission CSV and pickle the fitted model; the returned file stem is displayed.
output(model=model, X_fi_test=X_test_enc_, final=main, model_name=model_name)

'submission CATBoost Classification'

In [48]:

# RBF-kernel support vector classifier. probability=True is needed because
# `output` calls predict_proba on the fitted model.
model = svm.SVC(
    C=10.0,
    kernel='rbf',
    probability=True,
    verbose=True,
)
model.fit(X_train, y_train)
model_name = 'Support Vector Machine'

[LibSVM]

In [49]:
# Accuracy of the current model on the training split and on the held-out split.
model_eval = evaluate(
    model=model,
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    model_name=model_name,
)

Train accuracy:  0.887
Test accuracy:  0.851


In [50]:
# Write the submission CSV and pickle the fitted model; the returned file stem is displayed.
output(model=model, X_fi_test=X_test_enc_, final=main, model_name=model_name)

'submission Support Vector Machine'