In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math, random, os
from scipy import stats
sns.set()

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [3]:
df = pd.read_csv(r'./train.csv')
df['Education_level'] = df['Education_level'].apply(lambda x: int(x.split('_')[-1]))

cat_columns = [col for col in df.columns if df.dtypes[col]==np.dtype('O')]
cat_columns.extend(['gender',])
num_columns = [col for col in df.columns if col not in cat_columns]
target_column = num_columns.pop()

In [4]:
def noisy_invlogit(p):
#     print(p[p>=1].shape)
    p[p>=1] = 0.9 + (2*np.random.rand(p[p>=1].shape[0])-1)*0.05
#     plt.hist(d)
#     print(p[p<=0].shape)
    p[p<=0] = 0.1 + (2*np.random.rand(p[p<=0].shape[0])-1)*0.05
#     plt.scatter(range(p.size),p)
    return np.log(p/(1-p))

def get_class(x):
    y = 1/(1+np.exp(-x))
    y[y<0.5] = 0
    y[y>0.5] = 1
    return y

X = df.drop(target_column, axis = 1)
y = df[target_column]
X_train, X_test, y_tr, y_te =  train_test_split(X, y, test_size = 0.25, stratify = y)
y_train = noisy_invlogit(y_tr.copy())
y_test = noisy_invlogit(y_te.copy())

In [5]:
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('encoder', OneHotEncoder())
])
preprocessor = ColumnTransformer([
    ('numerical', num_pipe, num_columns),
    ('categorical', cat_pipe, cat_columns)
])

In [6]:
LR = Pipeline([
    ('prep', preprocessor),
    ('algo', LogisticRegression())
])

In [7]:
# params = {'algo__penalty' : ['l1', 'l2']}

# model = GridSearchCV(LR, params, cv = 4, n_jobs = -1, verbose = 10)
# model.fit(X_train, y_train)

# print('Best Parameter :', end = '')
# print(model.best_params_)
# print(f'Train Score : {model.score(X_train, y_train)}')
# print(f'Test Score : {model.score(X_test, y_test)}')

In [8]:
MLP = Pipeline([
    ('prep', preprocessor),
    ('algo', MLPRegressor(hidden_layer_sizes= (50,25), activation='relu',
                           solver = 'adam', max_iter = 100, alpha=0.1))
])

MLP['algo'].out_activation_ = 'logistic'
MLP.fit(X_train, y_train)
MLP['algo'].out_activation_



'identity'

In [11]:
confusion_matrix(y_te, get_class(MLP.predict(X_test)))
y_te.shape

(2789,)

In [None]:
# params = {'algo__penalty' : ['l1', 'l2']}

# model = GridSearchCV(LR, params, cv = 4, n_jobs = -1, verbose = 10)
# model.fit(X_train, y_train)

# print('Best Parameter :', end = '')
# print(model.best_params_)
# print(f'Train Score : {model.score(X_train, y_train)}')
# print(f'Test Score : {model.score(X_test, y_test)}')

In [14]:
y_p = MLP.predict(X_test)
cla = get_class(y_p)

In [22]:
y_test.max()

-1.7347058989508486

In [25]:
noisy_invlogit(y_te)

302     -2.673412
2289    -1.839916
11133   -2.212921
5118    -2.233733
8888    -2.055898
           ...   
657     -1.986858
3789    -1.919626
5406    -2.646896
4879    -1.909961
9657    -2.030786
Name: Best Performance, Length: 2789, dtype: float64