# LinReg

In [73]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import csv
import re
import string

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform, randint
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Test Train

In [74]:
df = pd.read_csv('census_clean.csv')


In [75]:
train = df[df['ZZ_SPLIT'] == 'train']
X_train = train.drop(columns=['ZA_TARGET','ZZ_SPLIT'])
y_train = train['ZA_TARGET']


test = df[df['ZZ_SPLIT'] == 'test']
X_test = test.drop(columns=['ZA_TARGET','ZZ_SPLIT'])
y_test = test['ZA_TARGET']


In [76]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [77]:
y_train.sum()/y_train.count()

0.08077495031478882

## Model Builing

> Logistic Regression

In [87]:
logR = LogisticRegression(max_iter= 10000)

param_grid_logR = {'C' : [0.001,0.01,0.1,1,10,100]}


grid_logR = GridSearchCV(estimator=logR, param_grid=param_grid_logR,cv=8)
grid_logR = grid_logR.fit(X_train,y_train)

y_pred_train_logR = grid_logR.predict(X_train)
y_pred_test_logR = grid_logR.predict(X_test)

acc_logR = accuracy_score(y_test, y_pred_test_logR)
f1_logR = f1_score(y_test, y_pred_test_logR)

print('\n----- Logistic Regression ----')
print('Accuracy :',accuracy_score(y_train, y_pred_train_logR))
print('f1 :',f1_score(y_train, y_pred_train_logR))

print('Testing')
print('Accuracy :',acc_logR)
print('f1 :',f1_logR)




----- Logistic Regression ----
Accuracy : 0.9325253985240365
f1 : 0.4314988290398127
Testing
Accuracy : 0.9343109926069567
f1 : 0.4293401965372017


> Random Forest

In [79]:
rf = RandomForestClassifier(n_jobs=14)

param_grid_rf = {'max_depth': [4,8,16],
            'n_estimators':[100,150,200],
            'max_features':['sqrt',1,5,10],
            'min_samples_split': [1,2,4]}


grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf,cv=8)
grid_rf = grid_rf.fit(X_train,y_train)


y_pred_train_rf = grid_rf.predict(X_train)
y_pred_test_rf = grid_rf.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_test_rf)
f1_rf = f1_score(y_test, y_pred_test_rf)

print('\n----- Random Forest ----')
print('Accuracy :',accuracy_score(y_train, y_pred_train_rf))
print('f1 :',f1_score(y_train, y_pred_train_rf))

print('Testing')
print('Accuracy :',acc_rf)
print('f1 :',f1_rf)

288 fits failed out of a total of 864.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
288 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\joblib\externals\loky\process_executor.py", line 436, in _process_worker
    r = call_item()
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\joblib\externals\loky\process_executor.py", line 288, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\joblib\_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "C:\User


----- Random Forest ----
Accuracy : 0.951822717607327
f1 : 0.6133511795214991
Testing
Accuracy : 0.9377045206641619
f1 : 0.4461206896551723


> XGBoost

In [80]:
xg = xgb.XGBClassifier(booster='gblinear')

param_grid_xg = { # default 3
                "n_estimators": [100,150,200,250,300],
                "lambda": [0,0.1,1],
                "alpha": [0,0.1,1],
                "feature_selector": ['cyclic','greedy','random']}


grid_xg = GridSearchCV(estimator=xg, param_grid=param_grid_xg,cv=8)
grid_xg = grid_xg.fit(X_train,y_train)


y_pred_train_xg = grid_xg.predict(X_train)
y_pred_test_xg = grid_xg.predict(X_test)

acc_xg = accuracy_score(y_test, y_pred_test_xg)
f1_xg = f1_score(y_test, y_pred_test_xg)

print('\n----- XGBoost ----')
print('Accuracy :',accuracy_score(y_train, y_pred_train_xg))
print('f1 :',f1_score(y_train, y_pred_train_xg))

print('Testing')
print('Accuracy :',acc_xg)
print('f1 :',f1_xg)

720 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\xgboost\sklearn.py", line 1516, in fit
    self._Booster = train(
  File "C:\Users\AMGra\AppData\Roaming\Python\Python310\site-packages\xgboost\core.py", line 620, in inner_f
    return func


----- XGBoost ----
Accuracy : 0.9323169291065001
f1 : 0.43582020389249304
Testing
Accuracy : 0.9345937866117239
f1 : 0.4376519624869746


In [94]:
df = pd.DataFrame()
df['Feature'] = train.drop(columns=['ZZ_SPLIT','ZA_TARGET']).columns.to_list()
df['Coef'] = grid_logR.best_estimator_.coef_.tolist()[0]
df['Coef_abs'] = np.abs(df['Coef'])
df['Coef_odds'] = np.exp(df['Coef'])
df['Coef_prob'] = df['Coef'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
df.sort_values(by='Coef_odds', ascending=False).head(5)

Unnamed: 0,Feature,Coef,Coef_abs,Coef_odds,Coef_prob
3,AHGA,4.293125,4.293125,73.194849,0.986522
0,AAGE,2.46109,2.46109,11.717576,0.921369
12,WKSWORK,1.863244,1.863244,6.444612,0.865675
5,CAPGAIN,1.133288,1.133288,3.105851,0.756445
6,CAPLOSS,1.068971,1.068971,2.91238,0.744401


In [95]:
df2 = pd.DataFrame()
df2['Feature'] = train.drop(columns=['ZZ_SPLIT','ZA_TARGET']).columns.to_list()
df2['Coef'] = grid_xg.best_estimator_.coef_.tolist()
df2['Coef_abs'] = df2['Coef'].apply(lambda x: np.abs(x))
df2['Coef_odds'] = df2['Coef'].apply(lambda x: np.exp(x))
df2['Coef_prob'] = df2['Coef'].apply(lambda x: np.exp(x)/(1+np.exp(x)))
df2.sort_values(by='Coef_odds', ascending=False).head(5)

Unnamed: 0,Feature,Coef,Coef_abs,Coef_odds,Coef_prob
3,AHGA,4.59835,4.59835,99.320302,0.990032
0,AAGE,2.73568,2.73568,15.420226,0.939099
12,WKSWORK,1.90952,1.90952,6.749848,0.870965
5,CAPGAIN,1.14659,1.14659,3.147442,0.758888
6,CAPLOSS,1.09426,1.09426,2.986972,0.749183


np.exp(12.47)