In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the three input tables (categorical features, numerical features,
# targets) from CSV files in the working directory.
categorical, numerical, targets = (
    pd.read_csv(csv_path)
    for csv_path in ('categorical.csv', 'numerical.csv', 'target.csv')
)

- Yesterday I used Variance Threshold and SelectKBest to choose my features
- I will apply the result of the variance threshold below

#### categoricals

In [3]:
# Columns removed from the categorical table before encoding.
columns_to_drop = [
    'ODATEW_YR', 'ODATEW_MM', 'DOB_YR', 'DOB_MM',
    'MINRDATE_YR', 'MAXRDATE_YR', 'LASTDATE_YR', 'MAXRDATE_MM',
    'MINRDATE_MM', 'FIRSTDATE_YR', 'FIRSTDATE_MM',
]
# `columns=` already targets the column axis, so no `axis` argument is needed.
categorical = categorical.drop(columns=columns_to_drop)

In [4]:
# Cast every value in the categorical table to its string form so that all of
# these columns are picked up as object/text columns later on.
# `astype(str)` replaces `DataFrame.applymap(str)`, which pandas deprecated
# in version 2.1.
categorical = categorical.astype(str)

In [5]:
# Preview the first five rows of the stringified categorical table.
categorical.head(n=5)

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,LASTDATE_MM
0,IL,36,H,F,3,L,E,C,T,2,12
1,CA,14,H,M,3,L,G,A,S,1,12
2,NC,43,U,M,3,L,E,C,R,2,12
3,CA,44,U,F,3,L,E,C,R,2,12
4,FL,16,H,F,3,L,F,A,S,2,1


### numericals

In [6]:
# Names of the numerical columns kept for modelling. Per the notebook's notes
# this list is the outcome of an earlier variance-threshold selection; it is
# used below to subset `numerical` and to pick the columns passed to
# `pd.to_numeric`.
cols = ['AGE', 'INCOME', 'WEALTH1', 'VIETVETS', 'WWIIVETS', 'WEALTH2',
       'POP90C1', 'POP90C2', 'POP90C3', 'ETH1', 'ETH2', 'HHN3', 'DW1', 'DW2',
       'DW4', 'DW5', 'DW6', 'HV1', 'HV2', 'HV3', 'HV4', 'HU1', 'HU2', 'HU5',
       'HHD2', 'HHD3', 'HHD5', 'ETHC2', 'HVP1', 'HVP2', 'HVP3', 'HVP4', 'HVP5',
       'HVP6', 'HUR2', 'HUPA2', 'HUPA3', 'HUPA6', 'RP1', 'RP2', 'RP3', 'RP4',
       'MSA', 'ADI', 'IC6', 'HHAS3', 'MC1', 'MC2', 'PEC2', 'TPE13', 'LFC2',
       'LFC4', 'LFC6', 'LFC7', 'LFC8', 'LFC9', 'VC1', 'VC3', 'POBC2', 'LSC1',
       'VOC2', 'HC2', 'HC4', 'HC5', 'HC6', 'HC7', 'HC8', 'HC11', 'HC13',
       'HC17', 'HC18', 'HC19', 'MHUC1', 'MHUC2', 'CARDPROM', 'CONTROLN',
       'HPHONE_D', 'RFA_2F', 'CLUSTER2']

In [7]:
# Keep only the selected numerical columns, in the order given by `cols`.
numerical = numerical[cols]

In [8]:
# Coerce every selected column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce').
# The conversion runs column by column (apply's default axis) instead of
# row by row: that is one `pd.to_numeric` call per column rather than one per
# row, and it does not force every column onto a single shared row dtype.
numerical[cols] = numerical[cols].apply(pd.to_numeric, errors='coerce')

In [9]:
# Place the categorical, numerical and target tables side by side
# (pd.concat aligns the rows on the index).
all_data = pd.concat([categorical, numerical, targets], axis='columns')
all_data.head(n=5)

Unnamed: 0,STATE,CLUSTER,HOMEOWNR,GENDER,DATASRCE,RFA_2R,RFA_2A,GEOCODE2,DOMAIN_A,DOMAIN_B,...,HC19,MHUC1,MHUC2,CARDPROM,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,TARGET_B,TARGET_D
0,IL,36,H,F,3,L,E,C,T,2,...,40.0,6.0,2.0,27.0,95515.0,0.0,4.0,39.0,0,0.0
1,CA,14,H,M,3,L,G,A,S,1,...,99.0,20.0,4.0,12.0,148535.0,0.0,2.0,1.0,0,0.0
2,NC,43,U,M,3,L,E,C,R,2,...,17.0,9.0,2.0,26.0,15078.0,1.0,4.0,60.0,0,0.0
3,CA,44,U,F,3,L,E,C,R,2,...,22.0,16.0,2.0,27.0,172556.0,1.0,4.0,41.0,0,0.0
4,FL,16,H,F,3,L,F,A,S,2,...,21.0,6.0,2.0,43.0,7112.0,1.0,2.0,26.0,0,0.0


In [10]:
# TARGET_D is not used today, so it is removed; TARGET_B remains in `data`.
data = all_data.drop(columns='TARGET_D')

- I know the target classes are imbalanced, so I will oversample the minority class
- I will oversample only my train set, so the test set keeps its original class distribution

In [11]:
# Separate the label column (TARGET_B) from the feature columns.
y = data['TARGET_B']
X = data.drop(columns=['TARGET_B'])

In [12]:
# Train/test split.
# train_test_split is already imported in the first cell. When neither
# test_size nor train_size is given, scikit-learn holds out 25% of the rows
# (not 20%); the fraction is written out here so the split size is explicit.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [13]:
# Keep a separate DataFrame handle on the training features as they are
# before any oversampling.
X_train_imb = pd.DataFrame(data=X_train)

In [14]:
# Re-attach the labels to the training features so rows can be filtered by class.
trainset = pd.concat([X_train, y_train], axis='columns')

In [15]:
# Class counts in the training set before oversampling.
trainset.loc[:, 'TARGET_B'].value_counts()

0    67970
1     3589
Name: TARGET_B, dtype: int64

- oversampling the train data

In [16]:
from sklearn.utils import resample

# Split the training rows into one frame per TARGET_B value so that class 1
# can be resampled on its own.
train_labels = trainset['TARGET_B']
category_0 = trainset.loc[train_labels.eq(0)]
category_1 = trainset.loc[train_labels.eq(1)]

In [17]:
# Oversample class 1 with replacement until it has as many rows as class 0.
# random_state pins the draw so the upsampled training set (and every score
# computed from it) is the same on each Restart & Run All.
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples=len(category_0),
                                  random_state=0)

In [18]:
# Stack the oversampled class-1 rows on top of the class-0 rows.
train_upsampled = pd.concat((category_1_oversampled, category_0), axis='index')

- above, my upsampled train data is ordered by class, so I will shuffle it below
- because I will do cross-validation later

In [19]:
# Shuffle all rows (frac=1 returns every row in random order) so the two
# classes are interleaved. random_state makes the shuffle — and therefore the
# later cross-validation folds — reproducible.
train_upsampled = train_upsampled.sample(frac=1, random_state=0)

In [20]:
# Class counts after oversampling; both classes should now be the same size.
train_upsampled.loc[:, 'TARGET_B'].value_counts()

0    67970
1    67970
Name: TARGET_B, dtype: int64

- now I will split my train set into X and y again, because I will one-hot encode my categoricals and scale my numericals
- I don't need to scale numericals for Random Forest, but I will try a logistic regression model too, so I prefer to do it now

In [21]:
# Rebuild the training features and labels from the balanced, shuffled frame.
y_train = train_upsampled['TARGET_B']
X_train = train_upsampled.drop(columns=['TARGET_B'])

In [22]:
# Make sure both feature sets are DataFrames before selecting columns by dtype.
X_train, X_test = pd.DataFrame(data=X_train), pd.DataFrame(data=X_test)

In [23]:
# Split train and test features by dtype: numeric columns go to the scaler,
# object (string) columns go to the one-hot encoder.
X_train_num = X_train.select_dtypes(include='number')
X_test_num = X_test.select_dtypes(include='number')

X_train_cat = X_train.select_dtypes(include='object')
X_test_cat = X_test.select_dtypes(include='object')

- for both scaling and encoding, I will fit the transformer on my train set and apply the same fitted transformer to my test set

In [24]:
from sklearn.preprocessing import MinMaxScaler

# Scale numericals: the min/max are learned from the training columns only,
# then the same fitted scaler is applied to the test columns.
transformer = MinMaxScaler()
numericals_train_scaled = transformer.fit_transform(X_train_num)
numericals_test_scaled = transformer.transform(X_test_num)

In [25]:
from sklearn.preprocessing import OneHotEncoder

# Encode categoricals: the categories are learned from the training set and
# the first level of each column is dropped. With handle_unknown='error',
# transforming a category that was not present during fitting raises.
encoder = OneHotEncoder(drop='first', handle_unknown='error')
categoricals_train_encoded = encoder.fit_transform(X_train_cat).toarray()
categoricals_test_encoded = encoder.transform(X_test_cat).toarray()

In [26]:
# Final design matrices: scaled numeric columns first, one-hot columns after.
X_train = np.hstack((numericals_train_scaled, categoricals_train_encoded))
X_test = np.hstack((numericals_test_scaled, categoricals_test_encoded))

- I will make a logistic regression model

In [27]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the balanced training set and report its
# accuracy on the untouched test set.
# NOTE(review): warnings are silenced globally in the first cell, so a
# convergence warning from the 'saga' solver would not be visible here — confirm.
LR = LogisticRegression(solver='saga', random_state=0).fit(X_train, y_train)
LR.score(X_test, y_test)

0.6079319163207982

In [28]:
from sklearn.metrics import confusion_matrix

# Test-set predictions of the logistic regression and their confusion matrix
# (rows: true class, columns: predicted class).
pred = LR.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred)

array([[13824,  8775],
       [  577,   677]], dtype=int64)

In [29]:
from sklearn.metrics import f1_score, precision_score, recall_score

# precision = true pos / (true pos + false pos)
# recall    = true pos / (true pos + false neg); recall is the most important
#             metric for this problem
for metric_label, metric_fn in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, pred))

precision:  0.07162505289885739
recall:  0.539872408293461
f1:  0.12647113767980572


- I will make a RandomForestClassifier model

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Random forest with limits on depth and on split/leaf sizes; max_samples=0.2
# caps the number of rows drawn for each tree at 20% of the training rows.
forest_params = dict(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=20,
    max_samples=0.2,
    random_state=42,
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)

# Accuracy on the (oversampled) training set, then on the test set.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))

# Test-set class counts next to the confusion matrix, for context.
y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))


0.6197734294541709
0.5884794365488618


0    22599
1     1254
Name: TARGET_B, dtype: int64

array([[13333,  9266],
       [  550,   704]], dtype=int64)

- I cross-validate my Random Forest Classifier
- the accuracy is about 0.61 on every fold

In [31]:
from sklearn.model_selection import cross_validate

# 5-fold cross-validation of the random forest on the training matrix.
# NOTE(review): X_train was oversampled with replacement before this call, so
# copies of the same class-1 row can end up in both the fitting and the
# validation part of a fold; the fold scores may therefore be optimistic.
results = cross_validate(estimator=clf, X=X_train, y=y_train, cv=5)

In [32]:
# Per-fold test scores followed by their mean.
fold_scores = results['test_score']
print(fold_scores)
print(fold_scores.mean())

[0.6205679  0.61615419 0.61585994 0.61174047 0.61604384]
0.6160732676180668


- I will create a pipeline to compare my models

In [33]:
# Compare the two estimators by their mean 10-fold cross-validated score on
# the training matrix. numpy and cross_val_score come from the first cell.
model1, model2 = clf, LR

model_pipeline = [model1, model2]
model_names = ['RandomForestClassifier', 'Logistic Regression']

scores = {}
for model_name, model in zip(model_names, model_pipeline):
    fold_scores = cross_val_score(model, X_train, y_train, cv=10)
    mean_score = np.mean(fold_scores)
    scores[model_name] = mean_score
print(scores)

{'RandomForestClassifier': 0.6168824481388848, 'Logistic Regression': 0.5968147712225982}
