## 2.2. Variable Transformation and Sampling

In [185]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import RobustScaler

### Read in the whole data set, encode categorical var and split

In [186]:
df=pd.read_csv('last_data_v5_prioroutliers.csv')

In [187]:
df_encoded=pd.get_dummies(df, columns=['city', 'gender', 'registered_via'])

In [188]:
X=df_encoded
y=df_encoded['is_churn']

In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)


In [190]:
X_train.to_csv('final_train_after_split.csv')

In [191]:
y_train.to_csv('final_train_target_after_split.csv')

In [192]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(613298, 157)
(262843, 157)
(613298,)
(262843,)


### Apply Isolation forest, Standard Scaler, Undersampling & Evaluate

#### Isolation Forest

In [193]:
from sklearn.ensemble import IsolationForest

X_wihout_outliers=X_train.drop('msno',axis=1)

clf = IsolationForest(max_samples=100)
clf.fit(X_wihout_outliers)
inlier=clf.predict(X_wihout_outliers)# Predict if a particular sample is an outlier or not. 
#If inlier==1, it means this sample is not outlier.

print(len(inlier))
print(np.count_nonzero(inlier == 1))
print(np.count_nonzero(inlier == -1))

613298
551968
61330


In [194]:
merge_data=X_train #msno is in
merge_data['isolation_forest']=inlier
a=merge_data.shape[1]
merge_data= merge_data[(merge_data.iloc[:,a-1]==1)]#remove all the rows which have outliers in LOF
X_train_without_outliers=merge_data.iloc[:,:a-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [195]:
X_train_without_outliers.shape

(551968, 157)

#### Standard Scaler

In [196]:
# Get all the columns from the dataframe.
columns = df_encoded.columns.tolist()
# Filter the columns to remove ones we don't want. 
columns_subset = [c for c in columns if c not in ["is_churn","Unnamed: 0","Unnamed: 0.1","msno"]]
# Store the variable we'll be predicting on.
target = "is_churn"

X_to_std = X_train_without_outliers[columns_subset]

In [197]:
sc=StandardScaler()
X_std=sc.fit_transform(X_to_std)

In [198]:
X_std.shape

(551968, 154)

In [199]:
#convert back to data frame
data_std=pd.DataFrame(X_std)
#restore original df column names
data_std.columns=X_to_std.columns
data_std = data_std.set_index(X_train_without_outliers.index)
data_std['msno']=X_train_without_outliers['msno']
data_std['is_churn']=X_train_without_outliers['is_churn']
data_std['Temp_index']=X_train_without_outliers['Unnamed: 0']

In [200]:
data_std.head()

Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,msno,is_churn,Temp_index
295235,1.13051,-0.843256,-0.921635,-0.959133,-0.83619,-0.924143,-1.019612,-0.956177,-1.066259,-1.110195,...,1.847228,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,wIPYvtreB5Md5iEwdkrxOUEPYdQrsgOJxLR/9jad8sY=,0,295236
716291,-0.811134,-0.307139,-0.24102,0.117182,0.028874,0.756436,-0.233113,0.814624,-0.646871,-0.606938,...,-0.541352,0.87246,-0.357804,-0.23291,0.863924,-0.591173,-0.05888,03+NQd0KuWm3+uM4icdgWlOBTeD0tWZrtfWcv/NbdJY=,0,716293
222115,1.568945,-0.305101,-0.449742,-0.221931,-0.338482,1.464592,1.239425,1.357828,-0.484902,-0.628346,...,-0.541352,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,8ELW2uv0+Bgsvu9W6Ka86/Sr+fJlKwD9PtGwZpjj+jg=,0,222116
800112,1.506312,0.47767,0.20365,-0.015515,-0.21998,-0.627397,-0.227203,-0.577305,2.679527,1.905992,...,1.847228,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,8x8XmtxbaTyLpAUYxGcn4snJpBfS/nYIt45jvkFyZHc=,0,800114
765230,-0.811134,-0.843256,-0.921635,-0.959133,-0.83619,-0.924143,-1.019612,-0.956177,-1.066259,-1.110195,...,-0.541352,0.87246,-0.357804,-0.23291,0.863924,-0.591173,-0.05888,2V0apgsiDor6e5ZScjMG8Xc4DQMj870rSUCQD6HkxHQ=,0,765232


In [201]:
# Also need to reduce the size of the train target correspondingly
y_train=data_std['is_churn']
y_train.shape

(551968,)

#### Focused Undersampling

In [202]:
X_no_msno=data_std.drop('msno',axis=1)

In [203]:
col_names=X_no_msno.columns
col_types =X_no_msno.dtypes

#return_indices=True so that we can restore the original "msno" 
rus = RandomUnderSampler(random_state=42,return_indices=True)
#Resample the training data sets in order to balance class distribution!
X_res, y_res,idx_resampled = rus.fit_sample(X_no_msno, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))

# convert np back to dataframe
X_res = pd.DataFrame(X_res)
X_res.columns = col_names

y_res=pd.DataFrame(y_res)
#y_res.columns=col_names_y

for col_name, col_type in zip(col_names, col_types):
    X_res[col_name] = X_res[col_name].astype(col_type)

X_res['is_churn']=y_res
#X_res['index']=idx_resampled

X_res.head()

Resampled dataset shape Counter({0: 35299, 1: 35299})


Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_0,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,is_churn,Temp_index
0,1.005242,1.447979,1.855278,2.181346,1.498299,4.2795,4.180203,4.400011,0.677988,0.887971,...,-0.508672,1.847228,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,0,686193
1,1.067876,1.812864,0.403297,0.338342,0.218478,1.165877,1.489773,1.256013,1.2336,-0.025824,...,1.965905,-0.541352,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,0,600312
2,-0.811134,0.691709,2.526819,1.119776,0.976891,0.145719,0.624301,0.297107,0.118161,1.404949,...,-0.508672,-0.541352,0.87246,-0.357804,-0.23291,0.863924,-0.591173,-0.05888,0,858188
3,1.13051,-0.669986,-0.476966,-0.413604,-0.563636,-0.450727,-0.443168,-0.440912,-0.749262,-0.34124,...,-0.508672,1.847228,-1.146185,-0.357804,-0.23291,-1.15751,1.691552,-0.05888,0,857750
4,-0.811134,1.466325,1.773604,0.36783,0.254028,-0.497478,0.216546,-0.40454,1.081068,1.258413,...,-0.508672,-0.541352,0.87246,-0.357804,-0.23291,0.863924,-0.591173,-0.05888,0,687403


In [204]:
msno_index=data_std[['Temp_index','msno']]

In [205]:
final_result = pd.merge(X_res, msno_index, on='Temp_index')

final_train_StdSc=final_result.drop('Temp_index',axis=1)

In [206]:
final_train_StdSc.to_csv('final_train_data_StdSc.csv')

In [209]:
final_train_StdSc.shape

(70598, 156)

#### Evaluate Performance of Standard Scaler 

##### Data used for both models

In [210]:
df_train_data=pd.read_csv('final_train_data_StdSc.csv')

In [211]:
data_train= df_train_data.drop(['is_churn','msno','Unnamed: 0'],axis=1)
target_train = df_train_data['is_churn']

In [212]:
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

##### Decision Tree

In [213]:
import statsmodels.api as sm
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [214]:
def crossvalid(data_train, target_train, est, nsplit):
    data_train_val=data_train.values
    n=data_train.shape[0]
    target_train_val=np.reshape(target_train.values,[n,])

    cross_val = StratifiedKFold(n_splits=nsplit, shuffle=True, random_state=42)
    acc_each_split = cross_val_score(estimator=est, X=data_train_val, y=target_train_val, cv=5, scoring='accuracy')
    return acc_each_split.mean()

In [215]:
from sklearn import tree
decision_tree = tree.DecisionTreeClassifier()
crossvalid(data_train, target_train, decision_tree, 10)

0.893821328687746

In [216]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results_dec_tree_stdsc = cross_val_score(decision_tree, data_train, target_train, cv=kfold)

In [217]:
print("Results: %.2f%% (%.2f%%)" % (results_dec_tree_stdsc.mean()*100, results_dec_tree_stdsc.std()*100))

Results: 89.66% (0.29%)


##### ANN

In [218]:
##https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

In [219]:
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=154, kernel_initializer='normal', activation='relu'))
    #if we want a hidden layer :
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [220]:
#Cross validation without selecting a features subset
estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, data_train, target_train, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 91.89% (0.34%)


### Apply Robust Scaler, Undersample & Evaluate

In [221]:
df=pd.read_csv('final_train_after_split.csv')
# Get all the columns from the dataframe.
columns = df.columns.tolist()
# Filter the columns to remove ones we don't want. 
columns_subset = [c for c in columns if c not in ["is_churn","Unnamed: 0","Unnamed: 0.1","msno"]]
# Store the variable we'll be predicting on.
target = "is_churn"
#DF for standardization
X = df[columns_subset]

#### Apply Robust Scaler

In [222]:
rsc=RobustScaler()
data_std_2=rsc.fit_transform(X)

In [223]:
#convert back to data frame
data_std=pd.DataFrame(data_std_2)
#restore original df
data_std.columns=X.columns
data_std['msno']=df['msno']
data_std['is_churn']=df['is_churn']
data_std['Temp_index']=df['Unnamed: 0']

In [224]:
X_no_msno=data_std.drop('msno',axis=1)


In [225]:
X_no_msno.head()

Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_0,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,is_churn,Temp_index
0,0.0,1.577815,4.673203,2.080808,1.613208,1.633598,1.284561,1.533956,0.890235,3.629874,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,745985
1,1.148148,-0.478477,-0.503268,-0.515152,-0.5,-0.508814,-0.541877,-0.523157,-0.807804,-0.869125,...,0.0,1.0,-1.0,0.0,0.0,-1.0,1.0,0.0,0,295235
2,0.0,-0.043046,-0.013072,0.222222,0.188679,0.671621,-0.049445,0.674216,-0.428397,-0.419577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,716291
3,1.407407,-0.041391,-0.163399,-0.010101,-0.103774,1.169029,0.872519,1.041517,-0.281869,-0.438701,...,1.0,0.0,-1.0,0.0,0.0,-1.0,1.0,0.0,0,222115
4,1.37037,0.594371,0.30719,0.131313,-0.009434,-0.30038,-0.045745,-0.266973,2.58089,1.825162,...,0.0,1.0,-1.0,0.0,0.0,-1.0,1.0,0.0,0,800112


In [226]:
y_train=X_no_msno[target]

#### Undersample

In [227]:
col_names=X_no_msno.columns
col_types =X_no_msno.dtypes

#return_indices=True so that we can restore the original "msno" 
rus = RandomUnderSampler(random_state=42,return_indices=True)
#Resample the training data sets in order to balance class distribution!
X_res, y_res,idx_resampled = rus.fit_sample(X_no_msno, y_train)
print('Resampled dataset shape {}'.format(Counter(y_res)))

# convert np back to dataframe
X_res = pd.DataFrame(X_res)
X_res.columns = col_names

y_res=pd.DataFrame(y_res)
#y_res.columns=col_names_y

for col_name, col_type in zip(col_names, col_types):
    X_res[col_name] = X_res[col_name].astype(col_type)

X_res['is_churn']=y_res
#X_res['index']=idx_resampled

X_res.head()

Resampled dataset shape Counter({0: 40349, 1: 40349})


Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_0,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,is_churn,Temp_index
0,0.962963,0.299669,0.150327,-0.070707,0.037736,0.375043,0.350824,0.328763,0.520763,0.305368,...,0.0,1.0,-1.0,0.0,1.0,-1.0,0.0,0.0,0,263691
1,1.518519,0.82947,0.901961,1.252525,1.471698,2.783616,1.735957,2.607264,0.244751,0.321078,...,0.0,1.0,-1.0,0.0,0.0,-1.0,1.0,0.0,0,138543
2,1.0,2.928808,2.777778,2.242424,2.943396,1.044936,0.905146,1.104241,4.932058,4.948217,...,1.0,0.0,-1.0,0.0,1.0,-1.0,0.0,0.0,0,534031
3,1.37037,0.344371,0.222222,0.464646,0.103774,-0.111994,-0.099899,-0.068988,0.508168,0.352051,...,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0,379174
4,0.0,0.764901,0.96732,1.292929,1.0,0.551676,0.47595,0.587724,0.246513,0.443312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,694943


In [228]:
#merge msno_index and X_res based on the temp index
msno_index=data_std[['Temp_index','msno']]
final_result = pd.merge(X_res, msno_index, on='Temp_index')
final_result.head()

Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,is_churn,Temp_index,msno
0,0.962963,0.299669,0.150327,-0.070707,0.037736,0.375043,0.350824,0.328763,0.520763,0.305368,...,1.0,-1.0,0.0,1.0,-1.0,0.0,0.0,0,263691,Js++0pNy8YJd8+Fn0wUQ/jM1s9I5kQuJ7qE/jR83+OQ=
1,1.518519,0.82947,0.901961,1.252525,1.471698,2.783616,1.735957,2.607264,0.244751,0.321078,...,1.0,-1.0,0.0,0.0,-1.0,1.0,0.0,0,138543,L9+JXElv5ngSSM44AjcQRWKQE9j8DhZh5yyN/6x3ifk=
2,1.0,2.928808,2.777778,2.242424,2.943396,1.044936,0.905146,1.104241,4.932058,4.948217,...,0.0,-1.0,0.0,1.0,-1.0,0.0,0.0,0,534031,trybWgwl99EIQ3xH/44GzuYylncKsXVwJFS9ak3gUKE=
3,1.37037,0.344371,0.222222,0.464646,0.103774,-0.111994,-0.099899,-0.068988,0.508168,0.352051,...,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0,379174,vP4d7eNSNu65P0c99wbQ7Ihrh2i/kVWyzpsj4lcx2Do=
4,0.0,0.764901,0.96732,1.292929,1.0,0.551676,0.47595,0.587724,0.246513,0.443312,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,694943,/RrH5SkBX1Qp4S+LCIaiRsB47QsBU3CDvsDn17edr2E=


In [229]:
final_train_RobSc=final_result.drop('Temp_index',axis=1)

In [230]:
final_train_RobSc.shape

(80698, 156)

#### Evaluate performance of Robust Scaler

##### Decision Tree

In [231]:
df_train_data=final_train_RobSc
data_train= df_train_data.drop(['is_churn','msno'],axis=1)
target_train = df_train_data['is_churn']

In [232]:
decision_tree = tree.DecisionTreeClassifier()
crossvalid(data_train, target_train, decision_tree, 10)

0.88787813534534765

In [233]:
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results_dec_tree_stdsc = cross_val_score(decision_tree, data_train, target_train, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results_dec_tree_stdsc.mean()*100, results_dec_tree_stdsc.std()*100))

Results: 88.59% (0.39%)


##### ANN

In [234]:
#Cross validation without selecting a features subset
estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, data_train, target_train, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 90.82% (0.64%)


### Statistics 
- Std Scaler - train : (70 598, 156), test: (262 843)

Decision Tree:
Results: 89.66% (0.29%)
ANN: 
Results: 91.89% (0.34%)

- Robust Scaler- train: (80 698, 156), test: (262 843)

Decision Tree: 
Results: 88.59% (0.39%)
ANN:  Results: 90.82% (0.64%)

Models applied to the dataset with Standard Scaler perform better.

### Apply Standard scaler to Test data set

In [236]:
#Standardize X_test
# Get all the columns from the dataframe.
columns = X_test.columns.tolist()
# Filter the columns to remove ones we don't want. 
columns_subset = [c for c in columns if c not in ["is_churn","Unnamed: 0","Unnamed: 0.1","msno"]]
X_to_std=X_test[columns_subset]

sc=StandardScaler()
data_test_standardized=sc.fit_transform(X_to_std)
#convert back to data frame
data_test_std=pd.DataFrame(data_test_standardized)
#restore original df
data_test_std.columns=X_to_std.columns
data_test_std = data_test_std.set_index(X_test.index)
data_test_std['msno']=X_test['msno']
data_test_std['is_churn']=X_test['is_churn']

In [237]:
data_test_std.shape

(262843, 156)

In [238]:
data_test_std.head()

Unnamed: 0,bd,num_25_sum,num_50_sum,num_75_sum,num_985_sum,num_100_sum,num_unq_sum,num_totalsec_sum,num_25_mean,num_50_mean,...,gender_0,gender_1,gender_2,registered_via_3,registered_via_4,registered_via_7,registered_via_9,registered_via_13,msno,is_churn
120957,-0.840527,-0.522259,-0.602739,-0.58422,-0.377952,-0.114536,-0.19243,-0.146375,-0.703467,-0.759848,...,-0.524053,-0.556615,0.908187,2.711588,-0.243489,-1.105817,-0.608882,-0.061113,fAbsh3kAffPa/1ZZZbo2ItlNou9tn22EMRTvF4clEak=,0
289553,0.792488,-0.58105,-0.640079,-0.575303,-0.433727,-0.338979,-0.497846,-0.337333,-0.665328,-0.703208,...,1.908205,-0.556615,-1.101094,-0.368788,-0.243489,-1.105817,1.642354,-0.061113,3RQH2OV7U01TisVNUqxgylb7is8BaFQUnozvU8AaWhA=,0
525621,-0.840527,2.572062,0.634802,1.011987,0.525592,1.136128,1.511263,1.035867,1.827277,0.159349,...,-0.524053,-0.556615,0.908187,-0.368788,-0.243489,0.904309,-0.608882,-0.061113,9iEb9xcrVceAYOst3Su+Edkqfrx3RTZE1lgpROT9wX4=,0
383252,1.106529,-0.337487,-0.261348,-0.236443,-0.271981,-0.261964,-0.285715,-0.268519,-0.503196,-0.407678,...,-0.524053,1.796573,-1.101094,-0.368788,-0.243489,0.904309,-0.608882,-0.061113,FxLIZcN3AYC1LYvWCG0hNGkcfmnP9MwGUxRxAXY3r5M=,0
274462,1.671803,-0.207907,-0.11199,-0.067013,0.034778,-0.332928,-0.290694,-0.320169,-0.297483,-0.177535,...,-0.524053,1.796573,-1.101094,2.711588,-0.243489,-1.105817,-0.608882,-0.061113,dz5+eXZINBe7apJ8K7v4S+zzDi3o7tAeLMWAnrcIc54=,0


In [239]:
data_test_std.to_csv('final_test_data_StdSc.csv')