In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the churn dataset from the working directory into a DataFrame.
df = pd.read_csv(r'LWC_CHURN_V2.csv')

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Summarise the string-typed (object dtype) columns and export the summary.
# The builtin `object` replaces the `np.object` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (where it raises AttributeError).
df_string = df.select_dtypes(include=object)
df_string.info()
# NOTE(review): index=False drops the statistic labels that describe() keeps in
# its index, so the rows of the CSV are unlabeled -- confirm this is intended.
df_string.describe().to_csv('string_des.csv', index=False)

In [None]:
# One-hot encode the categorical columns listed below (the first level of each
# is dropped via drop_first=True) and swap them for their dummy columns.
categorical_cols = ['SA_CITY', 'SA_PROV', 'age_band', 'MC_SGMNT_FACTOR',
                    'SERV_PROV', 'COID', 'FSALDU']
# The dtype is pinned to an integer type: pandas >= 2.0 returns boolean dummies
# by default, and the later `select_dtypes(include=np.number)` step would then
# silently discard every dummy column before modelling. uint8 matches the
# default of older pandas versions.
# NOTE(review): without `columns=`, get_dummies only encodes object/string/
# category columns; if any listed column is numeric it passes through
# unchanged and is then removed by the drop below -- verify the dtypes.
data_dummy = pd.get_dummies(df[categorical_cols], drop_first=True, dtype=np.uint8)
# Append the dummy columns, then drop the original categorical columns.
df = pd.concat([df, data_dummy], axis=1).drop(categorical_cols, axis=1)
# Last expression of the cell so the preview is actually displayed.
df.head()


In [None]:
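# Disabled alternative to pd.get_dummies, kept for reference. If re-enabled on
# scikit-learn >= 1.2, use `sparse_output=False` instead of `sparse=False` and
# `get_feature_names_out()` instead of `get_feature_names()`.
# from sklearn.preprocessing import OneHotEncoder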

# enc=OneHotEncoder(handle_unknown='ignore',sparse=False)
# enc=enc.fit(df[['SA_CITY','SA_PROV', 'age_band', 'MC_SGMNT_FACTOR','SERV_PROV','COID','FSALDU']])
# enc.transform(df[['SA_CITY','SA_PROV', 'age_band', 'MC_SGMNT_FACTOR','SERV_PROV','COID','FSALDU']])
# _ohe_array=enc.transform(df[['SA_CITY','SA_PROV', 'age_band', 'MC_SGMNT_FACTOR','SERV_PROV','COID','FSALDU']])
# _ohe_name=enc.get_feature_names()
# for i in range(_ohe_array.shape[1]):
#      df[_ohe_name[i]]=_ohe_array[:,i]

In [None]:
from sklearn.model_selection import train_test_split
# Separate the 'lwc_churn' target from the features, then hold out 20% of the
# rows for testing; the fixed random_state keeps the split reproducible.
Y = df['lwc_churn']
X = df.drop(columns='lwc_churn')

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#Baseline model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,confusion_matrix
# Baseline: a class-weighted random forest trained on the numeric features.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
    max_depth=25,
    min_samples_split=20,
    class_weight='balanced',
)
# Keep numeric columns only and replace missing values with 0.
X_train = X_train.select_dtypes(include=np.number).fillna(0)
X_test = X_test.select_dtypes(include=np.number).fillna(0)

clf.fit(X_train, Y_train)
train_preds = clf.predict(X_train)
test_preds = clf.predict(X_test)

# Report the same metrics for both splits.
for split, features, truth, preds in (
    ('Training', X_train, Y_train, train_preds),
    ('Testing', X_test, Y_test, test_preds),
):
    # ROC AUC is a ranking metric, so it is scored on the predicted probability
    # of the second class in clf.classes_ rather than on hard class labels.
    positive_proba = clf.predict_proba(features)[:, 1]
    print('{} Accuracy: {:.4f}'.format(split, accuracy_score(truth, preds)))
    print('{} f1: {:.4f}'.format(split, f1_score(truth, preds)))
    print('{} roc: {:.4f}'.format(split, roc_auc_score(truth, positive_proba)))
    # A confusion matrix is a 2-D array and cannot go through a float format
    # spec (the original '{:4f}' raised TypeError), so it is printed as-is.
    print('{} confusion matrix:'.format(split))
    print(confusion_matrix(truth, preds))


In [None]:
import matplotlib.pyplot as plt
import scikitplot as skplt
# Cumulative gains chart for the baseline model on the held-out test set.
predicted_probas = clf.predict_proba(X_test)
# The test labels live in `Y_test`; the original `y_test` is not defined in
# any visible cell and raised NameError.
skplt.metrics.plot_cumulative_gain(Y_test, predicted_probas)
plt.show()

In [None]:
# Display (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Display (rows, columns) of the test feature matrix.
X_test.shape

In [None]:
# Feature selection (variance threshold: an unsupervised filter method)
from sklearn.feature_selection import VarianceThreshold
# Fit a variance filter (threshold 0.15) on the training features only and
# keep the columns that pass it; `sel` stays fitted for reuse on other data.
sel = VarianceThreshold(threshold=0.15)
X_train_new1 = sel.fit_transform(X_train)
X_train_new1.shape

In [None]:
# Apply the selector fitted on the training data to the test features, so the
# test matrix keeps the same columns as X_train_new1.
X_test_new1=sel.transform(X_test)
X_test_new1.shape

In [None]:
# Same forest configuration as the baseline, trained on the variance-filtered
# features.
clf_1 = RandomForestClassifier(
    random_state=42,
    n_estimators=1000,
    max_depth=25,
    min_samples_split=20,
    class_weight='balanced',
)
# The original re-applied select_dtypes/fillna to X_train here. That was dead
# code: X_train was already preprocessed and this model is fitted on
# X_train_new1, so the line has been removed.
clf_1.fit(X_train_new1, Y_train)
train_preds_1 = clf_1.predict(X_train_new1)
test_preds_1 = clf_1.predict(X_test_new1)

# Report the same metrics for both splits.
for split, features, truth, preds in (
    ('Training', X_train_new1, Y_train, train_preds_1),
    ('Testing', X_test_new1, Y_test, test_preds_1),
):
    # ROC AUC is a ranking metric, so it is scored on the predicted probability
    # of the second class in clf_1.classes_ rather than on hard class labels.
    positive_proba = clf_1.predict_proba(features)[:, 1]
    print('{} Accuracy: {:.4f}'.format(split, accuracy_score(truth, preds)))
    print('{} f1: {:.4f}'.format(split, f1_score(truth, preds)))
    print('{} roc: {:.4f}'.format(split, roc_auc_score(truth, positive_proba)))

In [None]:
#Hyperparameter tuning
# Grid-search tree depth, minimum split size and forest size with 3-fold
# cross-validation, ranking candidates by F1.
# class_weight='balanced' matches the other forests in this notebook, so the
# parameters are tuned for the same configuration they are later applied to.
model = RandomForestClassifier(random_state=42, class_weight='balanced')
params = {
    'max_depth': [20, 25, 30],
    'min_samples_split': [10, 20, 30],
    'n_estimators': [1000, 1500, 2000],
}

# n_jobs=-1 runs the candidate fits in parallel; with the fixed random_state
# the selected parameters and scores are unchanged.
search = GridSearchCV(model, params, scoring='f1', cv=3, verbose=1, n_jobs=-1)
search = search.fit(X_train_new1, Y_train)
print(search.best_params_, search.best_score_)

In [None]:
#Changed the parameters to the recommended one
# Refit with the hyperparameters selected by the grid search. The original
# hard-coded the same values already used for the earlier forests, which made
# this model a duplicate of them regardless of what the search found.
clf_3 = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
    **search.best_params_,
)
clf_3.fit(X_train_new1, Y_train)
train_preds_3 = clf_3.predict(X_train_new1)
# Also score the held-out set: training metrics alone cannot show whether the
# tuned model generalises.
test_preds_3 = clf_3.predict(X_test_new1)

for split, features, truth, preds in (
    ('Training', X_train_new1, Y_train, train_preds_3),
    ('Testing', X_test_new1, Y_test, test_preds_3),
):
    # ROC AUC is a ranking metric, so it is scored on the predicted probability
    # of the second class in clf_3.classes_ rather than on hard class labels.
    positive_proba = clf_3.predict_proba(features)[:, 1]
    print('{} Accuracy: {:.4f}'.format(split, accuracy_score(truth, preds)))
    print('{} f1: {:.4f}'.format(split, f1_score(truth, preds)))
    print('{} roc: {:.4f}'.format(split, roc_auc_score(truth, positive_proba)))