In [5]:
#Import the essential libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [6]:
#Import dataset
# NOTE(review): this is an absolute, machine-specific location; point DATA_PATH
# at your local copy of the CSV before running.
DATA_PATH = 'E:/Portfolio/PwC/Task 3/02 Churn-Dataset.csv'
df = pd.read_csv(DATA_PATH)

#prepare the dataset by encoding the churn column as 1 ('Yes') and 0 ('No')
# The result is assigned back to the column on purpose: calling
# replace(..., inplace=True) on the df['Churn'] selection is a chained in-place
# operation that does not update df under pandas Copy-on-Write.
# Any value other than 'Yes'/'No' becomes NaN with map().
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

#Convert TotalCharges to numeric
# errors='coerce' turns every value that cannot be parsed into NaN.
# NOTE(review): if any NaN is produced here it flows into the feature matrix
# used later in the notebook - confirm the column has no unparseable entries.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors='coerce')

In [3]:
# Encode the text (object-dtype) columns of df, leaving 'customerID' untouched:
#   * columns with exactly two distinct values are label-encoded in place
#   * every other text column is collected and one-hot encoded afterwards
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Text columns that are candidates for encoding, in their original order
text_columns = [
    name for name in df.columns
    if df[name].dtype == object and name != 'customerID'
]

dummy_columns = []  # text columns that are not binary
for name in text_columns:
    if df[name].nunique() != 2:
        dummy_columns.append(name)
        continue
    # binary column: map its two values to integer codes
    df[name] = le.fit_transform(df[name])

# One-hot encode the collected columns as integer indicator columns
df = pd.get_dummies(df, columns=dummy_columns, dtype=int)

In [4]:
# Split df into the model inputs and the target
# X: every column except the target and the customer identifier
X = df.drop(columns=['Churn', 'customerID'])
# y: the 'Churn' column as a plain array of values
y = df['Churn'].values

In [5]:
# Rescale every feature column to the range 0..1
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed further down in this notebook.
features = X.columns.values
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit and transform in one step, then restore the original column names
X = pd.DataFrame(scaler.fit_transform(X), columns=features)

In [6]:
# Hold out 5% of the rows as the test set; the fixed seed keeps the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=101,
)
X_train, X_test, y_train, y_test = split_parts

In [7]:
# Random Forest
# A fixed random_state makes the fitted forest - and therefore its accuracy and
# feature importances - repeatable between runs; without it every run of this
# cell produces a different model. 101 matches the seed used for the split.
rf_model = RandomForestClassifier(random_state=101)
rf_model.fit(X_train, y_train)
# Class predictions for the held-out rows
ypred = rf_model.predict(X_test)

In [8]:
# Logistic Regression: lbfgs solver with the iteration cap raised to 1000
log_model = LogisticRegression(solver='lbfgs', max_iter=1000)
log_model.fit(X_train, y_train)
# Class predictions for the held-out rows
y_pred2 = log_model.predict(X_test)

In [9]:
# XGBoost: gradient-boosted trees with a binary logistic objective
xg_model = XGBClassifier(
    max_depth=5,
    learning_rate=0.08,
    objective='binary:logistic',
    n_jobs=-1,
)
xg_model.fit(X_train, y_train)
# Class predictions for the held-out rows
y_pred3 = xg_model.predict(X_test)

In [10]:
# Results table: one row per model, a single 'Score' column.
# It starts with an empty 'Random Forest' row.
score_columns = ['Score']
initial_rows = ['Random Forest']
model_eval = pd.DataFrame(columns=score_columns, index=initial_rows)


Unnamed: 0,Score
Random Forest,


In [22]:
# Record the test-set accuracy of each model in the results table
predictions_by_model = {
    'Random Forest': ypred,
    'LogisiticRegression': y_pred2,
    'XGBoost': y_pred3,
}
for model_label, predicted in predictions_by_model.items():
    # accuracy is the share of matching labels, so argument order does not affect the value
    model_eval.loc[model_label, 'Score'] = accuracy_score(y_test, predicted)
# Display the filled-in table
model_eval

Unnamed: 0,Score
Random Forest,0.827195
LogisiticRegression,0.827195
XGBoost,0.849858


In [21]:
# Save a fitted model to disk and tabulate the Random Forest feature importances
import joblib
# NOTE(review): the estimator written to the file "model" is the logistic
# regression (log_model), not the Random Forest used below - confirm that this
# is the model meant to be persisted.
joblib.dump(log_model, "model")
# One row per feature (indexed by feature name) holding its importance in the forest.
# The bare `rf_model.feature_importances_` statement that preceded this line was
# dropped: its value was neither displayed nor stored.
weights = pd.DataFrame(rf_model.feature_importances_, index=X.columns.values)

In [19]:
# Tabulate the XGBoost feature importances
# One row per feature (indexed by feature name) holding its importance in the model.
# The bare `xg_model.feature_importances_` statement that preceded this line was
# dropped: its value was neither displayed nor stored.
weights2 = pd.DataFrame(xg_model.feature_importances_, index=X.columns.values)

In [20]:
#Add churn probability of each customers
# xg_model was trained on the MinMax-scaled feature frame X, so it has to be
# scored on X as well; feeding it the raw, unscaled columns of df puts the
# inputs on a different scale from the training data and distorts the result.
# X has one row per row of df in the same order, so the probabilities line up
# positionally. Column 1 of predict_proba is the probability of class 1 (churn).
df['Churn probability'] = xg_model.predict_proba(X[X_train.columns])[:, 1]