# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Importing the Data for SVM Implementation

In [None]:
# TODO: supply the dataset path - the empty string below is a placeholder and cannot be opened.
# Later cells read the 'customer_id' and 'churn' columns from this frame.
df=pd.read_csv('')

# Checking the data

In [None]:
df.shape

In [None]:
# Preview the first rows. head must be *called*; assigning to df.head would
# replace the method with a tuple and display nothing.
df.head()

In [None]:
df.dtypes

# Setting Display options to ensure feature name visibility

In [None]:
pd.set_option('display.max_columns',None)

# Warning Suppression 

In [None]:
import warnings
warnings.filterwarnings('ignore')

# How many rows have a missing ID?

In [None]:
df['customer_id'].isnull().sum()

# Drop ID Feature from the dataset

In [None]:
# The identifier carries no predictive signal, so remove it before modelling.
df = df.drop(columns=['customer_id'])

# Defining Target and Independent Features

In [None]:
# Target is kept as a one-column DataFrame; all remaining columns are candidate predictors.
Y = df.loc[:, ['churn']]
X = df.drop(columns=['churn'])

# Get the Response Rate

In [None]:
Y.mean()

# Split features into Numerical and Categorical

In [None]:
# Numeric columns go through capping and mean imputation;
# object columns go through mode imputation and dummy encoding.
num = X.select_dtypes(include=["number"])
char = X.select_dtypes(include=["object"])

# Outlier Analysis of Numerical Features

In [None]:
num.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

# Capping and Flooring of outliers

In [None]:
def outlier_cap(x):
    """Floor a numeric Series at its 1st percentile, then cap it at the 99th.

    The ceiling is computed from the already-floored values (not from the raw
    input), so the two steps are applied strictly in sequence.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to be capped.

    Returns
    -------
    pandas.Series
        A new Series with both tails clipped; the input is not modified.
    """
    floored = x.clip(lower=x.quantile(0.01))
    ceiling = floored.quantile(0.99)
    return floored.clip(upper=ceiling)

In [None]:
# Apply the percentile capping column by column.
num = num.apply(outlier_cap)

In [None]:
num.describe(percentiles=[0.01,0.05,0.10,0.25,0.50,0.75,0.85,0.9,0.99])

# Missing Value Analysis

In [None]:
num.isnull().mean()

# Missing Value Handling - Numerical Features (Drop Columns with >25% Missing Values)

In [None]:
# Keep only the numeric columns whose share of missing values is at most 25%.
num_missing_share = num.isna().mean()
num = num.loc[:, num_missing_share <= 0.25]

# Missing Value Handling - Numerical Features (Imputation with Mean)

In [None]:
from sklearn.impute import SimpleImputer

# Fill the remaining numeric gaps with each column's mean, then rebuild a
# DataFrame that carries the original row index and column labels.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(num)
num_1 = pd.DataFrame(imputer.transform(num), index=num.index, columns=num.columns)

In [None]:
num_1.isna().sum()

# Missing Value Handling - Categorical Features (Drop Columns with >25% Missing Values)

In [None]:
# Keep only the categorical columns whose share of missing values is at most 25%.
char_missing_share = char.isna().mean()
char = char.loc[:, char_missing_share <= 0.25]

# Missing Value Handling - Categorical Features (Imputation with Mode)

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing categorical values with each column's most frequent level,
# then rebuild a DataFrame with the original row index and column labels.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(char)
char_1 = pd.DataFrame(imputer.transform(char), index=char.index, columns=char.columns)

In [None]:
char_1.isnull().mean()

# Encode Categorical Features

In [None]:
# One-hot encode the imputed categoricals; the first level of every feature is
# dropped, leaving n-1 indicator columns per feature.
X_char_dum = pd.get_dummies(data=char_1, drop_first=True)
X_char_dum.shape

# Creating the Master Feature Set for Model Development

In [None]:
# Place the encoded categoricals and the imputed numerics side by side,
# keeping only the row labels present in both frames.
feature_blocks = [X_char_dum, num_1]
X_all = pd.concat(feature_blocks, axis="columns", join="inner")

In [None]:
# Standardise every feature to zero mean and unit variance.
# (StandardScaler does not bound values to [-1, 1].)
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling - consider
# fitting on the training rows only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Rebuild a labelled frame from the scaled array; every column gets a '_std' suffix.
X_std=pd.DataFrame(scaler.fit_transform(X_all),index=X_all.index,columns=X_all.columns).add_suffix('_std')
X_std.head()

In [None]:
Y['churn'].value_counts()

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed. Stratifying on the target keeps the churn rate
# the same in both partitions, so the hold-out metrics are not distorted by an
# unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, Y, test_size=0.3, random_state=42, stratify=Y
)

In [None]:
# Report partition sizes and the churn rate in each partition.
for caption, value in (
    ("Shape of Training Data", X_train.shape),
    ("Shape of Testing Data", X_test.shape),
    ("Response Rate in Training Data", y_train.mean()),
    ("Response Rate in Testing Data", y_test.mean()),
):
    print(caption, value)

In [None]:
from sklearn.svm import SVC
# Base estimator with library defaults; it is handed to the grid search below,
# which sets the hyper-parameters listed in param_dist.
# Note: with the default probability=False, SVC does not expose predict_proba.
svm= SVC()

In [None]:
# One grid per kernel, so each kernel is paired only with the hyper-parameters
# it actually uses. This has to be a list of dicts: repeating keys inside a
# single dict literal keeps only the last value of each key, which would
# silently restrict the search to the polynomial kernel.
_c_grid = np.arange(0.1, 1, 0.1)
_gamma_grid = [0.01, 0.02, 0.03, 0.04, 0.05]
param_dist = [
    {'kernel': ['linear'], 'C': _c_grid},
    {'kernel': ['rbf'], 'C': _c_grid, 'gamma': _gamma_grid},
    {'kernel': ['poly'], 'C': _c_grid, 'gamma': _gamma_grid, 'degree': [2, 3, 4]},
]

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_dist with 10-fold cross-validation, scored on
# accuracy and run as 3 parallel jobs.
svm_model = GridSearchCV(svm, param_dist, cv=10, scoring='accuracy', n_jobs=3)
# y_train is a one-column DataFrame; scikit-learn expects a 1-d target, so flatten it.
svm_model.fit(X_train, y_train.values.ravel())
print('Best Parameters using grid search: \n', svm_model.best_params_)

In [None]:
# Model Evaluation
# Predict on the hold-out set with the tuned SVM from the grid search.
# The variable keeps its `_xgb` suffix because the metric cells that follow
# read it under this name.
y_pred_xgb = svm_model.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
# Hold-out accuracy, precision, recall and F1, printed with the original captions.
for caption, scorer in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("f1_score", metrics.f1_score),
):
    print(caption, scorer(y_test, y_pred_xgb))

In [None]:
# Confusion matrix for the tuned SVM on the hold-out set, i.e. the same rows
# the accuracy/precision/recall figures are computed on, using the scaled
# features the model was trained with.
# Built via ConfusionMatrixDisplay because metrics.plot_confusion_matrix has
# been removed from current scikit-learn releases.
test_cm = metrics.confusion_matrix(y_test, svm_model.predict(X_test))
metrics.ConfusionMatrixDisplay(confusion_matrix=test_cm, display_labels=svm_model.classes_).plot();

In [None]:
# Lorenz Curve

In [None]:
# SVC only exposes predict_proba when it is fitted with probability=True, so
# refit the best grid-search configuration once with probability estimates on.
prob_svm = SVC(probability=True, random_state=42, **svm_model.best_params_)
prob_svm.fit(X_train, y_train.values.ravel())

# Score all rows (training and test) on the scaled features the model was
# trained on; column 1 is the probability of the second label in prob_svm.classes_.
y_pred_prob = prob_svm.predict_proba(X_std)[:, 1]
# Attach by index so every probability lands on its own row of df.
df['pred_prob'] = pd.Series(y_pred_prob, index=X_std.index)

# Cut the scores into (up to) ten equal-sized bands; rank(method='first')
# breaks ties so the bin edges are unique. Band 10 holds the highest scores.
df['P_Rank_xgb'] = pd.qcut(df['pred_prob'].rank(method='first').values, 10, duplicates='drop').codes + 1

# Per-band row count, observed churn rate and mean predicted probability.
# The 'Actutal_event_rate' spelling is kept because the plotting cells read
# the column under that name.
rank_df_actuals = (
    df.groupby('P_Rank_xgb')['churn']
    .agg(['count', 'mean'])
    .rename(columns={'mean': 'Actutal_event_rate'})
)
rank_df_predicted = (
    df.groupby('P_Rank_xgb')['pred_prob']
    .agg(['mean'])
    .rename(columns={'mean': 'Predicted_event_rate'})
)
rank_df = pd.concat([rank_df_actuals, rank_df_predicted], axis=1, join="inner")

# Highest-scoring band first, so cumulative columns read as "top k bands".
sorted_rank_df = rank_df.sort_index(ascending=False)

# Events (churners) captured per band and cumulatively, as counts and as shares.
sorted_rank_df['N_events'] = sorted_rank_df['count'] * sorted_rank_df['Actutal_event_rate']
sorted_rank_df['cum_events'] = sorted_rank_df['N_events'].cumsum()
sorted_rank_df['event_cap'] = sorted_rank_df['N_events'] / sorted_rank_df['N_events'].sum()
sorted_rank_df['cum_event_cap'] = sorted_rank_df['event_cap'].cumsum()

# The same quantities for non-events.
sorted_rank_df['N_non_events'] = sorted_rank_df['count'] - sorted_rank_df['N_events']
sorted_rank_df['cum_non_events'] = sorted_rank_df['N_non_events'].cumsum()
sorted_rank_df['non_event_cap'] = sorted_rank_df['N_non_events'] / sorted_rank_df['N_non_events'].sum()
sorted_rank_df['cum_non_event_cap'] = sorted_rank_df['non_event_cap'].cumsum()

# KS per band: gap between cumulative event and non-event capture.
sorted_rank_df['KS'] = (sorted_rank_df['cum_event_cap'] - sorted_rank_df['cum_non_event_cap']).round(4)

# Baseline: share of the population contained in each band.
sorted_rank_df['random_cap'] = sorted_rank_df['count'] / sorted_rank_df['count'].sum()
sorted_rank_df['cum_random_cap'] = sorted_rank_df['random_cap'].cumsum()

# Decile 1 is the highest-scoring band.
sorted_reindexed = sorted_rank_df.reset_index()
sorted_reindexed['Decile'] = sorted_reindexed.index + 1
sorted_reindexed

In [None]:
# Actual vs. predicted event rate per decile (decile 1 = highest predicted
# probability). Both lines share one axes and are labelled so the legend
# tells them apart.
ax = sns.lineplot(x="Decile", y="Actutal_event_rate", data=sorted_reindexed, color='red', label='Actual event rate')
ax = sns.lineplot(x="Decile", y="Predicted_event_rate", data=sorted_reindexed, color='grey', label='Predicted event rate', ax=ax)
ax.set(title='Actual vs. predicted event rate by decile', xlabel='Decile', ylabel='Event rate');

In [None]:
# Cumulative capture by decile: events, non-events and the population share
# baseline, drawn on one axes with a legend.
ax = sns.lineplot(x="Decile", y="cum_event_cap", data=sorted_reindexed, color='red', label='Cumulative events captured')
ax = sns.lineplot(x="Decile", y="cum_random_cap", data=sorted_reindexed, color='blue', label='Cumulative population share', ax=ax)
ax = sns.lineplot(x="Decile", y="cum_non_event_cap", data=sorted_reindexed, color='green', label='Cumulative non-events captured', ax=ax)
ax.set(title='Cumulative capture by decile', xlabel='Decile', ylabel='Cumulative share');