In [229]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ann import ANN

from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split #training and testing data split

In [180]:
data = pd.read_csv('telco-customer.csv')

In [181]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [182]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

In [183]:
data.TotalCharges = pd.to_numeric(data.TotalCharges,errors = 'coerce')

In [184]:
data.TotalCharges = data.TotalCharges.fillna(np.mean(data.TotalCharges))

In [185]:
non_numeric = []
numeric     = []
data = data.drop(columns='customerID')
for i in data.columns:
    if data[i].dtype == 'O':
        data[i] = data[i].astype('category')
        non_numeric.append(i)
    else:
        numeric.append(i)


In [186]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
gender              7043 non-null category
SeniorCitizen       7043 non-null int64
Partner             7043 non-null category
Dependents          7043 non-null category
tenure              7043 non-null int64
PhoneService        7043 non-null category
MultipleLines       7043 non-null category
InternetService     7043 non-null category
OnlineSecurity      7043 non-null category
OnlineBackup        7043 non-null category
DeviceProtection    7043 non-null category
TechSupport         7043 non-null category
StreamingTV         7043 non-null category
StreamingMovies     7043 non-null category
Contract            7043 non-null category
PaperlessBilling    7043 non-null category
PaymentMethod       7043 non-null category
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null float64
Churn               7043 non-null category
dtypes: category(16), float64(2), int64(2

In [187]:
data.iloc[:,-1] = [1 if i=='Yes' else 0 for i in data.iloc[:,-1]]

In [188]:
X_numeric = data.loc[:,numeric]
X_non_numeric = data.loc[:,non_numeric]
y = data.iloc[:,-1]

In [189]:
X_numeric['SeniorCitizen'].unique()

array([0, 1])

In [190]:
X_numeric = X_numeric.drop(columns='SeniorCitizen')

In [191]:
X_non_numeric.head()

Unnamed: 0,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
0,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,0
1,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,0
2,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,1
3,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),0
4,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,1


In [192]:
X_non_numeric = pd.get_dummies(X_non_numeric)

In [193]:
X_non_numeric.head()

Unnamed: 0,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,MultipleLines_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,0,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0,0,1,1,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,1
2,1,0,1,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,0,1
3,0,0,1,1,0,1,0,1,0,0,...,0,0,1,0,1,0,1,0,0,0
4,1,1,0,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0


In [194]:
sc = MinMaxScaler(feature_range = (0, 1))
numeric_scaled = sc.fit_transform(X_numeric)
numeric_scaled = pd.DataFrame(numeric_scaled)

  return self.partial_fit(X, y)


In [223]:
dataset = pd.concat((X_non_numeric,numeric_scaled),axis = 1)

In [224]:
X_train = dataset[:5000]
y_train = y[:5000]
X_test  = dataset[5000:]
y_test  = y[5000:]

In [230]:
train_X,test_X, train_Y, test_Y=train_test_split(dataset,y,test_size=0.3,random_state=0,stratify=y)

In [231]:
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.svm import SVC #support vector Machine

In [239]:
model_ensemble = RandomForestClassifier()
model_linear   = LogisticRegression()
model_svc      = SVC(kernel='rbf',C=1,gamma=0.1)

In [240]:
model_ensemble.fit(train_X,train_Y)
model_linear.fit(train_X,train_Y)
model_svm.fit(train_X,train_Y)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [246]:
prediction = model_ensemble.predict(test_X)
eval_metrics = classification_report(test_Y, prediction)
print(eval_metrics)
print('Accuracy for ensemble model is ',metrics.accuracy_score(prediction,test_Y))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1552
           1       1.00      0.98      0.99       561

   micro avg       1.00      1.00      1.00      2113
   macro avg       1.00      0.99      0.99      2113
weighted avg       1.00      1.00      1.00      2113

Accuracy for ensemble model is  0.995740653099858


In [248]:
prediction = model_linear.predict(test_X)
eval_metrics = classification_report(test_Y, prediction)
print(eval_metrics)
print('Accuracy for linear model is ',metrics.accuracy_score(prediction,test_Y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1552
           1       1.00      1.00      1.00       561

   micro avg       1.00      1.00      1.00      2113
   macro avg       1.00      1.00      1.00      2113
weighted avg       1.00      1.00      1.00      2113

Accuracy for linear model is  1.0


In [249]:
prediction = model_svm.predict(test_X)
eval_metrics = classification_report(test_Y, prediction)
print(eval_metrics)
print('Accuracy for rbf SVM is ',metrics.accuracy_score(prediction,test_Y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1552
           1       1.00      1.00      1.00       561

   micro avg       1.00      1.00      1.00      2113
   macro avg       1.00      1.00      1.00      2113
weighted avg       1.00      1.00      1.00      2113

Accuracy for rbf SVM is  1.0


In [250]:
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier() 
model.fit(train_X,train_Y)
prediction5=model.predict(test_X)
print('The accuracy of the KNN is',metrics.accuracy_score(prediction5,test_Y))

The accuracy of the KNN is 0.9034548035967819
