In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
df = pd.read_csv("telco_train.csv", index_col=0)
df_test = pd.read_csv("telco_test.csv", index_col=0)

In [4]:
X = df.drop(["customer_id", "churn"], axis=1)
#y = pd.reset_index(df.churn)
y = df.churn.reset_index().churn

X_test = df_test.drop(["customer_id", "churn"], axis=1)
y_test = df_test.churn.reset_index().churn

In [5]:
X = X.astype(np.float64)
X_test = X_test.astype(np.float64)

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [7]:
#Split for Cross Validation

from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
kf.get_n_splits(X)
print(kf)


KFold(n_splits=10, random_state=None, shuffle=False)


### Logistic Regression Models

In [8]:
# Simple Logistic Regression

from sklearn.linear_model import LogisticRegression 

logreg_basic = LogisticRegression(solver="lbfgs", random_state=0)
logreg_basic.fit(X, y)
print("Accuracy of Logistic Regression on training set: {:.2f}".format(logreg_basic.score(X, y)))
print("Accuracy of Logistic Regression on test set: {:.2f}".format(logreg_basic.score(X_test, y_test)))

# Generalized Error

gen_error_lrb = list()

for train_index, test_index in kf.split(X):
    logreg_basic_gen = LogisticRegression(solver="lbfgs", random_state=0)
    logreg_basic_gen.fit(X[train_index], y[train_index])
    gen_error_lrb.append(logreg_basic_gen.score(X[test_index], y[test_index]))
    
print(np.mean(gen_error_lrb))




Accuracy of Logistic Regression on training set: 0.80
Accuracy of Logistic Regression on test set: 0.82
0.797651575274303


In [10]:
# Ridge Logistic Regression

from sklearn.linear_model import LogisticRegression 

logreg_ridge = LogisticRegression(solver="lbfgs", random_state=0, penalty="l2")
logreg_ridge.fit(X, y)
print("Accuracy of Ridge Logistic Regression on training set: {:.2f}".format(logreg_ridge.score(X, y)))
print("Accuracy of Ridge Logistic Regression on test set: {:.2f}".format(logreg_ridge.score(X_test, y_test)))


# Estimating Ridge Parameter

best_c = 0
best_score = 0

for i in range(1,101):
    c = (i/100)
    score_list = list()
    
    for train_index, test_index in kf.split(X):
        temp_model = LogisticRegression(solver="lbfgs", random_state=0, penalty="l2", C=c)
        temp_model.fit(X[train_index], y[train_index])
        score_list.append(temp_model.score(X[test_index], y[test_index]))
    
    if np.mean(score_list)>best_score:
        best_score=np.mean(score_list)
        best_c=c

print(best_c)

# Generalized Error

gen_error_lrr = list()

for train_index, test_index in kf.split(X):
    logreg_ridge_gen = LogisticRegression(solver="lbfgs", random_state=0, penalty="l2", C=best_c)
    logreg_ridge_gen.fit(X[train_index], y[train_index])
    gen_error_lrr.append(logreg_ridge_gen.score(X[test_index], y[test_index]))
    
print(np.mean(gen_error_lrr))

Accuracy of Ridge Logistic Regression on training set: 0.80
Accuracy of Ridge Logistic Regression on test set: 0.82
0.03
0.7996047642442337


In [11]:
# Lasso Logistic Regression

from sklearn.linear_model import LogisticRegression 

logreg_lasso = LogisticRegression(solver='saga', random_state=0, penalty="l1")
logreg_lasso.fit(X, y)
print("Accuracy of Lasso Logistic Regression on training set: {:.2f}".format(logreg_lasso.score(X, y)))
print("Accuracy of Lasso Logistic Regression on test set: {:.2f}".format(logreg_lasso.score(X_test, y_test)))

# Estimating Lasso Parameter

best_c = 0
best_score = 0

for i in range(1,101):
    c = (i/100)
    score_list = list()
    
    for train_index, test_index in kf.split(X):
        temp_model = LogisticRegression(solver="saga", random_state=0, penalty="l1", C=c)
        temp_model.fit(X[train_index], y[train_index])
        score_list.append(temp_model.score(X[test_index], y[test_index]))
    
    if np.mean(score_list)>best_score:
        best_score=np.mean(score_list)
        best_c=c

print(best_c)

# Generalized Error

gen_error_lrl = list()

for train_index, test_index in kf.split(X):
    logreg_lasso_gen = LogisticRegression(solver="saga", random_state=0, penalty="l2", C=best_c)
    logreg_lasso_gen.fit(X[train_index], y[train_index])
    gen_error_lrl.append(logreg_lasso_gen.score(X[test_index], y[test_index]))
    
print(np.mean(gen_error_lrl))

Accuracy of Lasso Logistic Regression on training set: 0.80
Accuracy of Lasso Logistic Regression on test set: 0.82
0.06
0.7987172946348714


### LDA, QDA

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

#LDA

lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
print("Accuracy of LDA on training set: {:.2f}".format(lda.score(X, y)))
print("Accuracy of LDA on test set: {:.2f}".format(lda.score(X_test, y_test)))

# Generalized Error

gen_error_lda = list()

for train_index, test_index in kf.split(X):
    lda_gen = LinearDiscriminantAnalysis()
    lda_gen.fit(X[train_index], y[train_index])
    gen_error_lda.append(lda_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on LDA: ",np.mean(gen_error_lda))





#QDA


qda = QuadraticDiscriminantAnalysis()
qda.fit(X, y)
print("Accuracy of QDA on traning set: {:.2f}".format(qda.score(X, y)))
print("Accuracy of QDA on test set: {:.2f}".format(qda.score(X_test, y_test)))

# Generalized Error

gen_error_qda = list()

for train_index, test_index in kf.split(X):
    qda_gen = QuadraticDiscriminantAnalysis()
    qda_gen.fit(X[train_index], y[train_index])
    gen_error_qda.append(qda_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on QDA: ",np.mean(gen_error_qda))

Accuracy of LDA on training set: 0.80
Accuracy of LDA on test set: 0.82
Generalized Error on LDA:  0.7941013819079652
Accuracy of QDA on traning set: 0.77
Accuracy of QDA on test set: 0.77
Generalized Error on QDA:  0.7594884924983938


## Decision Tree

In [8]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier().fit(X, y)

print("Accuracy of Decision Tree classifier on training set: {:.2f}".format(dtc.score(X, y)))
print("Accuracy of Decision Tree classifier on test set: {:.2f}".format(dtc.score(X_test, y_test)))



# Generalized Error

gen_error_dt = list()

for train_index, test_index in kf.split(X):
    dt_gen = DecisionTreeClassifier()
    dt_gen.fit(X[train_index], y[train_index])
    gen_error_dt.append(dt_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on Decision Tree: ",np.mean(gen_error_dt))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.73


## K-Nearest Neighbors

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier().fit(X, y)

print("Accuracy of KNN classifier on training set: {:.2f}".format(knn.score(X, y)))
print("Accuracy of KNN classifier on test set: {:.2f}".format(knn.score(X_test, y_test)))


# Generalized Error

gen_error_knn = list()

for train_index, test_index in kf.split(X):
    knn_gen = KNeighborsClassifier()
    knn_gen.fit(X[train_index], y[train_index])
    gen_error_knn.append(knn_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on K Nearest Neighbour: ",np.mean(gen_error_knn))

Accuracy of KNN classifier on training set: 0.83
Accuracy of KNN classifier on test set: 0.77
Generalized Error on K Nearest Neighbour:  0.7600207223209


## Gaussian Naive Bayes

In [10]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB().fit(X, y)

print("Accuracy of GNB classifier on training set: {:.2f}".format(gnb.score(X, y)))
print("Accuracy of GNB classifier on test set: {:.2f}".format(gnb.score(X_test, y_test)))


# Generalized Error

gen_error_gnb = list()

for train_index, test_index in kf.split(X):
    gnb_gen = GaussianNB()
    gnb_gen.fit(X[train_index], y[train_index])
    gen_error_gnb.append(gnb_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on Gaussian Naive Bayes: ",np.mean(gen_error_gnb))

Accuracy of GNB classifier on training set: 0.76
Accuracy of GNB classifier on test set: 0.77
Generalized Error on Gaussian Naive Bayes:  0.7568295478880869


## Support Vector Machine

In [16]:
from sklearn.svm import SVC 

svm = SVC().fit(X, y)

print("Accuracy of SVM classifier on training set: {:.2f}".format(svm.score(X, y)))
print("Accuracy of SVM classifier on test set: {:.2f}".format(svm.score(X_test, y_test)))


# Generalized Error

gen_error_svm = list()

for train_index, test_index in kf.split(X):
    svm_gen = SVC()
    svm_gen.fit(X[train_index], y[train_index])
    gen_error_svm.append(svm_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on Support Vector Machine: ",np.mean(gen_error_svm))

Accuracy of SVM classifier on training set: 0.82
Accuracy of SVM classifier on test set: 0.81




Generalized Error on Support Vector Machine:  0.7957015355932631


## RBF SVM

In [14]:
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF

# rbf = GaussianProcessClassifier(1.0 * RBF(1.0))
# rbf.fit(X, y)

# print("Accuracy of RBF SVM classifier on training set: {:.2f}".format(rbf.score(X, y)))
# print("Accuracy of RBF SVM classifier on test set: {:.2f}".format(rbf.score(X_test, y_test)))

'from sklearn.gaussian_process import GaussianProcessClassifier\nfrom sklearn.gaussian_process.kernels import RBF\n\nrbf = GaussianProcessClassifier(1.0 * RBF(1.0))\nrbf.fit(X, y)\n\nprint("Accuracy of RBF SVM classifier on training set: {:.2f}".format(rbf.score(X, y)))\nprint("Accuracy of RBF SVM classifier on test set: {:.2f}".format(rbf.score(X_test, y_test)))'

## Random Forests

In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=10, n_estimators=10, max_features=1)
rfc.fit(X, y)

print("Accuracy of Random Forests classifier on training set: {:.2f}".format(rfc.score(X, y)))
print("Accuracy of Random Forests classifier on test set: {:.2f}".format(rfc.score(X_test, y_test)))


# Generalized Error

gen_error_rf = list()

for train_index, test_index in kf.split(X):
    rf_gen = RandomForestClassifier()
    rf_gen.fit(X[train_index], y[train_index])
    gen_error_rf.append(rf_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on Random Forests: ",np.mean(gen_error_rf))

Accuracy of Random Forests classifier on training set: 0.87
Accuracy of Random Forests classifier on test set: 0.80




Generalized Error on Random Forests:  0.7777729488681456




## AdaBoost

In [20]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X, y)

print("Accuracy of AdaBoost classifier on training set: {:.2f}".format(ada.score(X, y)))
print("Accuracy of AdaBoost classifier on test set: {:.2f}".format(ada.score(X_test, y_test)))


# Generalized Error

gen_error_ab = list()

for train_index, test_index in kf.split(X):
    ab_gen = AdaBoostClassifier()
    ab_gen.fit(X[train_index], y[train_index])
    gen_error_ab.append(ab_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on AdaBoost: ",np.mean(gen_error_ab))

Accuracy of AdaBoost classifier on training set: 0.80
Accuracy of AdaBoost classifier on test set: 0.81
Generalized Error on AdaBoost:  0.7964094957358628


## Neural Net

In [19]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=1)
mlp.fit(X, y)

print("Accuracy of MLP classifier on training set: {:.2f}".format(mlp.score(X, y)))
print("Accuracy of MLP classifier on test set: {:.2f}".format(mlp.score(X_test, y_test)))


# Generalized Error

gen_error_nn = list()

for train_index, test_index in kf.split(X):
    nn_gen = MLPClassifier(alpha=1)
    nn_gen.fit(X[train_index], y[train_index])
    gen_error_nn.append(nn_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on AdaBoost: ",np.mean(gen_error_nn))

Accuracy of MLP classifier on training set: 0.81
Accuracy of MLP classifier on test set: 0.82
Generalized Error on AdaBoost:  0.7997817542798835


### Tuning Hyperparameter of Neural Net to Maximise Generalized Score

In [20]:
# Estimating Hyper-Parameter

best_alpha = 0
best_score = 0

for i in range(1,6):
    c = (i/5)
    score_list = list()
    
    for train_index, test_index in kf.split(X):
        temp_model = MLPClassifier(alpha=c)
        temp_model.fit(X[train_index], y[train_index])
        score_list.append(temp_model.score(X[test_index], y[test_index]))
    
    if np.mean(score_list)>best_score:
        best_score=np.mean(score_list)
        best_alpha=c

print(best_alpha)
print(best_score)



0.6
0.7992495244573775


In [22]:
# Generalized Error

gen_error_nn = list()

for train_index, test_index in kf.split(X):
    nn_gen = MLPClassifier(alpha=0.6)
    nn_gen.fit(X[train_index], y[train_index])
    gen_error_nn.append(nn_gen.score(X[test_index], y[test_index]))
    
print("Generalized Error on MLP with tuned Hyperparameter: ",np.mean(gen_error_nn))

Generalized Error on MLP with tuned Hyperparameter:  0.79641044052253
