In [None]:
import pickle

import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
df = pd.read_csv("tel_churn.csv")

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [None]:
# Drop every index artefact left behind by saving the CSV with its index
# ("Unnamed: 0", "Unnamed: 0.1", ...). Dropping only one of them leaves a
# row-id column among the features. Filtering by prefix is also idempotent,
# so re-running this cell does not raise KeyError.
index_artefacts = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=index_artefacts)

In [None]:
# Feature matrix: every column except the target.
x = df.drop(columns=['Churn'])
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [None]:
# Target vector: the Churn column as a Series.
y = df.loc[:, 'Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

**Train Test Split**

In [None]:
# Hold out 20 % of the rows for evaluation.
# random_state makes the split (and every score below) reproducible;
# stratify=y keeps the churn ratio identical in train and test, which matters
# because the target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=100, stratify=y
)

# **Decision Tree Classifier**

In [None]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [None]:
model_dt.fit(x_train,y_train)

In [None]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 0, ..., 1, 0, 1])

In [None]:
# Mean accuracy of the tree on the held-out split.
model_dt.score(x_test, y_test)

0.7818052594171997

In [None]:
# Per-class precision / recall / F1 for the tree's test predictions.
print(
    classification_report(y_test, y_pred, labels=[0, 1])
)

              precision    recall  f1-score   support

           0       0.81      0.91      0.86      1019
           1       0.65      0.45      0.53       388

    accuracy                           0.78      1407
   macro avg       0.73      0.68      0.70      1407
weighted avg       0.77      0.78      0.77      1407



As we can see, the accuracy is quite low. Moreover, because this is an imbalanced dataset, accuracy is a misleading metric for judging the model.
Hence, we need to check recall, precision and F1 score for the minority class, and it is evident that all three are too low for Class 1, i.e. churned customers.
Hence, we move ahead and apply SMOTEENN (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning).

In [None]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [None]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [None]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [None]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9472774416594641
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       524
           1       0.93      0.97      0.95       633

    accuracy                           0.95      1157
   macro avg       0.95      0.94      0.95      1157
weighted avg       0.95      0.95      0.95      1157



In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
print(
    metrics.confusion_matrix(yr_test, yr_predict)
)

[[479  45]
 [ 16 617]]


# Random Forest Classifier

Now we can see much better results for the Decision Tree after SMOTEENN: an accuracy of about 95 % (0.947 in the output above), together with very good recall, precision and F1 score for the minority class.
Let's try some other classifiers.

In [None]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [None]:
model_rf.fit(x_train,y_train)

In [None]:
y_pred=model_rf.predict(x_test)

In [None]:
model_rf.score(x_test,y_test)

0.7995735607675906

In [None]:
# Per-class precision / recall / F1 for the forest's test predictions.
print(
    classification_report(y_test, y_pred, labels=[0, 1])
)

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1019
           1       0.70      0.48      0.57       388

    accuracy                           0.80      1407
   macro avg       0.76      0.70      0.72      1407
weighted avg       0.79      0.80      0.79      1407



In [None]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [None]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [None]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [None]:
model_rf_smote.fit(xr_train1,yr_train1)

In [None]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [None]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [None]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9294117647058824
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       558
           1       0.91      0.97      0.94       632

    accuracy                           0.93      1190
   macro avg       0.93      0.93      0.93      1190
weighted avg       0.93      0.93      0.93      1190



In [None]:
# Confusion matrix (rows: true class, columns: predicted class).
print(
    metrics.confusion_matrix(yr_test1, yr_predict1)
)

[[495  63]
 [ 21 611]]


With the RF classifier we also get quite good results; in this run, however, its accuracy (0.93) is slightly below that of the Decision Tree (0.95).

# **Naive Bayes**

In [None]:
from sklearn.naive_bayes import GaussianNB 

In [None]:
model_nv = GaussianNB()  
model_nv.fit(x_train, y_train) 

In [None]:
y_pred = model_nv.predict(x_test)  

In [None]:
cm = confusion_matrix(y_test, y_pred)  

In [None]:
cm

array([[663, 356],
       [ 77, 311]])

In [None]:
# Mean accuracy of Naive Bayes on the held-out split.
model_nv.score(x_test, y_test)

0.6922530206112296

In [None]:
sm = SMOTEENN()
X_resampled2, y_resampled2 = sm.fit_resample(x,y)

In [None]:
xr_train2,xr_test2,yr_train2,yr_test2=train_test_split(X_resampled2, y_resampled2,test_size=0.2)

In [None]:
model_nv_smote = GaussianNB()

In [None]:
model_nv_smote.fit(xr_train2,yr_train2)

In [None]:
yr_predict2 = model_nv_smote.predict(xr_test2)
model_score_r2 = model_nv_smote.score(xr_test2, yr_test2)
print(model_score_r2)
print(metrics.classification_report(yr_test2, yr_predict2))

0.9008403361344538
              precision    recall  f1-score   support

           0       0.92      0.86      0.89       535
           1       0.89      0.94      0.91       655

    accuracy                           0.90      1190
   macro avg       0.90      0.90      0.90      1190
weighted avg       0.90      0.90      0.90      1190



# **Logistic Regression**

In [None]:
from sklearn import linear_model

In [None]:
model_lr = linear_model.LogisticRegression()
model_lr.fit(x_train,y_train)

In [None]:
y_pred=model_lr.predict(x_test)
y_pred

array([0, 1, 0, ..., 0, 0, 1])

In [None]:
# Mean accuracy of logistic regression on the held-out split.
model_lr.score(x_test, y_test)

0.8017057569296375

In [None]:

# Per-class precision / recall / F1 for the logistic regression predictions.
print(
    classification_report(y_test, y_pred, labels=[0, 1])
)

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1019
           1       0.70      0.50      0.58       388

    accuracy                           0.80      1407
   macro avg       0.76      0.71      0.73      1407
weighted avg       0.79      0.80      0.79      1407



In [None]:
sm = SMOTEENN()
X_resampled3, y_resampled3 = sm.fit_resample(x,y)

In [None]:

xr_train3,xr_test3,yr_train3,yr_test3=train_test_split(X_resampled3, y_resampled3,test_size=0.2)

In [None]:
model_lr_smote=linear_model.LogisticRegression()

In [None]:
model_lr_smote.fit(xr_train3,yr_train3)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
yr_predict3 = model_lr_smote.predict(xr_test3)

In [None]:
model_score_r3 = model_lr_smote.score(xr_test3, yr_test3)

In [None]:
print(model_score_r3)
print(metrics.classification_report(yr_test3, yr_predict3))

0.9375534644995723
              precision    recall  f1-score   support

           0       0.94      0.93      0.93       539
           1       0.94      0.95      0.94       630

    accuracy                           0.94      1169
   macro avg       0.94      0.94      0.94      1169
weighted avg       0.94      0.94      0.94      1169



# **K-Nearest Neighbor(KNN)**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
model_knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )  

In [None]:
model_knn.fit(x_train,y_train)

In [None]:
y_pred=model_knn.predict(x_test)

In [None]:
model_knn.score(x_test,y_test)

0.7661691542288557

In [None]:
# Per-class precision / recall / F1 for the KNN test predictions.
print(
    classification_report(y_test, y_pred, labels=[0, 1])
)

              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1019
           1       0.61      0.43      0.51       388

    accuracy                           0.77      1407
   macro avg       0.71      0.66      0.68      1407
weighted avg       0.75      0.77      0.75      1407



In [None]:
sm = SMOTEENN()
X_resampled4, y_resampled4 = sm.fit_resample(x,y)

In [None]:
xr_train4,xr_test4,yr_train4,yr_test4=train_test_split(X_resampled4, y_resampled4,test_size=0.2)

In [None]:
model_knn_smote=KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2 )

In [None]:
model_knn_smote.fit(xr_train4,yr_train4)

In [None]:
yr_predict4 = model_knn_smote.predict(xr_test4)

In [None]:
predict = model_knn_smote.predict(xr_test4)

In [None]:
# Report accuracy and per-class metrics for the SMOTEENN-trained KNN.
# accuracy_score reads yr_predict4 rather than the global `predict`, because
# `predict` is rebound to a function further down; re-running this cell after
# that definition would otherwise fail.
model_score_r4 = model_knn_smote.score(xr_test4, yr_test4)
print(model_score_r4)
print(metrics.classification_report(yr_test4, yr_predict4))
print(metrics.accuracy_score(yr_test4, yr_predict4))

0.9481733220050977
              precision    recall  f1-score   support

           0       0.96      0.92      0.94       510
           1       0.94      0.97      0.95       667

    accuracy                           0.95      1177
   macro avg       0.95      0.95      0.95      1177
weighted avg       0.95      0.95      0.95      1177

0.9481733220050977


In [None]:

def predict(SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,
              InternetService,OnlineSecurity,OnlineBackup,TechSupport,StreamingTV,StreamingMovies,
              Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges):
    """Return the churn class predicted by ``model_knn_smote`` for one customer.

    The sixteen arguments are packed, in signature order, into a single-row
    float64 array and passed to ``model_knn_smote.predict``.

    Returns:
        int: the predicted class label for that row.

    NOTE(review): ``model_knn_smote`` is fitted on the one-hot encoded frame
    ``x``, which appears to have more columns than the sixteen values assembled
    here; scikit-learn would reject the call with a feature-count error.
    Confirm the encoding expected by the caller before using this function.
    """
    # `features` instead of `input` so the builtin is not shadowed.
    features = np.array([[SeniorCitizen,Dependents,tenure,PhoneService,MultipleLines,
                          InternetService,OnlineSecurity,OnlineBackup,TechSupport,StreamingTV,StreamingMovies,
                          Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges]]).astype(np.float64)
    prediction = model_knn_smote.predict(features)

    # predict() returns a length-1 array; index it explicitly because calling
    # int() on a non-0-d array is deprecated in recent NumPy releases.
    return int(prediction[0])

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# A linear-kernel SVC converges very slowly on unscaled features (charge
# columns in the thousands next to 0/1 dummies), so standardise the inputs
# first. The pipeline exposes the same fit / predict interface as the bare SVC.
clf = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# fitting x samples and y classes
clf.fit(x_train, y_train)



KeyboardInterrupt: ignored

In [None]:
# Predicted labels from the SVM for the held-out rows.
y_pred = clf.predict(x_test)

NameError: ignored

# **Saving the Model**

In [None]:
filename = 'model.sav'

In [None]:
pickle.dump(model_knn_smote, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))
with open ('model.sav','rb') as file:
  data = pickle.load(file)

In [None]:
model_score_r4 = load_model.score(xr_test4, yr_test4)

In [None]:
model_score_r4*100

94.81733220050977

In [None]:
import joblib

# Save the model to disk a second time, using joblib.
filename = 'model2.sav'
joblib.dump(value=model_knn_smote, filename=filename)

['model2.sav']