In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
df = df[['SeniorCitizen','Dependents','tenure','OnlineSecurity','TechSupport'
           ,'Contract','MonthlyCharges','TotalCharges','Churn']]

In [4]:
df.dropna(subset={"SeniorCitizen"},inplace=True)
df.fillna({"tenure":df['tenure'].mean()},inplace=True)

In [5]:
encs = {}
for col in df.columns:
    if df[col].dtype == 'object':
        encs[col] = LabelEncoder()
        df[col] = encs[col].fit_transform(df[col])

In [6]:
X = df.iloc[:,:8].values
y = df.iloc[:,8].values

In [7]:
from imblearn.under_sampling import InstanceHardnessThreshold
iht = InstanceHardnessThreshold(random_state=0,estimator=LogisticRegression(solver='lbfgs', multi_class='auto'))

In [8]:
X_resampled, y_resampled = iht.fit_resample(X, y)

In [9]:
X_resampled.size

29696

In [10]:
# Splitting the dataset into the Training set and Test set

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, random_state = 0)

In [11]:
# Feature Scaling

sc = StandardScaler()

X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [12]:
# Fitting K-NN to the Training set

classifier = KNeighborsClassifier(n_neighbors = 7, metric = 'minkowski', p = 2)

classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

In [13]:
# Predication accuracy

classifier.score(X_train,y_train)*100

97.60862243179523

In [14]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1456,   12],
       [  59, 1442]], dtype=int64)

In [15]:
# Predication accuracy

classifier.score(X_test,y_test)*100

96.63526244952894

In [16]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[383,   5],
       [ 20, 335]], dtype=int64)

In [17]:
# Fitting Logistic Regression to the Training set  

classifier = LogisticRegression(random_state = 0)

classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
# Predication accuracy

classifier.score(X_train,y_train)*100

97.91175479959581

In [19]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1460,    8],
       [  54, 1447]], dtype=int64)

In [20]:
# Predication accuracy

classifier.score(X_test,y_test)*100

97.1736204576043

In [21]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[385,   3],
       [ 18, 337]], dtype=int64)

In [22]:
# Fitting Naive Bayes to the Training set
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [23]:
# Predication accuracy

classifier.score(X_train,y_train)*100

95.89087234759178

In [24]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1457,   11],
       [ 111, 1390]], dtype=int64)

In [25]:
# Predication accuracy

classifier.score(X_test,y_test)*100

95.69313593539704

In [26]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[385,   3],
       [ 29, 326]], dtype=int64)

In [27]:
# Fitting rbf SVM to the Training set


classifier = SVC(kernel = 'rbf', random_state = 0)

classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=0,
  shrinking=True, tol=0.001, verbose=False)

In [28]:
# Predication accuracy

classifier.score(X_train,y_train)*100

97.91175479959581

In [29]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1461,    7],
       [  55, 1446]], dtype=int64)

In [30]:
# Predication accuracy

classifier.score(X_test,y_test)*100

97.30820995962316

In [31]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[387,   1],
       [ 19, 336]], dtype=int64)

In [32]:
# Fitting Decision Tree to the Training set


classifier = DecisionTreeClassifier(criterion="entropy",max_depth=7)

classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [33]:
# Predication accuracy

classifier.score(X_train,y_train)*100

98.41697541259683

In [34]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1465,    3],
       [  44, 1457]], dtype=int64)

In [35]:
# Predication accuracy

classifier.score(X_test,y_test)*100

96.5006729475101

In [36]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[381,   7],
       [ 19, 336]], dtype=int64)

In [37]:
# Fitting Random Forest to the Training set


classifier = RandomForestClassifier(n_estimators=1000,max_depth=7)

classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [38]:
# Predication accuracy

classifier.score(X_train,y_train)*100

98.41697541259683

In [39]:
y_pred = classifier.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
cm

array([[1468,    0],
       [  47, 1454]], dtype=int64)

In [40]:
# Predication accuracy

classifier.score(X_test,y_test)*100

96.63526244952894

In [41]:
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[383,   5],
       [ 20, 335]], dtype=int64)