In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the customer-churn dataset and preview its first five rows.
churnData = pd.read_csv('files_for_lab/customer_churn.csv')
churnData.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# Frequency of each distinct MonthlyCharges value, most common first.
display(churnData['MonthlyCharges'].value_counts())

20.05     61
19.85     45
19.95     44
19.90     44
20.00     43
          ..
114.75     1
103.60     1
113.40     1
57.65      1
113.30     1
Name: MonthlyCharges, Length: 1585, dtype: int64

In [5]:
# Class balance of the target column: number of rows per Churn label.
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [6]:
# Pull the Churn column out as the prediction target and display it.
target = churnData['Churn']
target

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [7]:
# Count missing values in the target.
target.isna().sum()

0

In [8]:
# Independent variables used as model inputs.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges']
indep_df = churnData[feature_columns]

In [9]:
# Count missing values per column across the whole frame.
churnData.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
# Disabled step: if re-enabled, this would replace +inf/-inf with NaN in
# churnData in place. Remove this cell or re-enable it deliberately.
# churnData.replace([np.inf, -np.inf], np.nan, inplace=True)

In [11]:
# Standardise the predictors with StandardScaler.
# NOTE(review): the scaler is fit on every row before the train/test split,
# so test-set statistics influence the scaling.
X = indep_df
y = churnData[['Churn']]

transformer = StandardScaler()
scaled_x = transformer.fit_transform(X)
# The scaled result is a plain NumPy array, not a DataFrame.
print(X)
print(y)
scaled_x


      tenure  SeniorCitizen  MonthlyCharges
0          1              0           29.85
1         34              0           56.95
2          2              0           53.85
3         45              0           42.30
4          2              0           70.70
...      ...            ...             ...
7038      24              0           84.80
7039      72              0          103.20
7040      11              0           29.60
7041       4              1           74.40
7042      66              0          105.65

[7043 rows x 3 columns]
     Churn
0       No
1       No
2      Yes
3       No
4      Yes
...    ...
7038    No
7039    No
7040    No
7041   Yes
7042    No

[7043 rows x 1 columns]


array([[-1.27744458, -0.43991649, -1.16032292],
       [ 0.06632742, -0.43991649, -0.25962894],
       [-1.23672422, -0.43991649, -0.36266036],
       ...,
       [-0.87024095, -0.43991649, -1.1686319 ],
       [-1.15528349,  2.27315869,  0.32033821],
       [ 1.36937906, -0.43991649,  1.35896134]])

In [12]:
# Peak-to-peak range (max minus min) over all values of the scaled array.
np.ptp(scaled_x)

3.819018511812418

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_x,
    y,
    test_size=0.3,
    random_state=100,
)

In [14]:
# Baseline model: logistic regression on the scaled features with the
# original class balance.
#
# The labels may arrive as a one-column DataFrame; np.ravel flattens them to
# the 1-D shape scikit-learn expects (a column vector triggers a
# DataConversionWarning, which warnings.filterwarnings('ignore') hides).
# multi_class is omitted: it is deprecated in recent scikit-learn and the
# default already resolves to one-vs-rest for a binary target.
classification = LogisticRegression(random_state=0, solver='lbfgs')
classification.fit(X_train, np.ravel(y_train))

y_pred = classification.predict(X_test)

test_accuracy = classification.score(X_test, np.ravel(y_test))
print("The accuracy of the logistic_regression model is: %4.2f " % test_accuracy)

The accuracy of the logistic_regression model is: 0.78 


In [15]:
# Label counts in y, used as the reference for the blind-guess accuracy.
y.value_counts()

Churn
No       5174
Yes      1869
dtype: int64

In [16]:
# Accuracy of always predicting the most frequent class. It is computed from
# the data rather than from hand-typed counts: the earlier literal 5163 did
# not match the 5174 "No" rows that value_counts() reports.
majority_share = churnData['Churn'].value_counts(normalize=True).max()
print("The accuracy of a blind guess is: %4.2f " % majority_share)

The accuracy of a blind guess is: 0.73 


In [17]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the predicted and the held-out labels.
print("The kappa of the logistic regression model is: {:4.2f} ".format(cohen_kappa_score(y_pred, y_test)))

The kappa of the logistic regression model is: 0.39 


# Oversampling using SMOTE (imblearn.over_sampling.SMOTE)

In [18]:

# Pairwise correlation between the three predictors.
X.corr()


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
tenure,1.0,0.016567,0.2479
SeniorCitizen,0.016567,1.0,0.220173
MonthlyCharges,0.2479,0.220173,1.0


In [19]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE so that both labels end up with
# the same number of rows.
# NOTE(review): resampling is applied to the full dataset before the
# train/test split, so synthetic points derived from rows that land in the
# test set can leak into training; treat the downstream scores as optimistic.
smote = SMOTE(random_state=100)  # seeded so the synthetic samples are reproducible
transformer = StandardScaler().fit(X)
X2 = transformer.transform(X)
y = churnData['Churn']
print(y.value_counts())

# fit_resample is the supported method name; the old fit_sample alias was
# removed from imbalanced-learn.
X_sm, y_sm = smote.fit_resample(X2, y)

# Store the labels as a single-column DataFrame (column 0); np.ravel accepts
# Series, DataFrame or array input alike.
y_sm = pd.DataFrame(data=np.ravel(y_sm))

print(y_sm.value_counts())

No     5174
Yes    1869
Name: Churn, dtype: int64
Yes    5174
No     5174
dtype: int64


In [20]:
from sklearn.model_selection import train_test_split

# 70/30 split of the SMOTE-resampled data, with the same seed as the baseline split.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.3,
    random_state=100,
)

In [21]:
# Logistic regression on the SMOTE-resampled training data.
# Labels are flattened to 1-D with np.ravel because they may be held in a
# one-column DataFrame, which scikit-learn would otherwise treat as a column
# vector. multi_class is omitted: it is deprecated in recent scikit-learn and
# the default already resolves to one-vs-rest for a binary target.
classification = LogisticRegression(random_state=0, solver='lbfgs')
classification.fit(X_train, np.ravel(y_train))
y_pred = classification.predict(X_test)
test_accuracy = classification.score(X_test, np.ravel(y_test))
print("The accuracy of the logistic_regression model after oversampling is: %4.2f " % test_accuracy)

The accuracy of the logistic_regression model after oversampling is: 0.75 


In [22]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the predicted and the held-out labels.
print("The kappa of the logistic regression model is: {:4.2f} ".format(cohen_kappa_score(y_pred, y_test)))

The kappa of the logistic regression model is: 0.50 


# UnderSampling using TomekLinks

In [23]:
# Inspect the unscaled feature frame that the TomekLinks step below consumes.
X

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
0,1,0,29.85
1,34,0,56.95
2,2,0,53.85
3,45,0,42.30
4,2,0,70.70
...,...,...,...
7038,24,0,84.80
7039,72,0,103.20
7040,11,0,29.60
7041,4,1,74.40


In [26]:
# Inspect the label series that the TomekLinks step consumes.
y

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object

In [24]:
from imblearn.under_sampling import TomekLinks

# Undersample with TomekLinks, removing only majority-class rows.
# sampling_strategy is passed by keyword because current imbalanced-learn
# makes the constructor arguments keyword-only. fit_resample replaces the
# removed fit_sample alias.
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)
# Store the labels as a single-column DataFrame (column 0); np.ravel accepts
# Series, DataFrame or array input alike.
y_tl = pd.DataFrame(data=np.ravel(y_tl))
y_tl.value_counts()

No     4711
Yes    1869
dtype: int64

In [25]:
# Second TomekLinks pass over the already-reduced data.
# fit_resample replaces the removed fit_sample alias. Labels are flattened to
# 1-D on the way in, so the sampler does not receive a column vector.
X_tl2, y_tl2 = tl.fit_resample(X_tl, np.ravel(y_tl))
# np.ravel copes with whichever container the sampler returns for the labels.
y_tl2 = pd.DataFrame(data=np.ravel(y_tl2))
y_tl2.value_counts()

No     4575
Yes    1869
dtype: int64

In [27]:
# Standardise the features left after the two TomekLinks passes.
transformer = StandardScaler()
X3 = transformer.fit_transform(X_tl2)

In [28]:
from sklearn.model_selection import train_test_split

# Split the standardised features (X3) rather than the raw X_tl2, so this
# experiment trains on scaled inputs like the baseline and SMOTE runs.
X_train, X_test, y_train, y_test = train_test_split(X3, y_tl2, test_size=0.3, random_state=100)

In [29]:
# Logistic regression on the TomekLinks-undersampled training data.
# Labels are flattened to 1-D with np.ravel because they may be held in a
# one-column DataFrame, which scikit-learn would otherwise treat as a column
# vector. multi_class is omitted: it is deprecated in recent scikit-learn and
# the default already resolves to one-vs-rest for a binary target.
classification = LogisticRegression(random_state=0, solver='lbfgs')
classification.fit(X_train, np.ravel(y_train))
y_pred = classification.predict(X_test)
test_accuracy = classification.score(X_test, np.ravel(y_test))
# The message now says "undersampling": this run uses TomekLinks, not SMOTE.
print("The accuracy of the logistic_regression model after undersampling is: %4.2f " % test_accuracy)

The accuracy of the logistic_regression model after oversampling is: 0.80 


In [30]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the predicted and the held-out labels.
print("The kappa of the logistic regression model is: {:4.2f} ".format(cohen_kappa_score(y_pred, y_test)))

The kappa of the logistic regression model is: 0.46 
