In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import load_iris


In [3]:
data = pd.read_csv('telecom_customer_churn.csv')

In [4]:
for column in data:
    print(f'{column}: {data[column].unique()}')

Customer ID: ['0002-ORFBO' '0003-MKNFE' '0004-TLHLJ' ... '9992-UJOEL' '9993-LHIEB'
 '9995-HOTOH']
Gender: ['Female' 'Male']
Age: [37 46 50 78 75 23 67 52 68 43 47 25 58 32 39 72 79 26 30 22 34 42 64 48
 28 33 31 60 69 20 57 19 38 54 35 51 66 61 24 55 59 71 49 40 21 45 29 62
 76 77 73 41 56 80 63 53 44 70 74 36 27 65]
Married: ['Yes' 'No']
Number of Dependents: [0 3 1 2 4 6 5 9 7 8]
City: ['Frazier Park' 'Glendale' 'Costa Mesa' ... 'Jacumba' 'Carpinteria'
 'Meadow Valley']
Zip Code: [93225 91206 92627 ... 91934 93013 95956]
Latitude: [34.827662 34.162515 33.645672 ... 32.649787 34.441398 39.937017]
Longitude: [-118.999073 -118.203869 -117.922613 ... -116.2237   -119.513163
 -121.058043]
Number of Referrals: [ 2  0  1  3  8  9 10  5  4  7  6 11]
Tenure in Months: [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
Offer: [

In [5]:
data = data.drop(columns = ['Customer ID', 'City', 'Zip Code', 'Longitude', 'Latitude', 'Avg Monthly Long Distance Charges'])

In [6]:
data['MRR'] = (data['Total Revenue'] + data['Total Refunds'] - data['Total Extra Data Charges'] - data['Total Long Distance Charges'])/data['Tenure in Months']

In [7]:
for column in data:
    print(f'{column}: {data[column].unique()}')

Gender: ['Female' 'Male']
Age: [37 46 50 78 75 23 67 52 68 43 47 25 58 32 39 72 79 26 30 22 34 42 64 48
 28 33 31 60 69 20 57 19 38 54 35 51 66 61 24 55 59 71 49 40 21 45 29 62
 76 77 73 41 56 80 63 53 44 70 74 36 27 65]
Married: ['Yes' 'No']
Number of Dependents: [0 3 1 2 4 6 5 9 7 8]
Number of Referrals: [ 2  0  1  3  8  9 10  5  4  7  6 11]
Tenure in Months: [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
Offer: ['None' 'Offer E' 'Offer D' 'Offer A' 'Offer B' 'Offer C']
Phone Service: ['Yes' 'No']
Multiple Lines: ['No' 'Yes' nan]
Internet Service: ['Yes' 'No']
Internet Type: ['Cable' 'Fiber Optic' 'DSL' nan]
Avg Monthly GB Download: [16. 10. 30.  4. 11. 73. 14.  7. 21. 59. 19. 12. 20. 22. 17.  9. nan 52.
 57. 51. 41. 23. 27.  2. 69. 53. 15. 29. 85. 28. 18. 48. 25. 26.  8.  6.
  5. 13. 75. 82. 24. 76. 47. 71. 58. 4

In [8]:
columns_null = []
for column in data:
    for x in data[column]:
        if pd.isna(x):
            columns_null.append(column)
            break

In [9]:
data[columns_null] = data[columns_null].replace(np.nan,'No')

In [10]:
data.columns

Index(['Gender', 'Age', 'Married', 'Number of Dependents',
       'Number of Referrals', 'Tenure in Months', 'Offer', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Internet Type',
       'Avg Monthly GB Download', 'Online Security', 'Online Backup',
       'Device Protection Plan', 'Premium Tech Support', 'Streaming TV',
       'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract',
       'Paperless Billing', 'Payment Method', 'Monthly Charge',
       'Total Charges', 'Total Refunds', 'Total Extra Data Charges',
       'Total Long Distance Charges', 'Total Revenue', 'Customer Status',
       'Churn Category', 'Churn Reason', 'MRR'],
      dtype='object')

In [11]:
df = data.drop(columns=['Gender', 'Age','Number of Referrals','Offer', 'Internet Type', 'Avg Monthly GB Download', 'Paperless Billing', 'Payment Method', 'Monthly Charge','Total Charges', 'Total Refunds', 'Total Extra Data Charges',
       'Total Long Distance Charges', 'Total Revenue','Churn Category', 'Churn Reason'])

In [12]:
for column in df:
    print(f'{column}: {df[column].unique()}')

Married: ['Yes' 'No']
Number of Dependents: [0 3 1 2 4 6 5 9 7 8]
Tenure in Months: [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
Phone Service: ['Yes' 'No']
Multiple Lines: ['No' 'Yes']
Internet Service: ['Yes' 'No']
Online Security: ['No' 'Yes']
Online Backup: ['Yes' 'No']
Device Protection Plan: ['No' 'Yes']
Premium Tech Support: ['Yes' 'No']
Streaming TV: ['Yes' 'No']
Streaming Movies: ['No' 'Yes']
Streaming Music: ['No' 'Yes']
Unlimited Data: ['Yes' 'No']
Contract: ['One Year' 'Month-to-Month' 'Two Year']
Customer Status: ['Stayed' 'Churned' 'Joined']
MRR: [65.92222222 60.26666667 70.2125     ... 46.375      69.06940299
 58.85079365]


In [13]:
df = df.replace(['Yes', 'No'],[1,0])
df = df.replace({'Customer Status': {'Churned': 1, 'Joined': 0, 'Stayed':0}})
df['Number of Dependents'] = df['Number of Dependents'].replace([3, 1, 2, 4, 6, 5, 9, 7, 8], 1)
for column in df:
    print(f'{column}: {df[column].unique()}')

Married: [1 0]
Number of Dependents: [0 1]
Tenure in Months: [ 9  4 13  3 71 63  7 65 54 72  5 56 34  1 45 50 23 55 26 69 37 49 66 67
 20 43 59 12 27  2 25 29 14 35 64 39 40 11  6 30 70 57 58 16 32 33 10 21
 61 15 44 22 24 19 47 62 46 52  8 60 48 28 41 53 68 31 36 17 18 51 38 42]
Phone Service: [1 0]
Multiple Lines: [0 1]
Internet Service: [1 0]
Online Security: [0 1]
Online Backup: [1 0]
Device Protection Plan: [0 1]
Premium Tech Support: [1 0]
Streaming TV: [1 0]
Streaming Movies: [0 1]
Streaming Music: [0 1]
Unlimited Data: [1 0]
Contract: ['One Year' 'Month-to-Month' 'Two Year']
Customer Status: [0 1]
MRR: [65.92222222 60.26666667 70.2125     ... 46.375      69.06940299
 58.85079365]


In [14]:
scaler = MinMaxScaler()

In [15]:
cols_to_scale = ['Tenure in Months','MRR']
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale]) 

In [16]:
for column in df:
    print(f'{column}: {df[column].unique()}')

Married: [1 0]
Number of Dependents: [0 1]
Tenure in Months: [0.11267606 0.04225352 0.16901408 0.02816901 0.98591549 0.87323944
 0.08450704 0.90140845 0.74647887 1.         0.05633803 0.77464789
 0.46478873 0.         0.61971831 0.69014085 0.30985915 0.76056338
 0.35211268 0.95774648 0.50704225 0.67605634 0.91549296 0.92957746
 0.26760563 0.5915493  0.81690141 0.15492958 0.36619718 0.01408451
 0.33802817 0.3943662  0.18309859 0.47887324 0.88732394 0.53521127
 0.54929577 0.14084507 0.07042254 0.4084507  0.97183099 0.78873239
 0.8028169  0.21126761 0.43661972 0.45070423 0.12676056 0.28169014
 0.84507042 0.1971831  0.6056338  0.29577465 0.32394366 0.25352113
 0.64788732 0.85915493 0.63380282 0.71830986 0.09859155 0.83098592
 0.66197183 0.38028169 0.56338028 0.73239437 0.94366197 0.42253521
 0.49295775 0.22535211 0.23943662 0.70422535 0.52112676 0.57746479]
Phone Service: [1 0]
Multiple Lines: [0 1]
Internet Service: [1 0]
Online Security: [0 1]
Online Backup: [1 0]
Device Protection Plan:

In [17]:
reg_data = pd.get_dummies(data=df, columns=['Contract'])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(reg_data.drop(columns=['Customer Status']),reg_data[['Customer Status']],test_size = 0.15)

In [19]:
LogReg = LogisticRegression(solver='lbfgs', max_iter=1000)
LogReg.fit(X_train,y_train.values.ravel())

LogisticRegression(max_iter=1000)

In [20]:
LogReg.score(X_test, y_test)

0.8306527909176916

In [21]:
LogReg.intercept_

array([-1.52218286])

In [22]:
LogReg.coef_

array([[ 0.32488086, -1.55603368, -2.01441321, -1.01539802,  0.21539832,
         0.24742053, -0.58706194, -0.25780914, -0.16663093, -0.50868767,
         0.05441117,  0.12079477, -0.07581146, -0.02212437,  3.29044529,
         1.21993477, -0.03291127, -1.18797287]])

In [23]:
reg_data.columns

Index(['Married', 'Number of Dependents', 'Tenure in Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection Plan', 'Premium Tech Support',
       'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data',
       'Customer Status', 'MRR', 'Contract_Month-to-Month',
       'Contract_One Year', 'Contract_Two Year'],
      dtype='object')

In [24]:
LogReg.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

In [25]:
X_test

Unnamed: 0,Married,Number of Dependents,Tenure in Months,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,MRR,Contract_Month-to-Month,Contract_One Year,Contract_Two Year
4382,1,0,1.000000,1,1,1,1,1,1,0,1,0,0,1,0.790095,0,0,1
4364,1,0,0.112676,0,0,1,0,0,0,0,0,0,0,1,0.140225,0,1,0
2665,1,0,1.000000,1,1,1,1,1,0,0,1,1,1,0,0.862769,0,1,0
5488,1,0,1.000000,0,0,1,1,0,1,1,1,1,1,1,0.459059,0,0,1
1912,0,0,0.169014,1,0,0,0,0,0,0,0,0,0,0,0.065130,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1441,1,0,0.816901,1,1,1,0,0,1,1,1,0,0,1,0.532629,0,0,1
510,1,1,0.704225,0,0,1,0,0,1,1,1,1,1,1,0.400159,0,1,0
379,0,0,0.112676,1,1,1,0,0,0,0,1,1,1,1,0.727965,1,0,0
5167,0,0,0.760563,1,0,1,0,0,0,0,0,0,0,1,0.288033,0,1,0
