In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('Train_Churn_binary.csv')
test = pd.read_csv('Test_Churn.csv')

full_data = pd.concat([train, test])

churn = test['Churn']

full_data.drop(['Customer Number'], axis=1, inplace=True)

full_data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
#### Early manipulation ####

full_data['TotalCharges'] = full_data['TotalCharges'].replace(to_replace = " ",value = 0)
full_data['TotalCharges'] = full_data['TotalCharges'].astype(float)

#replace 'No internet service' option to No
internet_columns = ['OnlineSecurity','OnlineBackup','DeviceProtection',
                   'TechSupport','StreamingTV','StreamingMovies']

for i in internet_columns:
    full_data[i] = full_data[i].replace({'No internet service':'No'})


In [4]:
#### Create features ####

full_data['ratio'] = full_data.tenure * full_data.MonthlyCharges / full_data.TotalCharges
full_data['ratio'].fillna(0, inplace=True)

bins = np.array([129.7,86.5,57.6,38.4,25.6,17.1,11.4,7.6,5.0625,3.375,2.25,1.5,1])
digitized = np.digitize(full_data.tenure, bins)
full_data['digitized'] = digitized

full_data['twelver'] = full_data.tenure % 12 == 0
print(full_data.twelver)

full_data['elevener'] = full_data.tenure % 11 == 0 

#movies[(movies.duration >= 200) & (movies.genre == 'Drama')]
#https://www.ritchieng.com/pandas-multi-criteria-filtering/

0       False
1       False
2       False
3        True
4        True
5       False
6       False
7       False
8       False
9       False
10      False
11      False
12      False
13      False
14      False
15       True
16      False
17      False
18      False
19       True
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29       True
        ...  
1731    False
1732    False
1733     True
1734    False
1735    False
1736    False
1737    False
1738    False
1739    False
1740    False
1741    False
1742    False
1743     True
1744     True
1745    False
1746    False
1747    False
1748     True
1749    False
1750    False
1751    False
1752    False
1753     True
1754    False
1755    False
1756     True
1757    False
1758    False
1759    False
1760    False
Name: twelver, Length: 7043, dtype: bool


In [5]:
#### Dummy time ####

full_data = pd.get_dummies(data=full_data, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'],
    drop_first=False)

#full_data.info()

In [6]:
#full_data = full_data.drop(full_data['gender_Male'])

final_train = full_data.head(5282)
final_test = full_data.tail(1761)

final_train.to_csv('final_train.csv')
final_test.to_csv('final_test.csv')

full_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'ratio', 'digitized', 'twelver', 'elevener', 'gender_Female',
       'gender_Male', 'Partner_No', 'Partner_Yes', 'Dependents_No',
       'Dependents_Yes', 'PhoneService_No', 'PhoneService_Yes',
       'MultipleLines_No', 'MultipleLines_No phone service',
       'MultipleLines_Yes', 'InternetService_DSL',
       'InternetService_Fiber optic', 'InternetService_No',
       'OnlineSecurity_No', 'OnlineSecurity_Yes', 'OnlineBackup_No',
       'OnlineBackup_Yes', 'DeviceProtection_No', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_Yes', 'StreamingTV_No',
       'StreamingTV_Yes', 'StreamingMovies_No', 'StreamingMovies_Yes',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'Paym

In [7]:
columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
           
           'ratio',
           'digitized',
           'twelver',
           'elevener',
           
           #'gender_Female', 'gender_Male', 
           #'Dependents_Yes', 'PhoneService_No',
           #'PhoneService_Yes', 'MultipleLines_No phone service',
           # 'StreamingMovies_Yes',
           
       'Partner_No', 'Partner_Yes',
       'Dependents_No', 
       'MultipleLines_No', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
        'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']



train_data = final_train[columns]
train_data = pd.DataFrame(train_data).drop(columns='Churn')
test_data = final_test[columns]
test_data = pd.DataFrame(test_data).drop(columns='Churn')

target = final_train.Churn

train_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'ratio',
       'digitized', 'twelver', 'elevener', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'MultipleLines_No', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
       'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [8]:
#### Preprocess and scale the data ####

# dropped accuracy severely

from sklearn.preprocessing import scale
train_data=scale(train_data)
test_data=scale(test_data)


In [9]:
#### Lets determine correlations so we can drop columns ####

corr = full_data.corr()

corr_array = pd.DataFrame(corr)
corr_unstacked = corr.unstack()
corr_list = corr_unstacked.sort_values(kind="quicksort",ascending = False)

corr_churn = pd.DataFrame(corr['Churn'])
corr_churn_unstacked = corr_churn.unstack().abs()
corr_churn_list = corr_churn_unstacked.sort_values(kind="quicksort", ascending = False)

print(corr_churn_list)

Churn  Churn                                      1.000000
       Contract_Month-to-month                    0.406114
       digitized                                  0.384282
       tenure                                     0.361497
       PaymentMethod_Electronic check             0.306797
       Contract_Two year                          0.298762
       InternetService_Fiber optic                0.297013
       InternetService_No                         0.222458
       TotalCharges                               0.212372
       PaperlessBilling_No                        0.185480
       PaperlessBilling_Yes                       0.185480
       Contract_One year                          0.182562
       MonthlyCharges                             0.180156
       OnlineSecurity_Yes                         0.178086
       OnlineSecurity_No                          0.178086
       TechSupport_Yes                            0.174806
       TechSupport_No                             0.1748

In [10]:
#### Train the log reg model on the TRAIN data ####

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  cross_val_score,GridSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix

logreg = LogisticRegression()

logreg.fit(train_data, target)
pred = logreg.predict_proba(train_data)

parameters = logreg.coef_

model_score = cross_val_score(logreg, train_data, target, cv=10)
print('logreg cv scores are: ',model_score)
print('avg of the cv scores: ',model_score.mean())


print('Parameters: ',parameters)

prediction = pd.DataFrame(pred)
prediction = prediction.drop(columns = 0, axis = 1)
prediction.to_csv('pred.csv')

#accuracy = accuracy_score(target,prediction)
#print('Accuracy: ',accuracy)

print('ROC_AUC is: ',roc_auc_score(target, prediction))


logreg cv scores are:  [0.82230624 0.80151229 0.80151229 0.78827977 0.80681818 0.79545455
 0.83712121 0.81060606 0.82352941 0.77609108]
avg of the cv scores:  0.8063231079369976
Parameters:  [[ 0.08375557  0.04159855 -0.53097551 -0.08747726  0.0136777   0.81942709
  -0.0416381  -0.0261713   0.00667857 -0.00667857  0.03491813 -0.10644938
   0.11804865 -0.08678156  0.59252176 -0.61325233  0.07001648 -0.07001648
   0.02522308 -0.02522308 -0.02477641  0.02477641  0.05705898 -0.05705898
  -0.12369783  0.12369783 -0.2276371   0.30513044 -0.02970263 -0.32519738
  -0.08011285  0.08011285  0.00584942 -0.05806457  0.11829956 -0.08205972]]
ROC_AUC is:  0.8543758967001436


In [11]:
param_df =pd.DataFrame(parameters).transpose().abs()
col_df = pd.DataFrame(final_train.columns.drop(['Churn']))

col_df.columns = ['Columns']
param_df.columns = ['Coefficients']
coef_df = param_df.join(col_df)

#coef_df.sort_values('Coefficients')
coef_df.sort_values(by='Coefficients',kind="quicksort",ascending = False)

Unnamed: 0,Coefficients,Columns
5,0.819427,digitized
15,0.613252,PhoneService_Yes
14,0.592522,PhoneService_No
2,0.530976,MonthlyCharges
29,0.325197,TechSupport_Yes
27,0.30513,DeviceProtection_Yes
26,0.227637,DeviceProtection_No
24,0.123698,OnlineBackup_No
25,0.123698,OnlineBackup_Yes
34,0.1183,Contract_Month-to-month


In [12]:


output = logreg.predict_proba(test_data)

output = pd.DataFrame(output, columns = ['drop1','Churn'])
cust_col = pd.DataFrame(test['Customer Number'], columns = ['Customer Number'])
final_output = pd.merge(cust_col, output, on=cust_col.index)

drop_cols = [0,2]
final_output.drop(final_output.columns[drop_cols], axis = 1, inplace = True)
final_output.to_csv('output.csv', index = False)

