In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('Train_Churn_binary.csv')
test = pd.read_csv('Test_Churn.csv')

full_data = pd.concat([train, test])

churn = test['Churn']

full_data.drop(['Customer Number'], axis=1, inplace=True)

full_data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
#### Early manipulation ####

full_data['TotalCharges'] = full_data['TotalCharges'].replace(to_replace = " ",value = 0)
full_data['TotalCharges'] = full_data['TotalCharges'].astype(float)

#replace 'No internet service' option to No
internet_columns = ['OnlineSecurity','OnlineBackup','DeviceProtection',
                   'TechSupport','StreamingTV','StreamingMovies']

for i in internet_columns:
    full_data[i] = full_data[i].replace({'No internet service':'No'})

#full_data.info()


In [4]:
#### Dummy time ####

full_data = pd.get_dummies(data=full_data, columns=['gender', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod'],
    drop_first=False)

#full_data.info()

In [5]:
#full_data = full_data.drop(full_data['gender_Male'])

final_train = full_data.head(5282)
final_test = full_data.tail(1761)

final_train.to_csv('final_train.csv')
final_test.to_csv('final_test.csv')

full_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [6]:
columns = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']



train_data = final_train[columns]
train_data = pd.DataFrame(train_data).drop(columns='Churn')
test_data = final_test[columns]
test_data = pd.DataFrame(test_data).drop(columns='Churn')

target = final_train.Churn

train_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [7]:
#### Preprocess and scale the data ####

# dropped accuracy severely

from sklearn.preprocessing import scale
train_data=scale(train_data)
test_data=scale(test_data)


In [8]:
#### Lets determine correlations so we can drop columns ####

corr = full_data.corr()

corr_array = pd.DataFrame(corr)
corr_unstacked = corr.unstack()
corr_list = corr_unstacked.sort_values(kind="quicksort",ascending = False)
#print(corr_list)

corr_churn = pd.DataFrame(corr['Churn'])
corr_churn_unstacked = corr_churn.unstack().abs()
corr_churn_list = corr_churn_unstacked.sort_values(kind="quicksort", ascending = False)
#corr_churn_list = corr_churn_list.abs()

print(corr_churn_list)

Churn  Churn                                      1.000000
       Contract_Month-to-month                    0.406114
       tenure                                     0.361497
       PaymentMethod_Electronic check             0.306797
       Contract_Two year                          0.298762
       InternetService_Fiber optic                0.297013
       InternetService_No                         0.222458
       TotalCharges                               0.212372
       PaperlessBilling_No                        0.185480
       PaperlessBilling_Yes                       0.185480
       Contract_One year                          0.182562
       MonthlyCharges                             0.180156
       OnlineSecurity_No                          0.178086
       OnlineSecurity_Yes                         0.178086
       TechSupport_Yes                            0.174806
       TechSupport_No                             0.174806
       Dependents_Yes                             0.1628

In [9]:
#### Train the log reg model on the TRAIN data ####

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  cross_val_score,GridSearchCV

logreg = LogisticRegression()

logreg.fit(train_data, target)
pred = logreg.predict_proba(train_data)
#accuracy = accuracy_score(target,pred)
parameters = logreg.coef_

model_score = cross_val_score(logreg, train_data, target, cv=10)
print('logreg cv scores are: ',model_score)
print('avg of the cv scores: ',model_score.mean())

#print('Accuracy: ',accuracy)
print('Parameters: ',parameters)



prediction = pd.DataFrame(pred)
prediction = prediction.drop(columns = 0, axis = 1)
prediction.to_csv('pred.csv')

logreg cv scores are:  [0.82230624 0.80718336 0.7826087  0.80151229 0.8030303  0.80492424
 0.83522727 0.81818182 0.81783681 0.77988615]
avg of the cv scores:  0.8072697182526781
Parameters:  [[ 0.08264528 -1.34744578 -0.55631577  0.57060526  0.00268926 -0.00268926
   0.01252134 -0.01252134  0.02575327 -0.02575327  0.00663151 -0.00663151
  -0.08487242  0.00663151  0.08208619 -0.05610737  0.53544766 -0.5800299
   0.08895286 -0.08895286  0.03168608 -0.03168608 -0.00497868  0.00497868
   0.07864976 -0.07864976 -0.10092401  0.10092401 -0.08783504  0.08783504
   0.25539461 -0.0556423  -0.24333985 -0.07535865  0.07535865 -0.0086371
  -0.07461776  0.12639081 -0.06058322]]


In [10]:
param_df =pd.DataFrame(parameters).transpose()
print(param_df.info)
param_df.to_csv('param_df.csv')


col_df = pd.DataFrame(final_train.columns.drop(['Churn']))
col_df.to_csv('train_col.csv')
print(col_df.info)


<bound method DataFrame.info of            0
0   0.082645
1  -1.347446
2  -0.556316
3   0.570605
4   0.002689
5  -0.002689
6   0.012521
7  -0.012521
8   0.025753
9  -0.025753
10  0.006632
11 -0.006632
12 -0.084872
13  0.006632
14  0.082086
15 -0.056107
16  0.535448
17 -0.580030
18  0.088953
19 -0.088953
20  0.031686
21 -0.031686
22 -0.004979
23  0.004979
24  0.078650
25 -0.078650
26 -0.100924
27  0.100924
28 -0.087835
29  0.087835
30  0.255395
31 -0.055642
32 -0.243340
33 -0.075359
34  0.075359
35 -0.008637
36 -0.074618
37  0.126391
38 -0.060583>
<bound method DataFrame.info of                                           0
0                             SeniorCitizen
1                                    tenure
2                            MonthlyCharges
3                              TotalCharges
4                             gender_Female
5                               gender_Male
6                                Partner_No
7                               Partner_Yes
8                  

In [11]:
output = logreg.predict_proba(test_data)

output = pd.DataFrame(output, columns = ['drop1','Churn'])
cust_col = pd.DataFrame(test['Customer Number'], columns = ['Customer Number'])
final_output = pd.merge(cust_col, output, on=cust_col.index)

drop_cols = [0,2]
final_output.drop(final_output.columns[drop_cols], axis = 1, inplace = True)
final_output.to_csv('output.csv', index = False)