In [1]:
import pandas as pd
import numpy as np
import warnings
# Installs a blanket "ignore" filter: no category/module is given, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides any warnings emitted by pandas / scikit-learn
# in the cells below — confirm that is intended before relying on the results.
warnings.filterwarnings("ignore")

In [2]:
# Read the two input splits from the working directory.
train = pd.read_csv('Train_Churn_binary.csv')
test = pd.read_csv('Test_Churn.csv')

# Keep a handle on the test split's Churn column as it was read from disk.
churn = test['Churn']

# Stack train on top of test so both splits go through the same preprocessing.
# Row labels of each frame are kept as-is (no ignore_index), and the customer
# identifier is removed from the combined frame.
full_data = pd.concat([train, test]).drop(columns=['Customer Number'])

full_data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [3]:
#### Early manipulation ####

# TotalCharges presumably arrives as text with blank cells for some rows.
# Treat ANY empty or whitespace-only entry (not just a single space) as a
# charge of 0, then make the column numeric. Genuinely malformed values still
# raise in astype(float) instead of being silently coerced.
full_data['TotalCharges'] = (
    full_data['TotalCharges']
    .replace(to_replace=r'^\s*$', value=0, regex=True)
    .astype(float)
)

# Add-on service columns carry a third level, 'No internet service'.
# Fold it into plain 'No' so each of these columns has only two levels.
internet_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']

for service_col in internet_columns:
    full_data[service_col] = full_data[service_col].replace({'No internet service': 'No'})


In [4]:
#### Dummy time ####

# Columns that get expanded into indicator columns.
categorical_columns = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# drop_first=False keeps one indicator per level of every listed column.
full_data = pd.get_dummies(
    full_data,
    columns=categorical_columns,
    drop_first=False,
)

In [5]:
# Split the combined frame back into its train and test parts.
# The boundary is taken from the frames that were concatenated rather than
# from hard-coded row counts, so the split stays correct if the CSVs change.
n_train = len(train)
n_test = len(test)
assert n_train + n_test == len(full_data), (
    "full_data is expected to be train rows followed by test rows"
)

# Positional slicing: full_data carries repeated row labels from the concat,
# so label-based selection would be ambiguous here.
final_train = full_data.iloc[:n_train]
final_test = full_data.iloc[n_train:]

# Persist both processed splits (row labels are written as the first column).
final_train.to_csv('final_train.csv')
final_test.to_csv('final_test.csv')

full_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [7]:
# Explicit column order used for both splits; 'Churn' is included here and
# removed again below when the feature matrices are built.
columns = [
    'SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn',
    'gender_Female', 'gender_Male',
    'Partner_No', 'Partner_Yes',
    'Dependents_No', 'Dependents_Yes',
    'PhoneService_No', 'PhoneService_Yes',
    'MultipleLines_No', 'MultipleLines_No phone service', 'MultipleLines_Yes',
    'InternetService_DSL', 'InternetService_Fiber optic', 'InternetService_No',
    'OnlineSecurity_No', 'OnlineSecurity_Yes',
    'OnlineBackup_No', 'OnlineBackup_Yes',
    'DeviceProtection_No', 'DeviceProtection_Yes',
    'TechSupport_No', 'TechSupport_Yes',
    'StreamingTV_No', 'StreamingTV_Yes',
    'StreamingMovies_No', 'StreamingMovies_Yes',
    'Contract_Month-to-month', 'Contract_One year', 'Contract_Two year',
    'PaperlessBilling_No', 'PaperlessBilling_Yes',
    'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
    'PaymentMethod_Electronic check',
    'PaymentMethod_Mailed check',
]

# Feature matrices: the selected columns of each split, minus the label.
train_data = final_train[columns].drop(columns='Churn')
test_data = final_test[columns].drop(columns='Churn')

# Label vector for fitting, taken from the training split only.
target = final_train['Churn']

train_data.columns

Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges',
       'gender_Female', 'gender_Male', 'Partner_No', 'Partner_Yes',
       'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_Yes', 'TechSupport_No', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_Yes', 'StreamingMovies_No',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')

In [8]:
#### Train the log reg model on the TRAIN data ####

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  cross_val_score,GridSearchCV

# Fit with scikit-learn's default hyper-parameters.
logreg = LogisticRegression()
logreg.fit(train_data, target)

# In-sample class probabilities (one column per class) and fitted coefficients.
pred = logreg.predict_proba(train_data)
parameters = logreg.coef_

# 10-fold cross-validation using the estimator's default scorer.
model_score = cross_val_score(logreg, train_data, target, cv=10)

print('logreg cv scores are: ',model_score)
print('avg of the cv scores: ',model_score.mean())
print('Prediction shape: ',pred.shape)
print('Parameters: ',parameters)

# Keep every probability column except the first and write it to disk.
prediction = pd.DataFrame(pred).drop(columns=0)
prediction.to_csv('pred.csv')

logreg cv scores are:  [0.8241966  0.80340265 0.7826087  0.79773157 0.8030303  0.80871212
 0.83333333 0.81060606 0.81783681 0.77988615]
avg of the cv scores:  0.8061344286840237
Prediction shape:  (5282, 2)
Parameters:  [[ 2.28547819e-01 -5.69874448e-02  1.33045840e-02  2.74526860e-04
  -1.06578462e-01 -1.17516323e-01 -8.27363644e-02 -1.41358420e-01
  -5.81086915e-02 -1.65986093e-01  1.32696402e-01 -3.56791186e-01
  -2.65925961e-01  1.32696402e-01 -9.08652254e-02 -4.55162475e-02
   3.58147252e-01 -5.36725789e-01  1.68738077e-01 -3.92832861e-01
   3.47888274e-02 -2.58883612e-01 -3.54316437e-02 -1.88663141e-01
   1.44283098e-01 -3.68377882e-01 -1.55972471e-01 -6.81223132e-02
  -1.28928620e-01 -9.51661650e-02  4.76785925e-01 -1.81737564e-01
  -5.19143146e-01 -2.77277871e-01  5.31830862e-02 -5.14906562e-02
  -2.13804092e-01  2.21060276e-01 -1.79860312e-01]]


In [9]:
# Score the held-out rows. predict_proba returns one column per class; the
# second column is the one exported as 'Churn'.
# NOTE(review): presumably the second class is the "churned" label — confirm
# against logreg.classes_.
output = pd.DataFrame(logreg.predict_proba(test_data), columns=['drop1', 'Churn'])

# Customer identifiers come from the raw test frame, since the column was
# removed from the combined frame before modelling.
cust_col = test[['Customer Number']]

# Pair identifiers with probabilities by row position. Both sides are reset to
# a fresh 0..n-1 index so the concat cannot misalign them, and the wanted
# columns are selected by name rather than dropped by position.
final_output = pd.concat(
    [cust_col.reset_index(drop=True), output[['Churn']]],
    axis=1,
)

final_output.to_csv('output.csv', index=False)