In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn.cross_validation import train_test_split
%matplotlib inline



In [2]:
telco = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
#Data Cleaning
#Change TotalCharges to numeric float values and drop all rows with NaN value
t = telco['TotalCharges']
telco['TotalCharges'] = pd.to_numeric(t, errors='coerce')
telco = telco.dropna()

# get dummy variables to categorical columns
telco = pd.get_dummies(telco, columns = ['Churn'])


In [4]:
telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn_No,Churn_Yes
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,1,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,0,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,1,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,0,1


In [5]:
# Get dummies on features with two options
# define features with dummies and continuous variables
# Use Ridge regression to predict Churn

In [6]:
list(telco.columns)

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn_No',
 'Churn_Yes']

In [7]:
telco = pd.get_dummies(telco, columns = ['SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod'])

In [8]:
telco.head()

Unnamed: 0,customerID,gender,tenure,MonthlyCharges,TotalCharges,Churn_No,Churn_Yes,SeniorCitizen_0,SeniorCitizen_1,Partner_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,Female,1,29.85,29.85,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
1,5575-GNVDE,Male,34,56.95,1889.5,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,3668-QPYBK,Male,2,53.85,108.15,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,7795-CFOCW,Male,45,42.3,1840.75,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,9237-HQITU,Female,2,70.7,151.65,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [9]:
# Using features, Apply model
ridgreg = linear_model.Ridge(alpha=0, fit_intercept=False)

x = telco.drop(['customerID', 'Churn_No', 'Churn_Yes', 'gender'], 1)
y = telco['Churn_Yes']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
ridgreg.fit(x_train, y_train)


Ridge(alpha=0, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [10]:
#Predict
y_pred = ridgreg.predict(x_test)
print('Accuracy of Ridge Regression = ' + str(ridgreg.score(x_test, y_test)))

Accuracy of Ridge Regression = 0.26312164138476835
