# Predicting Churn

<hr>

## Logistic Regression

In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw customer table; the relative path means the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('telco-customer-churn.csv')

In [3]:
# `dropna` returns a new frame rather than modifying `df`, so its result has to
# be assigned back or the call has no effect. Chaining the column drop (instead
# of `inplace=True`) with `errors='ignore'` also lets this cell be re-run
# without raising a KeyError once `customerID` is already gone.
df = df.dropna().drop(columns='customerID', errors='ignore')

Let's start with the columns that take exactly two distinct values: passing `drop_first=True` to `get_dummies` encodes each of them as a single indicator column instead of two redundant ones.

In [4]:
# Names of the columns that hold exactly two distinct (non-missing) values.
binary_cols = [name for name in df.columns if df[name].nunique() == 2]

In [5]:
# Indicator-encode the two-valued columns; `drop_first=True` keeps one dummy
# per encoded column (the first level is dropped).
data_binaries = pd.get_dummies(data=df.loc[:, binary_cols], drop_first=True)

In [6]:
# Show the column labels produced by the binary encoding.
data_binaries.columns

Index(['SeniorCitizen', 'gender_Male', 'Partner_Yes', 'Dependents_Yes',
       'PhoneService_Yes', 'PaperlessBilling_Yes', 'Churn_Yes'],
      dtype='object')

Next, encode a selection of the remaining multi-valued categorical columns.

In [7]:
# List all columns of the cleaned frame to pick the remaining ones to encode.
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [8]:
# Count the rows for each distinct value of `OnlineBackup`.
df.value_counts('OnlineBackup')

OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
dtype: int64

In [9]:
# Multi-valued categorical columns to one-hot encode, in the order their dummy
# columns should appear.
other_cols = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'DeviceProtection',
    'TechSupport',
    'Contract',
    'OnlineBackup',
    'PaymentMethod',
]

In [10]:
# One-hot encode the selected categorical columns, keeping a dummy for every
# level (no `drop_first` here).
data_cat = pd.get_dummies(data=df.loc[:, other_cols])

In [11]:
# Place the multi-level dummies and the binary indicators side by side,
# aligned on the row index; the binary block (ending in `Churn_Yes`) comes last.
data = pd.concat(objs=(data_cat, data_binaries), axis='columns')

In [12]:
# (rows, columns) of the combined encoded frame.
data.shape

(7043, 32)

The first column name and the last two column names:

In [13]:
# Pick out the first label and the last two labels of the combined frame.
data.columns[[0, -2, -1]]

Index(['MultipleLines_No', 'PaperlessBilling_Yes', 'Churn_Yes'], dtype='object')

In [14]:
# Features are every encoded column except the churn indicator; the churn
# indicator itself is the target.
y = data['Churn_Yes']
x = data.drop(columns='Churn_Yes')

In [15]:
# Fix the seed so the split (and therefore the score computed from it) is the
# same on every Restart & Run All; stratifying on `y` keeps the proportion of
# churners equal in the training and test portions.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42, stratify=y)

In [16]:
# Create an (unfitted) logistic-regression model that uses the L-BFGS solver.
classifier = LogisticRegression(solver='lbfgs')

In [17]:
# Fit the model on the training portion only.
classifier.fit(xtrain, ytrain)

LogisticRegression()

In [18]:
# Evaluate on the held-out portion; for scikit-learn classifiers `score`
# reports mean accuracy.
classifier.score(xtest, ytest)

0.7830777967064169