## Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pd.options.display.max_columns = 1000
from sklearn.model_selection import RandomizedSearchCV

## Load in Data

In [4]:
telcomChurn = pd.read_csv("telcomChurn.csv")
telcomChurn.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Wrangling

### Drop or recode anything that isn't a number or that isn't useful

### Dropping Customer ID because it is a unique identifier and not a helpful qualifier

In [5]:
telcomChurn1 = telcomChurn.drop('customerID', axis=1)

In [6]:
telcomChurn1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Recode the rest

In [7]:
def gender (series):
    if series == 'Male':
        return 0
    if series == "Female":
        return 1
telcomChurn1['GenderR'] = telcomChurn1['gender'].apply(gender)

In [8]:
telcomChurn.Partner.value_counts()

Partner
No     3641
Yes    3402
Name: count, dtype: int64

In [9]:
telcomChurn.Dependents.value_counts()

Dependents
No     4933
Yes    2110
Name: count, dtype: int64

In [10]:
telcomChurn.PhoneService.value_counts()

PhoneService
Yes    6361
No      682
Name: count, dtype: int64

In [11]:
telcomChurn.OnlineSecurity.value_counts()

OnlineSecurity
No                     3498
Yes                    2019
No internet service    1526
Name: count, dtype: int64

In [12]:
telcomChurn.OnlineBackup.value_counts()

OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64

In [13]:
telcomChurn.DeviceProtection.value_counts()

DeviceProtection
No                     3095
Yes                    2422
No internet service    1526
Name: count, dtype: int64

In [14]:
telcomChurn.TechSupport.value_counts()

TechSupport
No                     3473
Yes                    2044
No internet service    1526
Name: count, dtype: int64

In [15]:
telcomChurn.PaperlessBilling.value_counts()

PaperlessBilling
Yes    4171
No     2872
Name: count, dtype: int64

In [16]:
telcomChurn.Churn.value_counts()

Churn
No     5174
Yes    1869
Name: count, dtype: int64

### When there are mulitple variables with the same format, you can recode these all in the same definition.

In [19]:
def partner (series):
    if series == "No":
        return 0
    if series == "Yes":
        return 1
telcomChurn1['PartnerR'] = telcomChurn['Partner'].apply(partner)
telcomChurn1['DependentsR'] = telcomChurn['Dependents'].apply(partner)
telcomChurn1['PhoneServiceR'] = telcomChurn['PhoneService'].apply(partner)
telcomChurn1['OnlineSecurityR'] = telcomChurn['OnlineSecurity'].apply(partner)
telcomChurn1['OnlineBackupR'] = telcomChurn['OnlineBackup'].apply(partner)
telcomChurn1['ProtectionR'] = telcomChurn['DeviceProtection'].apply(partner)
telcomChurn1['TechSupportR'] = telcomChurn['TechSupport'].apply(partner)
telcomChurn1['PaperlessR'] = telcomChurn['PaperlessBilling'].apply(partner)
telcomChurn1['ChurnR'] = telcomChurn['Churn'].apply(partner)

In [21]:
telcomChurn.StreamingTV.value_counts()

StreamingTV
No                     2810
Yes                    2707
No internet service    1526
Name: count, dtype: int64

In [22]:
telcomChurn.StreamingMovies.value_counts()

StreamingMovies
No                     2785
Yes                    2732
No internet service    1526
Name: count, dtype: int64

In [23]:
def streaming (series):
    if series == "No":
        return 0
    if series == "Yes":
        return 1
    if series == "No internet service":
        return 2
telcomChurn1['TVStreamingR'] = telcomChurn1['StreamingTV'].apply(streaming)
telcomChurn1['MovieStreamingR'] = telcomChurn1['StreamingMovies'].apply(streaming)

In [24]:
telcomChurn.MultipleLines.value_counts()

MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

In [25]:
def phone (series):
    if series == "No":
        return 0
    if series == "Yes":
        return 1
    if series == "No phone service":
        return 2
telcomChurn1['MultipleLinesR'] = telcomChurn1['MultipleLines'].apply(phone)

In [26]:
telcomChurn.InternetService.value_counts()

InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64

In [27]:
def internet (series):
    if series == "No":
        return 0
    if series == "Fiber optic":
        return 1
    if series == "DSL":
        return 2
telcomChurn1['InternetServiceR'] = telcomChurn1['InternetService'].apply(internet)

In [28]:
telcomChurn.Contract.value_counts()

Contract
Month-to-month    3875
Two year          1695
One year          1473
Name: count, dtype: int64

In [29]:
def contract (series):
    if series == "Month-to-month":
        return 0
    if series == "One year":
        return 1
    if series == "Two year":
        return 2
telcomChurn1['ContractR'] = telcomChurn1['Contract'].apply(contract)

In [31]:
telcomChurn.PaymentMethod.value_counts()

PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

In [32]:
def billing (series):
    if series == "Electronic check":
        return 0
    if series == "Mailed check":
        return 1
    if series == "Bank transfer (automatic)":
        return 2
    if series == "Credit card (automatic)":
        return 3
telcomChurn1['PaymentR'] = telcomChurn1['PaymentMethod'].apply(billing)

In [33]:
telcomChurn1.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,GenderR,PartnerR,DependentsR,PhoneServiceR,OnlineSecurityR,OnlineBackupR,DeviceProtectionR,TechSupportR,ProtectionR,PaperlessR,ChurnR,TVStreamingR,MovieStreamingR,MultipleLinesR,InternetServiceR,ContractR,PaymentR
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1,1,0,0,0.0,1.0,0.0,0.0,0.0,1,0,0,0,2,2,0,0
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,0,0,0,1,1.0,0.0,1.0,0.0,1.0,0,0,0,0,0,2,1,1
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,0,0,0,1,1.0,1.0,0.0,0.0,0.0,1,1,0,0,0,2,0,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,0,0,0,0,1.0,0.0,1.0,1.0,1.0,0,0,0,0,2,2,1,2
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,0,0,1,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,1,0,0


In [34]:
telcomChurn1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             7043 non-null   object 
 1   SeniorCitizen      7043 non-null   int64  
 2   Partner            7043 non-null   object 
 3   Dependents         7043 non-null   object 
 4   tenure             7043 non-null   int64  
 5   PhoneService       7043 non-null   object 
 6   MultipleLines      7043 non-null   object 
 7   InternetService    7043 non-null   object 
 8   OnlineSecurity     7043 non-null   object 
 9   OnlineBackup       7043 non-null   object 
 10  DeviceProtection   7043 non-null   object 
 11  TechSupport        7043 non-null   object 
 12  StreamingTV        7043 non-null   object 
 13  StreamingMovies    7043 non-null   object 
 14  Contract           7043 non-null   object 
 15  PaperlessBilling   7043 non-null   object 
 16  PaymentMethod      7043 

### Have to have everything we want to use as an integer (or float).

### Total Charges is coming in as an object.

### Convert TotalCharges to an int

In [37]:
telcomChurn1['TotalCharges'] = pd.to_numeric(telcomChurn['TotalCharges'], errors='coerce')

In [38]:
telcomChurn1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             7043 non-null   object 
 1   SeniorCitizen      7043 non-null   int64  
 2   Partner            7043 non-null   object 
 3   Dependents         7043 non-null   object 
 4   tenure             7043 non-null   int64  
 5   PhoneService       7043 non-null   object 
 6   MultipleLines      7043 non-null   object 
 7   InternetService    7043 non-null   object 
 8   OnlineSecurity     7043 non-null   object 
 9   OnlineBackup       7043 non-null   object 
 10  DeviceProtection   7043 non-null   object 
 11  TechSupport        7043 non-null   object 
 12  StreamingTV        7043 non-null   object 
 13  StreamingMovies    7043 non-null   object 
 14  Contract           7043 non-null   object 
 15  PaperlessBilling   7043 non-null   object 
 16  PaymentMethod      7043 

### Drop Missing and Infinite Values

In [39]:
telcomChurn1.dropna(inplace=True)

## Define x & y variables

In [40]:
x = telcomChurn1[['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'GenderR', 
                  'PartnerR', 'DependentsR', 'PhoneServiceR', 'OnlineSecurityR', 
                  'OnlineBackupR', 'DeviceProtectionR', 'TechSupportR', 'ProtectionR', 
                  'PaperlessR', 'ChurnR', 'TVStreamingR', 'MovieStreamingR', 
                  'MultipleLinesR', 'InternetServiceR', 'ContractR', 'PaymentR']]
y = telcomChurn1['Churn']

In [41]:
x.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,GenderR,PartnerR,DependentsR,PhoneServiceR,OnlineSecurityR,OnlineBackupR,DeviceProtectionR,TechSupportR,ProtectionR,PaperlessR,ChurnR,TVStreamingR,MovieStreamingR,MultipleLinesR,InternetServiceR,ContractR,PaymentR
0,0,1,29.85,29.85,1,1,0,0,0.0,1.0,0.0,0.0,0.0,1,0,0,0,2,2,0,0
1,0,34,56.95,1889.5,0,0,0,1,1.0,0.0,1.0,0.0,1.0,0,0,0,0,0,2,1,1
2,0,2,53.85,108.15,0,0,0,1,1.0,1.0,0.0,0.0,0.0,1,1,0,0,0,2,0,1
3,0,45,42.3,1840.75,0,0,0,0,1.0,0.0,1.0,1.0,1.0,0,0,0,0,2,2,1,2
4,0,2,70.7,151.65,1,0,0,1,0.0,0.0,0.0,0.0,0.0,1,1,0,0,0,1,0,0


In [42]:
y.head()

0     No
1     No
2    Yes
3     No
4    Yes
Name: Churn, dtype: object

## Train Test Split

In [43]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

## Create Initial Decision Tree

In [45]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)

## Get Predictions

In [46]:
treePredictions = decisionTree.predict(x_test)
print(confusion_matrix(y_test, treePredictions))

[[1131    0]
 [   0  523]]


#### My values vary quite a bit from the video.

#### She did not have 0s, but numbers

In [49]:
print(classification_report(y_test, treePredictions))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      1131
         Yes       1.00      1.00      1.00       523

    accuracy                           1.00      1654
   macro avg       1.00      1.00      1.00      1654
weighted avg       1.00      1.00      1.00      1654



### Uhhhh...... 100% 

## Create Initial Random Forest Model

In [51]:
forest = RandomForestClassifier()
forest.fit(x_train, y_train)

In [52]:
print(accuracy_score(y_test, forest.predict(x_test)))

1.0


#### I got 100% accuracy here too.

In [53]:
print(classification_report(y_test, forest.predict(x_test)))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00      1131
         Yes       1.00      1.00      1.00       523

    accuracy                           1.00      1654
   macro avg       1.00      1.00      1.00      1654
weighted avg       1.00      1.00      1.00      1654



#### I got 100%