
# Lab | Random Forest
#### Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
#### Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.


In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import warnings
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [2]:
data = pd.read_csv("Customer-Churn.csv")
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
data["gender"].value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [4]:
def gender(x):

    if x == "Male":
         return 1
    else:
        return 0

In [5]:
data["gender"] = data["gender"].apply(gender)
data["gender"].value_counts()

1    3555
0    3488
Name: gender, dtype: int64

In [6]:
data["SeniorCitizen"].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [7]:
data["Partner"].value_counts()

No     3641
Yes    3402
Name: Partner, dtype: int64

In [8]:
def partner(x):

    if x == "Yes":
         return 1
    else:
        return 0

In [9]:
data["Partner"] = data["Partner"].apply(partner)
data["Partner"].value_counts()

0    3641
1    3402
Name: Partner, dtype: int64

In [10]:
data["Dependents"].value_counts()

No     4933
Yes    2110
Name: Dependents, dtype: int64

In [11]:
def dependents(x):

    if x == "Yes":
         return 1
    else:
        return 0

In [12]:
data["Dependents"] = data["Dependents"].apply(dependents)
data["Dependents"].value_counts()

0    4933
1    2110
Name: Dependents, dtype: int64

In [13]:
data["tenure"].value_counts()

1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: tenure, Length: 73, dtype: int64

In [14]:
data["tenure"].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39])

In [15]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,1,0,0,0,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,1,0,0,0,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,1,0,0,0,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,0,0,0,0,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [16]:
data["PhoneService"].value_counts()

Yes    6361
No      682
Name: PhoneService, dtype: int64

In [17]:
def phoneserv(x):

    if x == "Yes":
         return 1
    else:
        return 0

In [18]:
data["PhoneService"] = data["PhoneService"].apply(phoneserv)
data["PhoneService"].value_counts()

1    6361
0     682
Name: PhoneService, dtype: int64

In [19]:
data["OnlineSecurity"].value_counts()

No                     3498
Yes                    2019
No internet service    1526
Name: OnlineSecurity, dtype: int64

In [20]:
def onlinesec(x):

    if x == "Yes":
         return 1
    elif x == "No":
        return 0
    else:
        return 2

In [21]:
data["OnlineSecurity"] = data["OnlineSecurity"].apply(onlinesec)
data["OnlineSecurity"].value_counts()

0    3498
1    2019
2    1526
Name: OnlineSecurity, dtype: int64

In [22]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [23]:
data["OnlineBackup"].value_counts()

No                     3088
Yes                    2429
No internet service    1526
Name: OnlineBackup, dtype: int64

In [24]:
data["OnlineBackup"] = data["OnlineBackup"].apply(onlinesec)
data["OnlineBackup"].value_counts()

0    3088
1    2429
2    1526
Name: OnlineBackup, dtype: int64

In [25]:
data["DeviceProtection"].value_counts()

No                     3095
Yes                    2422
No internet service    1526
Name: DeviceProtection, dtype: int64

In [26]:
data["DeviceProtection"] = data["DeviceProtection"].apply(onlinesec)
data["DeviceProtection"].value_counts()

0    3095
1    2422
2    1526
Name: DeviceProtection, dtype: int64

In [27]:
data["TechSupport"].value_counts()

No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64

In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int64  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   OnlineSecurity    7043 non-null   int64  
 7   OnlineBackup      7043 non-null   int64  
 8   DeviceProtection  7043 non-null   int64  
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(9), object(6)
memory 

In [29]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,1,0,No,No,No,Month-to-month,29.85,29.85,No
1,1,0,0,0,34,1,1,0,1,No,No,No,One year,56.95,1889.5,No
2,1,0,0,0,2,1,1,1,0,No,No,No,Month-to-month,53.85,108.15,Yes
3,1,0,0,0,45,0,1,0,1,Yes,No,No,One year,42.3,1840.75,No
4,0,0,0,0,2,1,0,0,0,No,No,No,Month-to-month,70.7,151.65,Yes


In [30]:
data["TechSupport"].value_counts()

No                     3473
Yes                    2044
No internet service    1526
Name: TechSupport, dtype: int64

In [31]:
data["TechSupport"] = data["TechSupport"].apply(onlinesec)
data["TechSupport"].value_counts()

0    3473
1    2044
2    1526
Name: TechSupport, dtype: int64

In [32]:
data["StreamingTV"].value_counts()

No                     2810
Yes                    2707
No internet service    1526
Name: StreamingTV, dtype: int64

In [33]:
data["StreamingTV"] = data["StreamingTV"].apply(onlinesec)
data["StreamingTV"].value_counts()

0    2810
1    2707
2    1526
Name: StreamingTV, dtype: int64

In [34]:
data["StreamingMovies"].value_counts()

No                     2785
Yes                    2732
No internet service    1526
Name: StreamingMovies, dtype: int64

In [35]:
data["StreamingMovies"] = data["StreamingMovies"].apply(onlinesec)
data["StreamingMovies"].value_counts()

0    2785
1    2732
2    1526
Name: StreamingMovies, dtype: int64

In [36]:
data["Contract"].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [37]:
def contr(x):

    if x == "Month-to-month":
         return 0
    elif x == "One year":
        return 1
    else:
        return 2

In [38]:
data["Contract"] = data["Contract"].apply(contr)
data["Contract"].value_counts()

0    3875
2    1695
1    1473
Name: Contract, dtype: int64

In [39]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,1,0,0,0,0,0,29.85,29.85,No
1,1,0,0,0,34,1,1,0,1,0,0,0,1,56.95,1889.5,No
2,1,0,0,0,2,1,1,1,0,0,0,0,0,53.85,108.15,Yes
3,1,0,0,0,45,0,1,0,1,1,0,0,1,42.3,1840.75,No
4,0,0,0,0,2,1,0,0,0,0,0,0,0,70.7,151.65,Yes


In [40]:
data["Churn"].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [41]:
data["Churn"] = data["Churn"].apply(phoneserv)
data["Churn"].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [42]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,1,0,0,0,0,0,29.85,29.85,0
1,1,0,0,0,34,1,1,0,1,0,0,0,1,56.95,1889.5,0
2,1,0,0,0,2,1,1,1,0,0,0,0,0,53.85,108.15,1
3,1,0,0,0,45,0,1,0,1,1,0,0,1,42.3,1840.75,0
4,0,0,0,0,2,1,0,0,0,0,0,0,0,70.7,151.65,1


In [43]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   int64  
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   int64  
 3   Dependents        7043 non-null   int64  
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   int64  
 6   OnlineSecurity    7043 non-null   int64  
 7   OnlineBackup      7043 non-null   int64  
 8   DeviceProtection  7043 non-null   int64  
 9   TechSupport       7043 non-null   int64  
 10  StreamingTV       7043 non-null   int64  
 11  StreamingMovies   7043 non-null   int64  
 12  Contract          7043 non-null   int64  
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   int64  
dtypes: float64(1), int64(14), object(1)
memory

In [44]:
data["TotalCharges"].value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: TotalCharges, Length: 6531, dtype: int64

In [45]:
# only 11 blank spaces (" "), i'll drop them (11 rows in 7043)

In [46]:
data["TotalCharges"] = data["TotalCharges"].replace(" ", float("NaN"))
data["TotalCharges"] = data["TotalCharges"].astype(float)
data = data.dropna(subset=["TotalCharges"])

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   int64  
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   int64  
 3   Dependents        7032 non-null   int64  
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   int64  
 6   OnlineSecurity    7032 non-null   int64  
 7   OnlineBackup      7032 non-null   int64  
 8   DeviceProtection  7032 non-null   int64  
 9   TechSupport       7032 non-null   int64  
 10  StreamingTV       7032 non-null   int64  
 11  StreamingMovies   7032 non-null   int64  
 12  Contract          7032 non-null   int64  
 13  MonthlyCharges    7032 non-null   float64
 14  TotalCharges      7032 non-null   float64
 15  Churn             7032 non-null   int64  
dtypes: float64(2), int64(14)
memory usage: 933

In [48]:
data.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [49]:
# After encoding manually everything (i think that i lost too much time in this... 
# there must be another way so much easier to do this)
# I'll apply the SMOTE and TomekLinks and compare them

In [50]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X = data[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = data['Churn']
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    5163
1    5163
Name: Churn, dtype: int64

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)

y_pred = rf_regressor.predict(X_test)


In [59]:
model = RandomForestRegressor(criterion='friedman_mse', max_depth=None, random_state=42, bootstrap=True, n_jobs=-1)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.504698168357204