# YSO Telecom business review

## Python Project 

#### Submitted by: Yair Barel • Shlomi Kiko • Ori Valdman

###

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw churn export.
# NOTE(review): absolute path on one developer's machine — consider a project-relative path.
df = pd.read_csv(r'C:\Users\ybarel\Desktop\Documents\Data Analyst Course\Python Project\Data Source\churn.csv')

# Lower-case every column name so the rest of the notebook uses one spelling.
df = df.rename(columns=str.lower)

# Customers whose internetservice is 'No' get the explicit label 'No Internet Service'.
df['internetservice'] = df['internetservice'].replace('No', 'No Internet Service')

# Fold the "service not available" answers into a plain 'No'.
df['multiplelines'] = df['multiplelines'].replace('No phone service', 'No')
internet_addons = ['onlinesecurity', 'onlinebackup', 'deviceprotection',
                   'techsupport', 'streamingtv', 'streamingmovies']
df[internet_addons] = df[internet_addons].replace('No internet service', 'No')

# Convert Yes/No answers into 1/0 flags.
yes_no_to_flag = {'Yes': 1, 'No': 0}
yes_no_columns = ['churn', 'multiplelines', *internet_addons,
                  'partner', 'dependents', 'phoneservice', 'paperlessbilling']
for column in yes_no_columns:
    df[column] = df[column].map(yes_no_to_flag)

# Parse totalcharges as float; entries that cannot be parsed become NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').astype('float64')

# Store the text columns with the dedicated pandas string dtype.
for column in ['customerid', 'gender', 'contract', 'paymentmethod', 'internetservice']:
    df[column] = df[column].astype('string')

# Fill missing totalcharges with the monthly charge. The result is assigned
# back explicitly: calling fillna(..., inplace=True) on a column selection is
# a chained in-place update, which is deprecated and has no effect on df
# under pandas Copy-on-Write.
df['totalcharges'] = df['totalcharges'].fillna(df['monthlycharges'])

##### Total Customers and Total Churn 
***

In [None]:
# Headline numbers: total customers and how many of them churned.
Total_Churn = df['churn'].sum()
Total_Customers = len(df)

print('\nThere are %d customers in records\nThe number of churn customers is %d which makes the churn rate %f%%\n\n'
      % (Total_Customers, Total_Churn, (Total_Churn / Total_Customers * 100)))

# Pin the slice order (0 = active, 1 = churn) so colours and legend entries
# always line up, whichever class happens to be larger.
Churn_Pie = df['churn'].value_counts().reindex([0, 1], fill_value=0)

# Build the legend from the data instead of hard-coding the percentages.
Churn_Share = Churn_Pie / Churn_Pie.sum() * 100
LegendLabels = ['Active (%.1f%%)' % Churn_Share.loc[0], 'Churn (%.1f%%)' % Churn_Share.loc[1]]
PieLabels = None
PieColors = ["#74959A", "#F1E0AC"]

plt.figure(figsize=(10, 10))
Churn_Pie.plot.pie(autopct='%1.1f%%', labels=PieLabels, colors=PieColors, textprops={'fontsize': 18})
plt.legend(LegendLabels, prop={"size": 16}, loc=2)
plt.xlabel(None)
plt.ylabel(None)
plt.title(None)
plt.xticks(rotation=0)
plt.savefig('Total Churn Customers.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Contract Type
***

In [None]:
# Churn rate (mean of the 0/1 churn flag) for each contract type.
Churn_Contract = df.groupby('contract').churn.mean()

Month_to_month = Churn_Contract.at['Month-to-month']
One_Year = Churn_Contract.at['One year']
Two_Year = Churn_Contract.at['Two year']

print('\nWe measure the customers Churn by the contract types:\n\nMonth-to-Month contract  %f%%\nOne-Year contract        %f%%\nTwo-Years contract       %f%%\n\n'
      % (Month_to_month * 100, One_Year * 100, Two_Year * 100))

# Put each contract's share of customers next to its churn rate.
Dist_Contract = df.contract.value_counts(normalize=True).sort_index()
Contract_Dist_Churn = pd.DataFrame({'Contract Type': Dist_Contract, 'Churn Rate': Churn_Contract})

Colors = ["#5B7DB1", "#61A4BC"]

ax = Contract_Dist_Churn.plot(kind='bar', color=Colors, figsize=(15, 10))
plt.ylabel(None)
plt.title('Churn by Contract Types', fontsize=16, loc='left')
plt.xticks(rotation=0, size=14)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])
plt.legend(prop={"size": 14})

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.savefig('Churn by Contract Types.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Tenure
***

In [None]:
# Histogram of customer tenure; the bin count is chosen automatically.
plt.figure(figsize=(10, 8))
plt.hist(df['tenure'], bins='auto', alpha=0.7, rwidth=0.85)
plt.xlabel('Tenure', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.title('Tenure Frequency', fontsize=16, loc='left')
plt.show()

In [None]:
# Churn rate for every individual tenure value, one bar per value.
Churn_Tenure = df.groupby('tenure').churn.mean()

plt.figure(figsize=(25, 8))
Churn_Tenure.plot(kind='bar', color='#61A4BC')
plt.xlabel('Tenure', fontsize=14)
plt.ylabel('Churn Rate', fontsize=14)
plt.title('Churn by Tenure', fontsize=16, loc='left')
plt.xticks(rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])

plt.savefig('Churn by Tenure.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Churn rate per tenure value as a two-column frame (tenure, churn) for plotting.
Churn_Tenure = df.groupby('tenure').churn.mean().reset_index()

Colors = ["#61A4BC"]

Churn_Tenure.plot.scatter(x='tenure', y='churn', figsize=(10, 10), s=30, c=Colors)
plt.xlabel('Tenure', fontsize=14)
plt.ylabel('Churn Rate', fontsize=14)
plt.title('Churn by Tenure', fontsize=16, loc='left')
plt.xticks(rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])

# Own file name for the scatter: it was previously saved as
# 'Churn by TenureGroup.png', the same name the tenure-group bar chart uses,
# so one image overwrote the other.
plt.savefig('Churn by Tenure Scatter.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Bucket tenure into 12-month groups: 0-12 -> 1, 13-24 -> 2, ..., 61-72 -> 6.
# The bucket is computed arithmetically rather than with six hand-written
# ranges, so tenures above 72 or fractional tenures also get a group instead
# of leaving None behind and breaking the int64 cast.
df['tenuregroup'] = np.ceil(df['tenure'] / 12).clip(lower=1).astype('int64')

# Share of customers and churn rate for each tenure group.
Dist_TenureGroup = df.tenuregroup.value_counts(normalize=True).sort_index()
Churn_TenureGroup = df.groupby('tenuregroup').churn.mean()
TenureGroup_Dist_Churn = pd.DataFrame({'Tenure Year Group': Dist_TenureGroup, 'Churn Rate': Churn_TenureGroup})

Colors = ["#5B7DB1", "#61A4BC"]

ax = TenureGroup_Dist_Churn.plot(kind='bar', color=Colors, width=0.7, figsize=(18, 10))
plt.xlabel(None)
plt.ylabel(None)
plt.title('Churn by Tenure Year Group', fontsize=16, loc='left')
plt.xticks(size=13, rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, ['{:,.0%}'.format(tick) for tick in tick_positions])
plt.legend(prop={"size": 14})

# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005), size=15, color='#545454')

plt.savefig('Churn by TenureGroup.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Internet Service
***

In [None]:
# Share of customers and churn rate for each internet service type.
Dist_InternetService = df['internetservice'].value_counts(normalize=True).sort_values(ascending=True)
Churn_InternetService = df.groupby('internetservice')['churn'].mean()
InternetService_Dist_Churn = pd.DataFrame({
    'Internet Service': Dist_InternetService,
    'Churn Rate': Churn_InternetService,
})

Colors = ["#5B7DB1", "#61A4BC"]

ax = InternetService_Dist_Churn.plot(kind='bar', color=Colors, figsize=(15, 10))
plt.xlabel('')
plt.ylabel(None)
plt.title('Churn by Internet Service', fontsize=16, loc='left')
plt.xticks(rotation=0, size=14)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])
plt.legend(prop={"size": 14})

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.savefig('Churn by Internet Service.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Payment Method
***

In [None]:
# Share of customers and churn rate for each payment method.
Dist_PaymentMethod = df.paymentmethod.value_counts(normalize=True).sort_values(ascending=True)
Churn_PaymentMethod = df.groupby('paymentmethod').churn.mean().sort_values()
# Named after what it holds (it previously reused the internet-service name).
PaymentMethod_Dist_Churn = pd.DataFrame({'PaymentMethod': Dist_PaymentMethod, 'Churn Rate': Churn_PaymentMethod})

Colors = ["#5B7DB1", "#61A4BC"]

ax = PaymentMethod_Dist_Churn.plot(kind='barh', color=Colors, figsize=(15, 10))
plt.xlabel(None)
plt.ylabel(None)
plt.title('Churn by Payment Method', fontsize=16, loc='left')

# Relabel the existing x ticks as whole percentages.
tick_positions = plt.xticks()[0]
plt.xticks(tick_positions, ['{:,.0%}'.format(tick) for tick in tick_positions])
plt.yticks(size=14)
plt.legend(prop={"size": 14})

# Horizontal bars carry their value in the width, not the height, so the
# label goes at the end of the bar, vertically centred on it.
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_width()),
                (p.get_width() * 1.005, p.get_y() + p.get_height() / 2),
                va='center', size=15, color='#545454')

plt.savefig('Churn by Payment Method.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Paperless Billing
***

In [None]:
# Share of customers and churn rate for paper (0) vs paperless (1) billing.
Dist_PaperlessBilling = df['paperlessbilling'].value_counts(normalize=True).sort_values(ascending=True)
Churn_PaperlessBilling = df.groupby('paperlessbilling')['churn'].mean().sort_values()
InternetService_Dist_Churn = pd.DataFrame({
    'Billing Type': Dist_PaperlessBilling,
    'Churn Rate': Churn_PaperlessBilling,
})

Colors = ["#5B7DB1", "#61A4BC"]

ax = InternetService_Dist_Churn.plot(kind='bar', color=Colors, figsize=(15, 10))
plt.xlabel(None)
plt.ylabel(None)
plt.title('Churn by Billing Type', fontsize=16, loc='left')

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('Paper Billing', 'Electronic Billing')
plt.xticks(positions, labels, size=14, rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])
plt.legend(prop={"size": 14})

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.savefig('Churn by Billing Type.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Senior Citizen
***

In [None]:
# Share of customers and churn rate for non-senior (0) vs senior (1) customers.
Dist_SeniorCitizen = df['seniorcitizen'].value_counts(normalize=True).sort_values(ascending=True)
Churn_SeniorCitizen = df.groupby('seniorcitizen')['churn'].mean().sort_values()
SeniorCitizen_Dist_Churn = pd.DataFrame({
    'Senior Citizen': Dist_SeniorCitizen,
    'Churn Rate': Churn_SeniorCitizen,
})

Colors = ["#5B7DB1", "#61A4BC"]

ax = SeniorCitizen_Dist_Churn.plot(kind='bar', color=Colors, figsize=(15, 10))
plt.xlabel(None)
plt.ylabel(None)
plt.title('Churn by Citizen Seniority', fontsize=16, loc='left')

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('Not Senior', 'Senior')
plt.xticks(positions, labels, size=14, rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, [f'{tick:,.0%}' for tick in tick_positions])
plt.legend(prop={"size": 14})

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.savefig('Churn by Citizen Seniority.png', dpi=300, bbox_inches='tight')
plt.show()

##### Churn by Other Services
***

In [None]:
# Churn rate for customers without (0) and with (1) online security.
Churn_OnlineSecurity = df.groupby('onlinesecurity').churn.mean()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = Churn_OnlineSecurity.plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel('Churn Rate', fontsize=13)
plt.title('Churn by Online Security', fontsize=14)

# Single xticks call: an earlier call that only set rotation/size was
# overridden by this one straight away and has been removed.
positions = (0, 1)
labels = ('Online Security Not Included', 'Online Security Included')
plt.xticks(positions, labels, size=14, rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, ['{:,.0%}'.format(tick) for tick in tick_positions])

# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005), size=15, color='#545454')

plt.show()

In [None]:
# Churn rate for customers without (0) and with (1) online backup.
Churn_OnlineBackup = df.groupby('onlinebackup')['churn'].mean()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = Churn_OnlineBackup.plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel('Churn Rate', fontsize=13)
plt.title('Churn by Online Backup', fontsize=14)

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('Online Backup Not Included', 'Online Backup Included')
plt.xticks(positions, labels, size=14, rotation=0)

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.show()

In [None]:
# Churn rate for customers without (0) and with (1) tech support.
Churn_TechSupport = df.groupby('techsupport')['churn'].mean()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = Churn_TechSupport.plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel('Churn Rate', fontsize=13)
plt.title('Churn by Tech Support', fontsize=14)

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('Tech Support Not Included', 'Tech Support Included')
plt.xticks(positions, labels, size=14, rotation=0)

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.show()

In [None]:
# Churn rate for customers without (0) and with (1) phone service.
Churn_PhoneService = df.groupby('phoneservice')['churn'].mean()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = Churn_PhoneService.plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel('Churn Rate', fontsize=13)
plt.title('Churn by Phone Service', fontsize=14)

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('Phone Service Not Included', 'Phone Service Included')
plt.xticks(positions, labels, size=14, rotation=0)

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.show()

In [None]:
# Churn rate for customers without (0) and with (1) multiple lines.
Churn_MultipleLines = df.groupby('multiplelines')['churn'].mean()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = Churn_MultipleLines.plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel('Churn Rate', fontsize=13)
plt.title('Churn by Multiple Lines', fontsize=14)

# Replace the 0/1 tick labels with readable names.
positions = (0, 1)
labels = ('No Multiple Lines', 'Multiple Lines')
plt.xticks(positions, labels, size=14, rotation=0)

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{100 * bar_height:.1f}%', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.show()

##### Average Tenure by Contract Type
***

In [None]:
# Mean tenure per contract type, as a frame with 'contract' and 'tenure' columns.
ContractByTenure = df.groupby(['contract']).tenure.mean().reset_index()

Colors = ["#A2D5AB", "#5B7DB1", "#61A4BC"]

ax = ContractByTenure['tenure'].plot.bar(color=Colors, figsize=(15, 10))

# After reset_index the bars sit at 0, 1, 2; label them with the contract names.
positions = (0, 1, 2)
labels = ('Month-to-month', 'One year', 'Two year')
plt.xticks(positions, labels, size=14, rotation=0)
plt.yticks(size=13)
plt.title('Average Tenure by Contract Type', fontsize=16, loc='left')

# Write each bar's value just above it.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.1f}', (bar.get_x() * 1.005, bar_height * 1.005), size=15, color='#545454')

plt.savefig('Average Tenure by Contract Type.png', dpi=300, bbox_inches='tight')

##### Average Monthly Charge
***

In [None]:
# Summary statistics of monthlycharges per churn flag, highest mean first.
GroupByMonthlyCharges = df.groupby(['churn'])
AverageMonthlyCharge = GroupByMonthlyCharges['monthlycharges'].describe().sort_values(by='mean', ascending=False)

# Tick labels and colours are looked up from the churn flag of each row, so
# they stay correct whichever group ends up first after sorting (they used to
# be hard-coded on the assumption that churned customers have the higher mean).
ChurnNames = {0: 'Active', 1: 'Churn'}
ChurnColors = {0: "#74959A", 1: "#F1E0AC"}
Colors = [ChurnColors[flag] for flag in AverageMonthlyCharge.index]

plt.figure(figsize=(10, 10))
ax = AverageMonthlyCharge['mean'].plot(kind='bar', color=Colors)
plt.xlabel('')
plt.ylabel(None)
plt.title('Average Monthly Charges', fontsize=16, loc='left')
labels = [ChurnNames[flag] for flag in AverageMonthlyCharge.index]
positions = list(range(len(labels)))
plt.xticks(positions, labels, size=14, rotation=0)

# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate('{:.1f}'.format(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005), size=15, color='#545454')

plt.savefig('Average Monthly Charge.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Label each customer 'High' or 'Low' relative to the mean monthly charge of
# their own churn group. The means are computed from the data rather than
# pasted in as constants (74.441332 / 61.265124), so the grouping stays
# correct if the data changes.
# NOTE(review): the threshold depends on the churn flag itself, so the churn
# rates plotted below are shaped by how the groups are defined — confirm this
# is the intended comparison.
ChurnGroupMean = df.groupby('churn')['monthlycharges'].transform('mean')
IsHighCharge = df['monthlycharges'] >= ChurnGroupMean
ChargeKnown = df['monthlycharges'].notna() & ChurnGroupMean.notna()

df['monthlychargegroups'] = (
    IsHighCharge.map({True: 'High Monthly Charge', False: 'Low Monthly Charge'})
    .where(ChargeKnown)  # rows that cannot be compared stay missing
    .astype('string')
)

MonthlyChargesGroups = df.groupby('monthlychargegroups').churn.mean().sort_values()

Colors = ["#5B7DB1", "#61A4BC"]

plt.figure(figsize=(10, 10))
ax = MonthlyChargesGroups.plot(kind='bar', color=Colors)
plt.xlabel(None)
plt.ylabel(None)
plt.title('Churn by Monthly Charge Rate', fontsize=16, loc='left')
plt.xticks(size=14, rotation=0)

# Relabel the existing y ticks as whole percentages.
tick_positions = plt.yticks()[0]
plt.yticks(tick_positions, ['{:,.0%}'.format(tick) for tick in tick_positions])

# Write each bar's value just above it.
for p in ax.patches:
    ax.annotate('{:.1f}%'.format(100 * p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005), size=15, color='#545454')

plt.show()

##### Scatter Churn by Contract types
***

In [None]:
# Monthly vs total charges, coloured by contract, one panel per churn value.
sns.set_style('ticks')

sns.relplot(
    data=df,
    x="monthlycharges",
    y="totalcharges",
    hue="contract",
    col='churn',
)

# Enlarge the figure that relplot just created.
current_figure = plt.gcf()
current_figure.set_size_inches(20, 10)

sns.despine()

In [None]:
# Churn against tenure group as a line plot, one panel (and colour) per contract type.
sns.relplot(
    data=df,
    x="tenuregroup",
    y="churn",
    hue="contract",
    kind="line",
    col='contract',
)
sns.despine()

###

#### Correlation

***

###

In [None]:
# Bind dfcorr to the same DataFrame object as df (no copy is made here).
dfcorr = df

In [None]:
# Build the correlation frame from df without the id, raw tenure,
# total charges and the monthly-charge group label.
excluded_columns = ['customerid', 'tenure', 'totalcharges', 'monthlychargegroups']
dfcorr = df.drop(columns=excluded_columns)

In [None]:
# Show the dtype of every column in dfcorr.
dfcorr.dtypes

In [None]:
# One-hot encode the categorical columns so every column can enter the correlation.
categorical_columns = ['gender', 'tenuregroup', 'internetservice', 'contract', 'paymentmethod']
dfcorr = pd.get_dummies(dfcorr, columns=categorical_columns)

# Move 'churn' to the first column.
churn_column = dfcorr.pop('churn')
dfcorr.insert(0, 'churn', churn_column)
dfcorr.dtypes

In [None]:
# Single-column heatmap: correlation of every feature with churn, strongest positive first.
plt.figure(figsize=(30, 15))

churn_corr = dfcorr.corr()[['churn']].sort_values(by='churn', ascending=False)
heatmap_ax = sns.heatmap(churn_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap_ax.set_title('Correlation Heatmap by Churn', fontdict={'fontsize': 15}, pad=12)

plt.yticks(size=13)
plt.xticks([])
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Lower-triangle heatmap of the full correlation matrix.
plt.figure(figsize=(30, 15))

# Order rows AND columns by correlation with churn. Sorting only the rows
# breaks the matrix's symmetry, so a triangular mask would hide arbitrary
# cells instead of the duplicated upper half.
corr = dfcorr.corr()
order = corr['churn'].sort_values(ascending=True).index
corr = corr.loc[order, order]

# Use the builtin bool: the np.bool alias was removed in NumPy 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr, vmin=-1, vmax=1, annot=True, cmap='BrBG', mask=mask)
plt.yticks(size=12)

# Own file name, so this figure no longer overwrites the churn-only 'heatmap.png'.
plt.savefig('heatmap_full.png', dpi=300, bbox_inches='tight')

plt.show()

###

#### Machine Learning
***

###

In [None]:
# Bind dfml to the same DataFrame object as df (no copy is made here).
dfml = df

In [None]:
# Move the original customer ids into an unnamed index, then add a numeric
# 0..n-1 'customerid' as the first column.
dfml = dfml.set_index('customerid').rename_axis(None, axis=0)
dfml.insert(0, 'customerid', np.arange(len(df)))

In [None]:
# Show the dtype of every column in dfml.
dfml.dtypes

In [None]:
# Remove total charges and the two derived grouping columns from the model frame.
unused_for_model = ['totalcharges', 'tenuregroup', 'monthlychargegroups']
dfml = dfml.drop(columns=unused_for_model)

In [None]:
# One-hot encode the remaining categorical columns.
dfml = pd.get_dummies(dfml,columns = ['gender','internetservice','contract','paymentmethod'])

In [None]:
# Cast every column to float.
dfml = dfml.astype(float)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fraction replaces the hard-coded
# row count (1409), which was only 20% for one specific dataset size; for the
# 1409 / 5634 split described originally the resulting sizes are unchanged.
test_size = 0.2
train, test = train_test_split(dfml, test_size=test_size, random_state=0, shuffle=True)

label = 'churn'
cusid = 'customerid'

# Features exclude the target and the synthetic customer id; the id is kept
# separately so predictions can be matched back to customers.
x_train = train.drop(columns=[label, cusid])
y_train = train[label]
cusid_train = train[cusid]

x_test = test.drop(columns=[label, cusid])
y_test = test[label]
cusid_test = test[cusid]

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow tree (depth 3). random_state is fixed so repeated runs build the
# same tree, in line with the seeded split and random forest.
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(x_train, y_train)
y_test_pred_DecisionTree = clf.predict(x_test)

# Save actual vs predicted churn for every test customer.
output = pd.DataFrame({'CustomerID': cusid_test, 'Churn Fact': y_test, 'Churn Predict By Model': y_test_pred_DecisionTree})
output.to_csv('DecisionTreeClassifier.csv', index=False)

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.metrics import accuracy_score


def plot_tree(tree, features, labels):
    """Display a fitted decision tree inline as an SVG.

    The tree is exported with ``export_graphviz`` (filled nodes), rendered to
    SVG through a graphviz ``Source`` and shown with IPython's ``display``.

    Args:
        tree: Fitted tree estimator to draw.
        features: Feature names, passed as ``feature_names``.
        labels: Class names, passed as ``class_names``.
    """
    dot_source = export_graphviz(tree, feature_names=features, class_names=labels, filled=True)
    svg_image = Source(dot_source).pipe(format='svg')
    display(SVG(svg_image))


plot_tree(clf, x_train.columns, ['Active', 'Churn'])

In [None]:
# Test the Decision Tree

# Depth-5 tree scored on the held-out rows. random_state is fixed so the
# reported accuracy is reproducible from run to run.
clf = DecisionTreeClassifier(max_depth=5, random_state=0)
clf.fit(x_train, y_train)
y_test_pred_DecisionTree = clf.predict(x_test)
test_acc = accuracy_score(y_test, y_test_pred_DecisionTree)
test_acc

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 999 depth-3 trees with a fixed seed, trained on the training split.
model = RandomForestClassifier(n_estimators=999, max_depth=3, random_state=1).fit(x_train, y_train)

y_test_pred_RandomForest = model.predict(x_test)

# Save actual vs predicted churn for every test customer.
output = pd.DataFrame({
    'CustomerID': cusid_test,
    'Churn Fact': y_test,
    'Churn Predict By Model': y_test_pred_RandomForest,
})
output.to_csv('RandomForest_Prediction.csv', index=False)

In [None]:
# Per-feature importance scores of the fitted random forest (displayed as cell output).
feature_importances = model.feature_importances_
feature_importances 

In [None]:
# Pair every feature name with its random-forest importance.
features = x_train.columns
stats = pd.DataFrame({'feature': features, 'importance': feature_importances})

Colors = ["#5B7DB1", "#61A4BC"]

# Ascending order puts the most important feature at the top of the barh chart.
stats_sort = stats.sort_values('importance', ascending=True)
stats_sort.plot(y='importance', x='feature', kind='barh', figsize=(15, 10), color=Colors)
plt.title('Feature Importance of Random Forest')

# Relabel the existing x ticks as whole percentages.
tick_positions = plt.xticks()[0]
plt.xticks(tick_positions, ['{:,.0%}'.format(tick) for tick in tick_positions])

# Own file name: this chart was previously saved as
# 'Churn by Citizen Seniority.png', overwriting the senior-citizen chart.
plt.savefig('Feature Importance of Random Forest.png', dpi=300, bbox_inches='tight')

In [None]:
# Test the Random Forest

# Smaller, deeper forest (16 trees, depth 6) scored on the held-out rows.
model = RandomForestClassifier(n_estimators=16, max_depth=6, random_state=1).fit(x_train, y_train)
y_test_pred_RandomForest = model.predict(x_test)
test_acc = accuracy_score(y_test, y_test_pred_RandomForest)
test_acc

#### K Nearest Neighbors


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier on the unscaled features.
clf = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
y_test_pred_Knn = clf.predict(x_test)

# Save actual vs predicted churn for every test customer, then display the table.
output = pd.DataFrame({
    'CustomerID': cusid_test,
    'Churn Fact': y_test,
    'Churn Predict By Model': y_test_pred_Knn,
})
output.to_csv('Knn_Prediction.csv', index=False)
output

In [None]:
# Test the Knn

# 21-nearest-neighbours classifier on the unscaled features, scored on the held-out rows.
clf = KNeighborsClassifier(n_neighbors=21).fit(x_train, y_train)
y_test_pred_Knn = clf.predict(x_test)
test_acc = accuracy_score(y_test, y_test_pred_Knn)
test_acc

#### Feature Scaling (Standardization) for KNN

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Same 21-neighbour model as before, now trained on the standardized features.
clf = KNeighborsClassifier(n_neighbors=21).fit(x_train_scaled, y_train)
y_test_pred_Knn_Scaled = clf.predict(x_test_scaled)

# Save the unscaled and scaled KNN predictions side by side with the actual labels.
output = pd.DataFrame({
    'CustomerID': cusid_test,
    'Churn Fact': y_test,
    'Churn Predict By Model': y_test_pred_Knn,
    'Churn_predicted_by_model_scaled': y_test_pred_Knn_Scaled,
})
output.to_csv('Knn_Prediction_scaled.csv', index=False)

test_acc = accuracy_score(y_test, y_test_pred_Knn_Scaled)
test_acc

#### Benchmark

In [None]:
# Count how many training rows fall into each churn class.
y_train.value_counts()

In [None]:
def get_benchmark_predictions(x, benchmark_value):
    """Return a constant prediction for every row of *x*.

    Args:
        x: Any sized object; only ``len(x)`` is used.
        benchmark_value: The value to predict for every row.

    Returns:
        A NumPy array of length ``len(x)`` whose entries all equal
        ``benchmark_value``.
    """
    row_count = len(x)
    all_ones = np.ones(row_count)
    return all_ones * benchmark_value

# Benchmark: predict the value 0 for every row of the test set.
benchmark_value = 0

y_test_pred_Benchmark = get_benchmark_predictions(x_test, benchmark_value)
y_test_pred_Benchmark

In [None]:
# Accuracy of the constant benchmark on the test set, for comparison with the models.
test_acc = accuracy_score(y_test, y_test_pred_Benchmark)
test_acc

### Our model results:
Decision Tree best accuracy result: 78.28%<br>
Random Forest best accuracy result: 79.98%<br>
KNN best accuracy result (before standardization): 77.9%<br>
KNN best accuracy result (after standardization): 77.9%<br>