# Hypothesis Testing using Chi-Square

In [33]:
#import libraries
from scipy.stats import chi2
import pandas as pd
import numpy as np

In [34]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [35]:
%cd /content/gdrive/

/content/gdrive/MyDrive/Dibimbing/Dataset


In [36]:
#import dataset
df = pd.read_csv('Telco Churn.csv',encoding ='latin')

In [37]:
#drop unused column
df.drop(columns=['customerID','gender','SeniorCitizen','Partner','Dependents','tenure','PhoneService','MultipleLines','StreamingTV','StreamingMovies',
                 'Contract','PaperlessBilling','PaymentMethod','MonthlyCharges','TotalCharges'], inplace=True)
df.dropna(inplace=True)
df

Unnamed: 0,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Churn
0,DSL,No,Yes,No,No,No
1,DSL,Yes,No,Yes,No,No
2,DSL,Yes,Yes,No,No,Yes
3,DSL,Yes,No,Yes,Yes,No
4,Fiber optic,No,No,No,No,Yes
...,...,...,...,...,...,...
7038,DSL,Yes,No,Yes,Yes,No
7039,Fiber optic,No,Yes,Yes,No,No
7040,DSL,Yes,No,No,No,No
7041,Fiber optic,No,No,No,No,Yes


In [38]:
#categorical data encoding
df['Churn']= df['Churn'].apply(lambda x: 1 if x=="Yes" else 0).astype(float)
df['OnlineSecurity']= df['OnlineSecurity'].apply(lambda x: 1 if x=="Yes" else 0).astype(float)
df['OnlineBackup']= df['OnlineBackup'].apply(lambda x: 1 if x=="Yes" else 0).astype(float)
df['DeviceProtection']= df['DeviceProtection'].apply(lambda x: 1 if x=="Yes" else 0).astype(float)
df['TechSupport']= df['TechSupport'].apply(lambda x: 1 if x=="Yes" else 0).astype(float)
df['InternetService']= df['InternetService'].apply(lambda x: 0 if x=="No" else 1).astype(float)

In [39]:
#feature engineering of service plan (accumulation of consumers additional subscription)
df['ServicePlan'] = df['OnlineSecurity'] +  df['OnlineBackup'] + df['DeviceProtection'] + df['TechSupport']
df

Unnamed: 0,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Churn,ServicePlan
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,1.0,0.0,0.0,2.0
2,1.0,1.0,1.0,0.0,0.0,1.0,2.0
3,1.0,1.0,0.0,1.0,1.0,0.0,3.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
7038,1.0,1.0,0.0,1.0,1.0,0.0,3.0
7039,1.0,0.0,1.0,1.0,0.0,0.0,2.0
7040,1.0,1.0,0.0,0.0,0.0,0.0,1.0
7041,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [40]:
#contigency table of Service Plan and Churn
df_cont = pd.crosstab(df['ServicePlan'], df['Churn'],margins=True)
df_cont

Churn,0.0,1.0,All
ServicePlan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,1962,831,2793
1.0,897,570,1467
2.0,1046,326,1372
3.0,824,117,941
4.0,445,25,470
All,5174,1869,7043


In [41]:
#Filter data with users who do multiple subscriptions
df_multiple = df[(df['ServicePlan'] != 0.0)]
df_multiple

Unnamed: 0,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,Churn,ServicePlan
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,1.0,0.0,0.0,2.0
2,1.0,1.0,1.0,0.0,0.0,1.0,2.0
3,1.0,1.0,0.0,1.0,1.0,0.0,3.0
5,1.0,0.0,0.0,1.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...
7036,1.0,0.0,1.0,1.0,1.0,0.0,3.0
7038,1.0,1.0,0.0,1.0,1.0,0.0,3.0
7039,1.0,0.0,1.0,1.0,0.0,0.0,2.0
7040,1.0,1.0,0.0,0.0,0.0,0.0,1.0


Chi-Square interpretation:

*   If chi_square_statistic >= critical_value: H0 will be rejected and there is a relationship between two variables.
*   If p_value <= alpha: H0 will be rejected and there is a relationship between two variables.



In [42]:
#Hypothesis testing using Chi Square for Multiple Service Plan

df_cont=pd.crosstab(df_multiple["ServicePlan"],df_multiple["Churn"])
print('df_cont :-\n',df_cont)

#Observed Values
Observed_Values = df_cont.values 
print("Observed Values :-\n",Observed_Values)

#Expected Values
b=stats.chi2_contingency(df_cont)
Expected_Values = b[3]
print("Expected Values :-\n",Expected_Values)

no_of_rows=len(df_cont.iloc[0:2,0])
no_of_columns=len(df_cont.iloc[0,0:2])
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",ddof)

#significance value
alpha = 0.05

chi_square=sum([(o-e)**2./e for o,e in zip(Observed_Values,Expected_Values)])
chi_square_statistic=chi_square[0]+chi_square[1]
print("chi-square statistic:-",chi_square_statistic)

#critical value
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)


#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)


print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('chi-square statistic:',chi_square_statistic)
print('critical_value:',critical_value)
print('p-value:',p_value)

df_cont :-
 Churn         0.0  1.0
ServicePlan           
1.0           897  570
2.0          1046  326
3.0           824  117
4.0           445   25
Observed Values :-
 [[ 897  570]
 [1046  326]
 [ 824  117]
 [ 445   25]]
Expected Values :-
 [[1108.70682353  358.29317647]
 [1036.90917647  335.09082353]
 [ 711.17458824  229.82541176]
 [ 355.20941176  114.79058824]]
Degree of Freedom:- 1
chi-square statistic:- 332.06419956738944
critical_value: 3.841458820694124
p-value: 0.0
Significance level:  0.05
Degree of Freedom:  1
chi-square statistic: 332.06419956738944
critical_value: 3.841458820694124
p-value: 0.0
