# HYPOTHESIS TESTING CASE STUDY

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

In [2]:
df=pd.read_csv('cust_seg.csv',sep=',')
df.head()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [3]:
#1.Card has been improved significantly from last year usage which is 50.

# One simple t-test

In [4]:
stats.ttest_1samp(a=df.Latest_mon_usage,
                 popmean=50)

Ttest_1sampResult(statistic=16.57233752433133, pvalue=2.4963719280931583e-39)

In [5]:
df.Latest_mon_usage.mean()

63.17400000000001

In [6]:
#2.The last campaign was successful in terms usage of credit card.

In [7]:
#when we are comparing 2 variables like before and after
#(bcoz obs. are sample but there values are representing change in periods)
#then we apply paired sample t-test.

# Two Sample T-Test (Paired)

In [8]:
print(df.pre_usage.mean())
print(df.Post_usage_1month.mean())
print(df.post_usage_2ndmonth.mean())

52.23
52.775
58.05250000000003


In [9]:
stats.ttest_rel(a=df.pre_usage,
               b=df.Post_usage_1month)

Ttest_relResult(statistic=-0.8673065458794775, pvalue=0.3868186820914985)

In [10]:
stats.ttest_rel(a=df.pre_usage,
               b=df.post_usage_2ndmonth)

Ttest_relResult(statistic=-8.866832246938742, pvalue=4.295733828012836e-16)

In [11]:
#3.Is there any difference between male & female in terms of credit usage?

In [12]:
# we have samples which are mutually exclusive(diff. from each other).Apply independent t-test.

# Two sample T-Test (Independent)

In [13]:
Male_spent=df.Post_usage_1month[df.sex==0]
Male_spent.head()

0    52
2    33
3    44
4    52
5    52
Name: Post_usage_1month, dtype: int64

In [14]:
Female_spent=df.Post_usage_1month[df.sex==1]
Female_spent.head()

1     59
92    62
93    44
94    44
95    62
Name: Post_usage_1month, dtype: int64

In [15]:
Male_spent.mean()

50.120879120879124

In [16]:
Female_spent.mean()

54.99082568807339

In [17]:
stats.ttest_ind(a=Male_spent,
               b=Female_spent,equal_var=True)

Ttest_indResult(statistic=-3.7340738531536797, pvalue=0.00024625461203549315)

In [18]:
stats.ttest_ind(a=Male_spent,
               b=Female_spent,equal_var=False)

Ttest_indResult(statistic=-3.6564080478875276, pvalue=0.00034088493594266187)

In [19]:
#4.Is there any difference between segment of customer in terms of credit usage?

In [20]:
df.segment.value_counts()

2    105
3     50
1     45
Name: segment, dtype: int64

In [21]:
s1=df.Latest_mon_usage[df.segment==1]
s2=df.Latest_mon_usage[df.segment==2]
s3=df.Latest_mon_usage[df.segment==3]

In [22]:
s1.head()

0     49.2
2     64.8
6     50.4
8     64.8
15    51.6
Name: Latest_mon_usage, dtype: float64

In [23]:
s2.head()

4     68.4
5     61.2
7     54.0
9     62.4
11    61.2
Name: Latest_mon_usage, dtype: float64

In [24]:
s3.head()

1     63.6
3     56.4
10    61.2
21    90.0
24    54.0
Name: Latest_mon_usage, dtype: float64

In [25]:
print(s1.mean())
print(s2.mean())
print(s3.mean())

60.026666666666685
68.08000000000003
55.703999999999986


In [26]:
#bcoz there is more group than 2 so can't apply independent t-test.
#Now we apply ANOVA test.

In [27]:
stats.f_oneway(s1,s2,s3)

F_onewayResult(statistic=29.279283801321778, pvalue=7.36401083352674e-12)

In [28]:
df.dtypes

custid                   int64
sex                      int64
AqChannel                int64
region                   int64
Marital_status           int64
segment                  int64
pre_usage                int64
Post_usage_1month        int64
Latest_mon_usage       float64
post_usage_2ndmonth    float64
dtype: object

In [29]:
#5.Is there any relation b/w region & segment?

In [31]:
#Observed Frequency
t1=pd.crosstab(df.region,df.segment,margins=True)
t1.head()

segment,1,2,3,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,19,12,47
2,20,44,31,95
3,9,42,7,58
All,45,105,50,200


In [32]:
#Expected Frequency
stats.chi2_contingency(observed=t1)

(16.604441649489342,
 0.055282939487992254,
 9,
 array([[ 10.575,  24.675,  11.75 ,  47.   ],
        [ 21.375,  49.875,  23.75 ,  95.   ],
        [ 13.05 ,  30.45 ,  14.5  ,  58.   ],
        [ 45.   , 105.   ,  50.   , 200.   ]]))

In [33]:
stats.stats.pearsonr(df.region,df.segment)

(0.017332984779166463, 0.8075350000773025)

In [34]:
#6.Is the relationship b/w card usage in the latest month & pre usage of compaign?

In [35]:
# in scipy.stats
stats.stats.pearsonr(df.Latest_mon_usage,df.Post_usage_1month) 

(0.6174492644854919, 2.0866647416871388e-22)