In [17]:
import pandas as pd 
import numpy as np

In [18]:
np.random.seed(20)
data = {
    'Category' : np.random.choice(['A', 'B', 'C', 'D'], size=20),
    'Region'   : np.random.choice(['North', 'South', 'East', 'West'], size=20),
    'Sales'    : np.random.randint(50, 100, size=20),
    'Profit'   : np.random.randint(10, 100, size=20)
}
data

{'Category': array(['D', 'C', 'D', 'D', 'A', 'C', 'B', 'A', 'D', 'C', 'D', 'C', 'A',
        'C', 'A', 'B', 'C', 'C', 'D', 'D'], dtype='<U1'),
 'Region': array(['North', 'South', 'East', 'East', 'East', 'North', 'South',
        'South', 'West', 'South', 'West', 'East', 'East', 'East', 'South',
        'West', 'East', 'South', 'West', 'East'], dtype='<U5'),
 'Sales': array([61, 63, 69, 80, 96, 82, 60, 93, 56, 61, 68, 53, 63, 67, 93, 66, 68,
        65, 76, 57]),
 'Profit': array([85, 39, 55, 16, 73, 88, 34, 28, 43, 88, 52, 13, 28, 80, 62, 32, 24,
        14, 37, 85])}

In [19]:
df = pd.DataFrame(data)
df

Unnamed: 0,Category,Region,Sales,Profit
0,D,North,61,85
1,C,South,63,39
2,D,East,69,55
3,D,East,80,16
4,A,East,96,73
5,C,North,82,88
6,B,South,60,34
7,A,South,93,28
8,D,West,56,43
9,C,South,61,88


In [20]:
pivot = pd.pivot_table(df, values='Sales', index='Category', columns='Region', aggfunc=['count', 'sum'])
pivot

Unnamed: 0_level_0,count,count,count,count,sum,sum,sum,sum
Region,East,North,South,West,East,North,South,West
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,2.0,,2.0,,159.0,,186.0,
B,,,1.0,1.0,,,60.0,66.0
C,3.0,1.0,3.0,,188.0,82.0,189.0,
D,3.0,1.0,,3.0,206.0,61.0,,200.0


Để có thể xử lý được NaN (vì ví dụ nếu ta có thể g/s ko có dữ liệu là 0) => sử dụng fill_value

In [21]:
pivot = pd.pivot_table(df, values='Sales', index='Category', columns='Region', aggfunc='sum', fill_value=0)
pivot

Region,East,North,South,West
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,159,0,186,0
B,0,0,60,66
C,188,82,189,0
D,206,61,0,200


In [22]:
A = np.array([[1, 2], [3, 4]])
A.sum()

10

In [23]:
df.head()

Unnamed: 0,Category,Region,Sales,Profit
0,D,North,61,85
1,C,South,63,39
2,D,East,69,55
3,D,East,80,16
4,A,East,96,73


In [24]:
contingency_table = df.pivot_table(
    index='Category',
    columns='Region',
    values='Sales',
    fill_value=0,
    aggfunc='count',
    margins=True,
    margins_name='Total'
)
contingency_table

Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2,0,2,0,4
B,0,0,1,1,2
C,3,1,3,0,7
D,3,1,0,3,7
Total,8,2,6,4,20


In [25]:
m1, m2 = contingency_table.shape
m1 -= 1
m2 -= 1
m1, m2

(4, 4)

In [26]:
q = (m1 - 1) * (m2 - 1)

In [27]:
import scipy.stats as stats

In [28]:
X = stats.chi2(q)

In [29]:
contingency_table.to_numpy()

array([[ 2,  0,  2,  0,  4],
       [ 0,  0,  1,  1,  2],
       [ 3,  1,  3,  0,  7],
       [ 3,  1,  0,  3,  7],
       [ 8,  2,  6,  4, 20]], dtype=int64)

In [30]:
contingency_table

Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2,0,2,0,4
B,0,0,1,1,2
C,3,1,3,0,7
D,3,1,0,3,7
Total,8,2,6,4,20


In [31]:
expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])
expected_table

  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1] / contingency_table.iloc[-1, -1])


Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.6,0.4,1.2,0.8,4.0
B,0.8,0.2,0.6,0.4,2.0
C,2.8,0.7,2.1,1.4,7.0
D,2.8,0.7,2.1,1.4,7.0
Total,8.0,2.0,6.0,4.0,20.0


In [32]:
type(contingency_table.iloc[:, -1])

pandas.core.series.Series

In [33]:
type(contingency_table.iloc[-1, :])
contingency_table.iloc[-1, :]

Region
East      8
North     2
South     6
West      4
Total    20
Name: Total, dtype: int64

default : columns

In [34]:
contingency_table.iloc[-1, -1]

20

In [35]:
expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]
expected_table

  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]
  expected_table = contingency_table.apply(lambda x: x[-1] * contingency_table.iloc[:, -1]) / contingency_table.iloc[-1, -1]


Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.6,0.4,1.2,0.8,4.0
B,0.8,0.2,0.6,0.4,2.0
C,2.8,0.7,2.1,1.4,7.0
D,2.8,0.7,2.1,1.4,7.0
Total,8.0,2.0,6.0,4.0,20.0


In [36]:
expected_table

Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,1.6,0.4,1.2,0.8,4.0
B,0.8,0.2,0.6,0.4,2.0
C,2.8,0.7,2.1,1.4,7.0
D,2.8,0.7,2.1,1.4,7.0
Total,8.0,2.0,6.0,4.0,20.0


In [37]:
contingency_table

Region,East,North,South,West,Total
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,2,0,2,0,4
B,0,0,1,1,2
C,3,1,3,0,7
D,3,1,0,3,7
Total,8,2,6,4,20


matrixA / matrixB is likely applied as a element-wise operator

In [38]:
A = np.array([[1, 2], [3, 4]]).astype(float)
B = np.array([[2, 2], [2, 2]])
A / B

array([[0.5, 1. ],
       [1.5, 2. ]])

In [39]:
chi_square_value = ((contingency_table - expected_table)**2).sum().sum()
chi_square_value

12.8

In [40]:
X

<scipy.stats._distn_infrastructure.rv_continuous_frozen at 0x1cdfed698e0>

In [41]:
p_value = 1 - X.cdf(chi_square_value)

In [42]:
chi_square_value, p_value

(12.8, 0.17186683746684395)

In [43]:
df

Unnamed: 0,Category,Region,Sales,Profit
0,D,North,61,85
1,C,South,63,39
2,D,East,69,55
3,D,East,80,16
4,A,East,96,73
5,C,North,82,88
6,B,South,60,34
7,A,South,93,28
8,D,West,56,43
9,C,South,61,88


In [44]:
def chi2_contingency_analysis(cat1, cat2):
    if not isinstance(df[cat1].dtype, pd.CategoricalDtype):
        df[cat1] = df[cat1].astype('category')

    if not isinstance(df[cat2].dtype, pd.CategoricalDtype):
        df[cat2] = df[cat2].astype('category')
        
    contingency_table = pd.pivot_table(
        df,
        values='Sales',
        index='Category',
        columns='Region',
        fill_value=0
    )        

In [45]:
import itertools

In [46]:
for x, y in itertools.combinations([1, 2, 3, 4], 2):
    print(x, y)

1 2
1 3
1 4
2 3
2 4
3 4
