In [2]:
import pandas as pd
import numpy as np

# Lookup from single-letter category code to its label. Each code maps to
# exactly one label, so in this frame 'Value' is fully determined by 'Category'.
category_mapping = {
    'a': 'Apple',
    'b': 'Bat',
    'c': 'Cat',
    'd': 'Dog',
    'e': 'Elephant'
}

# Seed NumPy's legacy global RNG so the random draws are reproducible on
# Restart & Run All. Every np.random.choice call in this notebook uses this
# global generator, so seeding once here fixes the whole run when the cells
# are executed in order.
SEED = 42
np.random.seed(SEED)

num_rows = 1000
categories = np.random.choice(list(category_mapping.keys()), num_rows)

# Derive each row's label from its category code (deterministic mapping).
values = [category_mapping[cat] for cat in categories]

df = pd.DataFrame({'Category': categories, 'Value': values})

df.head()

Unnamed: 0,Category,Value
0,e,Elephant
1,d,Dog
2,c,Cat
3,c,Cat
4,c,Cat


The chi-square test of independence checks whether two categorical variables are associated. For a contingency table, the chi-square statistic is computed as:

![image-2.png](attachment:image-2.png)

Where:
- $\chi^2$ is the chi-square statistic,
- $O_{ij}$ is the observed frequency in cell $(i, j)$ of the contingency table,
- $E_{ij}$ is the expected frequency in cell $(i, j)$ under the assumption that the null hypothesis is true, i.e. $E_{ij} = \frac{R_i \, C_j}{N}$, where $R_i$ is the total of row $i$, $C_j$ is the total of column $j$, and $N$ is the grand total.

$$ \chi^2 = \sum_{i} \sum_{j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}} $$


* Null Hypothesis (H0): The two categorical variables are independent (there is no association between them).

* Alternative Hypothesis (H1): The two categorical variables are not independent (there is an association between them).

In [4]:
from scipy.stats import chi2_contingency
# Observed counts: one row per Category, one column per Value.
contingency_table = pd.crosstab(index=df['Category'], columns=df['Value'])
print(contingency_table)

# Run the chi-square test on the observed counts, then unpack the result into
# the statistic, the p-value, the degrees of freedom and the expected counts.
test_result = chi2_contingency(contingency_table)
chi2, p, dof, expected = test_result

(chi2, p, dof, expected)

Value     Apple  Bat  Cat  Dog  Elephant
Category                                
a           193    0    0    0         0
b             0  201    0    0         0
c             0    0  198    0         0
d             0    0    0  210         0
e             0    0    0    0       198


(3999.999999999999,
 0.0,
 16,
 array([[37.249, 38.793, 38.214, 40.53 , 38.214],
        [38.793, 40.401, 39.798, 42.21 , 39.798],
        [38.214, 39.798, 39.204, 41.58 , 39.204],
        [40.53 , 42.21 , 41.58 , 44.1  , 41.58 ],
        [38.214, 39.798, 39.204, 41.58 , 39.204]]))

In [14]:
# Candidate category codes and labels; the two columns are sampled separately.
categories = list('abcde')
values = ['Apple', 'Bat', 'Cat', 'Dog', 'Elephant']

num_rows = 1000

# Sample Category first, then Value, each with its own call to np.random.choice,
# so the two columns are drawn independently of one another.
random_categories = np.random.choice(categories, size=num_rows)
random_values = np.random.choice(values, size=num_rows)

# Assemble the two sampled columns into one frame and preview it.
df_random_both = pd.DataFrame(dict(Category=random_categories, Value=random_values))

df_random_both.head()

Unnamed: 0,Category,Value
0,c,Cat
1,a,Apple
2,a,Apple
3,a,Bat
4,b,Elephant


In [15]:
# Observed counts for the independently sampled frame (rows: Category, columns: Value).
contingency_table = pd.crosstab(
    index=df_random_both['Category'],
    columns=df_random_both['Value'],
)
print(contingency_table)

# Chi-square test on the observed counts; unpack statistic, p-value,
# degrees of freedom and expected counts.
test_result = chi2_contingency(contingency_table)
chi2, p, dof, expected = test_result

(chi2, p, dof, expected)

Value     Apple  Bat  Cat  Dog  Elephant
Category                                
a            37   39   44   41        32
b            35   42   43   47        39
c            45   39   36   40        37
d            37   37   32   41        48
e            38   38   43   48        42


(8.77533666954696,
 0.9223762213956626,
 16,
 array([[37.056, 37.635, 38.214, 41.881, 38.214],
        [39.552, 40.17 , 40.788, 44.702, 40.788],
        [37.824, 38.415, 39.006, 42.749, 39.006],
        [37.44 , 38.025, 38.61 , 42.315, 38.61 ],
        [40.128, 40.755, 41.382, 45.353, 41.382]]))

In [16]:
# Significance level for the decision; `p` is the p-value from the most
# recently executed chi-square cell.
alpha = 0.05

decision = (
    "Reject the null hypothesis (H0): There is a significant association between the variables."
    if p < alpha
    else "Fail to reject the null hypothesis (H0): There is no significant association between the variables."
)
print(decision)

Fail to reject the null hypothesis (H0): There is no significant association between the variables.


In [18]:
# Codes 'a' and 'b' always receive a fixed label; every other code receives a
# label drawn at random from `random_values`.
categories = list('abcde')
specific_values = dict(a='Apple', b='Bat')
random_values = ['Cat', 'Dog', 'Elephant']

num_rows = 1000

# Sample the category column first.
random_categories = np.random.choice(categories, size=num_rows)

# Walk the sampled categories in order: fixed label where one is defined,
# otherwise one fresh random draw for that row.
values_assigned = [
    specific_values[code] if code in specific_values else np.random.choice(random_values)
    for code in random_categories
]

df_random_both = pd.DataFrame({'Category': random_categories, 'Value': values_assigned})

# Preview the first rows.
print(df_random_both.head())

  Category     Value
0        c       Dog
1        c       Cat
2        d  Elephant
3        a     Apple
4        a     Apple


In [19]:
# Observed counts for the partly deterministic frame (rows: Category, columns: Value).
category_col = df_random_both['Category']
value_col = df_random_both['Value']
contingency_table = pd.crosstab(category_col, value_col)
print(contingency_table)

# Chi-square test on the observed counts; unpack statistic, p-value,
# degrees of freedom and expected counts.
test_result = chi2_contingency(contingency_table)
chi2, p, dof, expected = test_result

(chi2, p, dof, expected)

Value     Apple  Bat  Cat  Dog  Elephant
Category                                
a           196    0    0    0         0
b             0  211    0    0         0
c             0    0   64   65        67
d             0    0   71   49        75
e             0    0   64   72        66


(2009.3516997273196,
 0.0,
 16,
 array([[38.416, 41.356, 39.004, 36.456, 40.768],
        [41.356, 44.521, 41.989, 39.246, 43.888],
        [38.416, 41.356, 39.004, 36.456, 40.768],
        [38.22 , 41.145, 38.805, 36.27 , 40.56 ],
        [39.592, 42.622, 40.198, 37.572, 42.016]]))

In [21]:
import scipy.stats as stats

categories = ['a', 'b', 'c', 'd', 'e']
values = ['Apple', 'Bat', 'Cat', 'Dog', 'Elephant']

# Number of rows
num_rows = 1000

# Draw Category and Value with separate calls, i.e. independently of each other
random_categories = np.random.choice(categories, num_rows)
random_values = np.random.choice(values, num_rows)

# Create the DataFrame
df_random_both = pd.DataFrame({'Category': random_categories, 'Value': random_values})

# Keep only the rows whose Category is 'a' or 'b'
df_ab = df_random_both[df_random_both['Category'].isin(['a', 'b'])]

# Observed counts: one row per remaining category, one column per value present
contingency_table_ab = pd.crosstab(df_ab['Category'], df_ab['Value'])

# Chi-square test on the reduced table: does the distribution of Value differ
# between the 'a' rows and the 'b' rows?
chi2_ab, p_ab, dof_ab, expected_ab = stats.chi2_contingency(contingency_table_ab)

# Print the results
print("Chi-square test for 'a' vs 'b'")
print(f"Chi-square Statistic: {chi2_ab}")
print(f"P-value: {p_ab}")
print(f"Degrees of Freedom: {dof_ab}")
print("Expected Frequencies:")
print(expected_ab)

# Interpret the result. 'a' and 'b' are two levels of the same variable, so the
# conclusion is stated in terms of Value's distribution across those two
# categories, not as an "association between 'a' and 'b'".
alpha = 0.05
if p_ab < alpha:
    conclusion = (
        "Reject the null hypothesis (H0): The distribution of Value differs "
        "significantly between categories 'a' and 'b'."
    )
else:
    conclusion = (
        "Fail to reject the null hypothesis (H0): There is no significant difference "
        "in the distribution of Value between categories 'a' and 'b'."
    )
print(conclusion)

Chi-square test for 'a' vs 'b'
Chi-square Statistic: 4.970006618342654
P-value: 0.2903888722650252
Degrees of Freedom: 4
Expected Frequencies:
[[40.976      33.096      40.45066667 44.65333333 37.824     ]
 [37.024      29.904      36.54933333 40.34666667 34.176     ]]
Fail to reject the null hypothesis (H0): There is no significant association between 'a' and 'b'.


In [22]:
# Display the observed counts (Category 'a'/'b' by Value) that were passed to
# the chi-square test in the previous cell.
contingency_table_ab

Value,Apple,Bat,Cat,Dog,Elephant
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,37,27,43,49,41
b,41,36,34,36,31
