In [1]:
# Hypothesis Testing: Two-Sample T-Test for Gender-Based Spending Behavior

# Dataset: Mall_Customers.csv
# The 'Genre' column contains either 'Male' or 'Female'.
# The 'Spending Score (1-100)' column represents customer spending behavior.

# Objective:
# Perform a two-sample t-test to determine if there is a statistically significant difference
# in spending scores between Male and Female customers.

# Null Hypothesis (H0): There is no difference in mean spending scores between genders.
# Alternative Hypothesis (H1): There is a difference in mean spending scores between genders.

In [2]:
# Import necessary libraries for data analysis and statistical testing
import pandas as pd  # For loading and processing the dataset
from scipy import stats  # For performing statistical hypothesis tests

# These libraries will help us conduct a one-sample t-test on the spending scores in the dataset.

In [3]:
# Load the dataset into a Pandas DataFrame
data = pd.read_csv('C:/Users/dbda.STUDENTSDC/Music/LabPractice/Notebooks/Datasets/Mall_Customers.csv')

# Display the first few rows to inspect the data structure
print(data.head())

   CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


In [4]:
# Display the first few rows of the dataset to inspect its structure
data.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [5]:
# Display dataset information, including column names, data types, and missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [6]:
# Extract the 'Age' column from the dataset
age_data = data['Age']

# Display the extracted age data
print(age_data)

0      19
1      21
2      20
3      23
4      31
       ..
195    35
196    45
197    32
198    32
199    30
Name: Age, Length: 200, dtype: int64


In [7]:
# Define the hypothesized population average age for statistical testing
pop_avg_age = 40

# This value will be used as the benchmark for a one-sample t-test
# to determine if the sample mean age significantly differs from 40.

In [8]:
# Perform a one-sample t-test to compare the sample mean age to the hypothesized population mean
t_statistic, p_value = stats.ttest_1samp(age_data, pop_avg_age)

# Print the test results for interpretation
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Decision criteria:
# - If p_value ≤ 0.05: Reject H0 (significant difference in average age)
# - If p_value > 0.05: Fail to reject H0 (no significant evidence of difference)
if p_value <= 0.05:
    print("Reject H0: The average age is significantly different from 40.")
else:
    print("Fail to reject H0: No significant evidence that the average age differs from 40.")

T-Statistic: -1.1642528048895828
P-Value: 0.24571546680949305
Fail to reject H0: No significant evidence that the average age differs from 40.


In [9]:
# Compute degrees of freedom for the one-sample t-test
# Degrees of freedom (df) is calculated as (sample size - 1)
degrees_of_freedom = len(age_data) - 1

# Print the computed value for verification
print("Degrees of Freedom:", degrees_of_freedom)

Degrees of Freedom: 199


In [10]:
# Print the results of the one-sample t-test for age comparison
print("T-statistic: ", t_statistic)  # The computed test statistic
print("P-value: ", p_value)  # The probability of observing the result under H0
print("Degrees of freedom: ", degrees_of_freedom)  # df = sample size - 1

T-statistic:  -1.1642528048895828
P-value:  0.24571546680949305
Degrees of freedom:  199


In [11]:
# Set the significance level (alpha) for hypothesis testing
alpha = 0.025  # Defines the threshold for rejecting the null hypothesis

# Evaluate the hypothesis test result
if p_value < alpha:
    print("The null hypothesis (mean age = 40) is rejected.")
else:
    print("The null hypothesis (mean age = 40) cannot be rejected.")

The null hypothesis (mean age = 40) cannot be rejected.


In [12]:
# Compute the critical t-value for the one-sample t-test
# - The critical value helps determine the rejection region for the hypothesis test
# - stats.t.ppf(1 - alpha, df) calculates the one-tailed critical value
critical_value = stats.t.ppf(1 - alpha, degrees_of_freedom)

# Print the computed critical t-value for reference
print("Critical value: ", critical_value)

Critical value:  1.971956544249395


In [13]:
# Compare the absolute value of the t-statistic with the critical value
# - If t_statistic is within the threshold, we fail to reject H0
# - Otherwise, we reject H0, indicating a significant difference in mean age

if abs(t_statistic) < critical_value:
    print("The null hypothesis (mean age = 40) cannot be rejected.")
else:
    print("The null hypothesis (mean age = 40) is rejected.")

The null hypothesis (mean age = 40) cannot be rejected.
