In [None]:
# Start your code here!
import pandas as pd
from scipy import stats

# 1. Load the data
men_df = pd.read_csv('men_results.csv')
women_df = pd.read_csv('women_results.csv')


def _world_cup_matches_since(results, cutoff='2002-01-01'):
    """Return a copy of the FIFA World Cup rows dated on or after *cutoff*.

    The ``date`` column is parsed with ``pd.to_datetime`` for the comparison
    only, so rows are selected chronologically instead of by lexicographic
    string order (which is only correct for zero-padded ISO dates). The
    input frame and its stored ``date`` values are left untouched.
    Unparseable dates raise instead of being silently mis-sorted.
    """
    match_dates = pd.to_datetime(results['date'])
    is_world_cup = results['tournament'] == 'FIFA World Cup'
    on_or_after_cutoff = match_dates >= pd.Timestamp(cutoff)
    return results[is_world_cup & on_or_after_cutoff].copy()


# 2. Filter for FIFA World Cup matches on or after 2002-01-01
men_filtered = _world_cup_matches_since(men_df)
women_filtered = _world_cup_matches_since(women_df)

# Calculate total goals per match (home + away)
men_filtered['total_goals'] = men_filtered['home_score'] + men_filtered['away_score']
women_filtered['total_goals'] = women_filtered['home_score'] + women_filtered['away_score']

# 3. EDA - sample sizes, descriptive statistics and a normality check

# Sample sizes
print(f"Men's matches: {len(men_filtered)}")
print(f"Women's matches: {len(women_filtered)}")
print(f"\nMen's goals - Mean: {men_filtered['total_goals'].mean():.2f}, Std: {men_filtered['total_goals'].std():.2f}")
print(f"Women's goals - Mean: {women_filtered['total_goals'].mean():.2f}, Std: {women_filtered['total_goals'].std():.2f}")


def _report_normality(label, sample):
    """Print the Shapiro-Wilk statistic and p-value for *sample*.

    Missing values are dropped first. The test is skipped for fewer than
    three observations because Shapiro-Wilk is not defined for such samples.
    The output is informational only; it does not change which hypothesis
    test is run afterwards.
    """
    observed = sample.dropna()
    if len(observed) < 3:
        print(f"{label} goals - Shapiro-Wilk skipped (fewer than 3 observations)")
        return
    w_stat, normality_p = stats.shapiro(observed)
    print(f"{label} goals - Shapiro-Wilk W: {w_stat:.4f}, p-value: {normality_p:.4f}")


# Normality check: a small p-value indicates departure from normality.
print()
_report_normality("Men's", men_filtered['total_goals'])
_report_normality("Women's", women_filtered['total_goals'])

# 4. Perform hypothesis test
# Welch's two-sample t-test (unequal variances) on total goals per match.
# Missing totals are dropped first: with ttest_ind's default nan_policy a
# single NaN would propagate into both the statistic and the p-value.
# NOTE(review): this is a two-sided test (SciPy's default alternative). If the
# research question is directional, pass alternative='less'/'greater' - confirm.
# NOTE(review): goals per match are counts; if the samples are clearly
# non-normal, a rank-based test such as stats.mannwhitneyu may be more
# appropriate - confirm against the analysis requirements.
t_stat, p_val = stats.ttest_ind(
    men_filtered['total_goals'].dropna(),
    women_filtered['total_goals'].dropna(),
    equal_var=False,  # Welch's t-test
)

print("\n--- Hypothesis Test Results ---")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_val:.4f}")

# 5. Interpret the result (10% significance level)
alpha = 0.10

# A NaN p-value compares False against alpha, so without this guard an
# unevaluable test would be reported as "fail to reject" / "no difference".
if pd.isna(p_val):
    raise ValueError(
        "p-value is NaN, so the hypothesis test is inconclusive; "
        "check the input samples."
    )

if p_val < alpha:
    result = "reject"
    print(f"\nResult: Reject the null hypothesis (p-value {p_val:.4f} < {alpha})")
    print("There IS a significant difference between men's and women's goal scoring.")
else:
    result = "fail to reject"
    print(f"\nResult: Fail to reject the null hypothesis (p-value {p_val:.4f} >= {alpha})")
    print("There is NO significant difference between men's and women's goal scoring.")

# Store results in required format: the raw p-value and the decision string
result_dict = {"p_val": p_val, "result": result}
print(f"\nFinal result dictionary: {result_dict}")