In [1]:
import pandas as pd
import scipy.stats as stats

# Load dataset
data = pd.read_csv("data/train.csv")  # Replace with your dataset's file name

# Define target variable
target_variable = 'Premium Amount'

# Handle missing values with a forward fill (simple placeholder strategy; adjust as needed).
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated
# in pandas 2.x.
# NOTE(review): this also forward-fills the target column, so the dropna() below only
# removes rows whose target is still missing afterwards (leading gaps or values that
# fail numeric coercion). Confirm that imputing the target this way is intended.
data = data.ffill()

# Identify categorical columns. The target is excluded explicitly: if it was read as
# text it would otherwise be picked up here and later tested against itself.
categorical_columns = [
    col
    for col in data.select_dtypes(include=['object', 'category']).columns
    if col != target_variable
]

# Ensure the target variable is numerical. is_numeric_dtype recognises every numeric
# dtype (int32, float32, nullable Int64, ...) rather than only float64/int64.
if not pd.api.types.is_numeric_dtype(data[target_variable]):
    # Unparseable entries become NaN and are dropped just below.
    data[target_variable] = pd.to_numeric(data[target_variable], errors='coerce')

# Remove rows with missing target values
data = data.dropna(subset=[target_variable])

# Run a one-way ANOVA of the target against each categorical column.
# Each column produces one result row; failures are recorded in the row instead of
# aborting the whole loop.
ALPHA = 0.05  # p-value threshold used to label a result as significant

anova_results = []
for cat_col in categorical_columns:
    try:
        # Select the target column before iterating so each group is a Series rather
        # than a full sub-DataFrame. observed=True keeps unobserved levels of
        # 'category' columns from producing empty groups.
        groups = [
            values.dropna()
            for _, values in data.groupby(cat_col, observed=True)[target_variable]
        ]
        # Drop groups left empty after removing missing targets; an empty sample
        # makes the F-test undefined.
        groups = [values for values in groups if len(values) > 0]

        if len(groups) < 2:
            # Caught below and reported in the result row for this column.
            raise ValueError("ANOVA requires at least two non-empty groups")

        # Perform ANOVA
        f_stat, p_value = stats.f_oneway(*groups)

        # Determine significance. A NaN p-value (degenerate input) is reported as
        # undetermined rather than being silently labelled "Not Significant".
        if pd.isna(p_value):
            significance = "Undetermined"
        elif p_value < ALPHA:
            significance = "Significant"
        else:
            significance = "Not Significant"

        # Append results
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': f_stat,
            'P-Value': p_value,
            'Significance': significance
        })
    except Exception as e:
        # Best-effort: keep going and surface the reason in the results table.
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': None,
            'P-Value': None,
            'Significance': f"Error: {str(e)}"
        })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

   Categorical Variable  F-Statistic       P-Value     Significance
0                Gender     0.031099  8.600207e-01  Not Significant
1        Marital Status     0.337613  7.134715e-01  Not Significant
2       Education Level     1.145105  3.292494e-01  Not Significant
3            Occupation     0.110829  8.950920e-01  Not Significant
4              Location     0.676414  5.084371e-01  Not Significant
5           Policy Type     0.470910  6.244336e-01  Not Significant
6     Policy Start Date     1.078124  1.524729e-92      Significant
7     Customer Feedback     3.038610  4.790182e-02      Significant
8        Smoking Status     0.031783  8.585039e-01  Not Significant
9    Exercise Frequency     0.483057  6.940501e-01  Not Significant
10        Property Type     1.051181  3.495250e-01  Not Significant


In [2]:
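# The ANOVA results flag Policy Start Date and Customer Feedback as significant (p < 0.05) for the Premium Amount.
# Caveat: the F-statistic for Policy Start Date is only about 1.08, so its extremely small p-value may reflect a
# very large number of distinct date groups rather than a meaningful effect (to be verified), and Customer
# Feedback is borderline (p ≈ 0.048).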

In [3]:
# Show only the names of the categorical variables that were tested.
anova_df.loc[:, "Categorical Variable"]

0                 Gender
1         Marital Status
2        Education Level
3             Occupation
4               Location
5            Policy Type
6      Policy Start Date
7      Customer Feedback
8         Smoking Status
9     Exercise Frequency
10         Property Type
Name: Categorical Variable, dtype: object