In [1]:
import pandas as pd
import scipy.stats as stats

# Load dataset
data = pd.read_csv("data/train.csv")  # Replace with your dataset's file name

# Handle missing values
data = data.fillna(method="ffill")  # Forward fill for simplicity; adjust as needed

# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Define target variable
target_variable = 'Premium Amount'

# Ensure the target variable is numerical
if data[target_variable].dtype != 'float64' and data[target_variable].dtype != 'int64':
    data[target_variable] = pd.to_numeric(data[target_variable], errors='coerce')

# Remove rows with missing target values
data = data.dropna(subset=[target_variable])

# Run ANOVA for each categorical column
anova_results = []
for cat_col in categorical_columns:
    try:
        # Group data by the categorical column
        groups = [group[target_variable].dropna() for _, group in data.groupby(cat_col)]
        
        # Perform ANOVA
        f_stat, p_value = stats.f_oneway(*groups)
        
        # Determine significance
        significance = "Significant" if p_value < 0.05 else "Not Significant"
        
        # Append results
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': f_stat,
            'P-Value': p_value,
            'Significance': significance
        })
    except Exception as e:
        # Handle cases where ANOVA cannot be computed
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': None,
            'P-Value': None,
            'Significance': f"Error: {str(e)}"
        })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

# Display results
# import ace_tools as tools; tools.display_dataframe_to_user(name="ANOVA Results for Predicting Premium Amount", dataframe=anova_df)

  data = data.fillna(method="ffill")  # Forward fill for simplicity; adjust as needed


   Categorical Variable  F-Statistic       P-Value     Significance
0                Gender     0.031099  8.600207e-01  Not Significant
1        Marital Status     0.337613  7.134715e-01  Not Significant
2       Education Level     1.145105  3.292494e-01  Not Significant
3            Occupation     0.110829  8.950920e-01  Not Significant
4              Location     0.676414  5.084371e-01  Not Significant
5           Policy Type     0.470910  6.244336e-01  Not Significant
6     Policy Start Date     1.078124  1.524729e-92      Significant
7     Customer Feedback     3.038610  4.790182e-02      Significant
8        Smoking Status     0.031783  8.585039e-01  Not Significant
9    Exercise Frequency     0.483057  6.940501e-01  Not Significant
10        Property Type     1.051181  3.495250e-01  Not Significant


In [2]:
anova_df["Categorical Variable"]

0                 Gender
1         Marital Status
2        Education Level
3             Occupation
4               Location
5            Policy Type
6      Policy Start Date
7      Customer Feedback
8         Smoking Status
9     Exercise Frequency
10         Property Type
Name: Categorical Variable, dtype: object

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardize the continuous variables
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit KMeans with a predefined number of clusters
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10, max_iter=300)
cluster_labels = kmeans.fit_predict(scaled_data)

# Assign cluster labels to the DataFrame
data['Cluster_Label'] = cluster_labels

ValueError: could not convert string to float: 'Female'

In [3]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.stats import f_oneway

# Load the dataset (replace with your actual DataFrame)
# df = pd.read_csv("your_dataset.csv")

# Identify continuous variables
continuous_columns = ['Age', 'Annual Income', 'Number of Dependents', 
                      'Health Score', 'Previous Claims', 'Vehicle Age', 
                      'Credit Score', 'Insurance Duration']

continuous_data = data[continuous_columns]

# Standardize the continuous variables
scaler = StandardScaler()
scaled_data = scaler.fit_transform(continuous_data)

# Determine the optimal number of clusters using BIC
bic_scores = []
n_components_range = range(2, 11)

for n in n_components_range:
    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(scaled_data)
    bic_scores.append(gmm.bic(scaled_data))

# Optimal number of clusters
optimal_clusters = n_components_range[np.argmin(bic_scores)]
print(f"Optimal number of clusters: {optimal_clusters}")

# Fit GMM with the optimal number of clusters
gmm = GaussianMixture(n_components=optimal_clusters, random_state=42)
gmm.fit(scaled_data)

# Assign cluster labels to the DataFrame
data['Cluster_Label'] = gmm.predict(scaled_data)

Optimal number of clusters: 10


In [4]:
# Run ANOVA with Cluster_Label as the categorical variable
anova_results = []
target_variable = 'Premium Amount'

for col in continuous_columns:
    # Group data by clusters
    groups = [data[data['Cluster_Label'] == cluster][col] for cluster in range(optimal_clusters)]
    
    # Perform ANOVA
    f_stat, p_value = f_oneway(*groups)
    
    # Determine significance
    significance = "Significant" if p_value < 0.05 else "Not Significant"
    
    # Append results
    anova_results.append({
        'Variable': col,
        'F-Statistic': f_stat,
        'P-Value': p_value,
        'Significance': significance
    })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

               Variable    F-Statistic  P-Value Significance
0                   Age   58224.162306      0.0  Significant
1         Annual Income  413724.560296      0.0  Significant
2  Number of Dependents   82510.865087      0.0  Significant
3          Health Score   17790.941772      0.0  Significant
4       Previous Claims   46415.545299      0.0  Significant
5           Vehicle Age   29611.827694      0.0  Significant
6          Credit Score   10698.037089      0.0  Significant
7    Insurance Duration   58824.966963      0.0  Significant
