In [1]:
import pandas as pd
import scipy.stats as stats

# Load dataset
data = pd.read_csv("data/train.csv")  # Replace with your dataset's file name

# Handle missing values
data = data.fillna(method="ffill")  # Forward fill for simplicity; adjust as needed

# Identify categorical columns
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()

# Define target variable
target_variable = 'Premium Amount'

# Ensure the target variable is numerical
if data[target_variable].dtype != 'float64' and data[target_variable].dtype != 'int64':
    data[target_variable] = pd.to_numeric(data[target_variable], errors='coerce')

# Remove rows with missing target values
data = data.dropna(subset=[target_variable])

# Run ANOVA for each categorical column
anova_results = []
for cat_col in categorical_columns:
    try:
        # Group data by the categorical column
        groups = [group[target_variable].dropna() for _, group in data.groupby(cat_col)]
        
        # Perform ANOVA
        f_stat, p_value = stats.f_oneway(*groups)
        
        # Determine significance
        significance = "Significant" if p_value < 0.05 else "Not Significant"
        
        # Append results
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': f_stat,
            'P-Value': p_value,
            'Significance': significance
        })
    except Exception as e:
        # Handle cases where ANOVA cannot be computed
        anova_results.append({
            'Categorical Variable': cat_col,
            'F-Statistic': None,
            'P-Value': None,
            'Significance': f"Error: {str(e)}"
        })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

# Display results
# import ace_tools as tools; tools.display_dataframe_to_user(name="ANOVA Results for Predicting Premium Amount", dataframe=anova_df)

  data = data.fillna(method="ffill")  # Forward fill for simplicity; adjust as needed


   Categorical Variable  F-Statistic       P-Value     Significance
0                Gender     0.031099  8.600207e-01  Not Significant
1        Marital Status     0.337613  7.134715e-01  Not Significant
2       Education Level     1.145105  3.292494e-01  Not Significant
3            Occupation     0.110829  8.950920e-01  Not Significant
4              Location     0.676414  5.084371e-01  Not Significant
5           Policy Type     0.470910  6.244336e-01  Not Significant
6     Policy Start Date     1.078124  1.524729e-92      Significant
7     Customer Feedback     3.038610  4.790182e-02      Significant
8        Smoking Status     0.031783  8.585039e-01  Not Significant
9    Exercise Frequency     0.483057  6.940501e-01  Not Significant
10        Property Type     1.051181  3.495250e-01  Not Significant


In [None]:
anova_df["Categorical Variable"]

In [7]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Select continuous variables
continuous_columns = ['Age', 'Annual Income', 'Number of Dependents', 
                      'Health Score', 'Previous Claims', 'Vehicle Age', 
                      'Credit Score', 'Insurance Duration']

# Create a copy of the original data to avoid modifying it directly
clustered_data = data.copy()

# Iterate through each continuous variable
for col in continuous_columns:
    # Extract the single variable
    variable_data = clustered_data[[col]]
    
    # Standardize the variable
    scaler = StandardScaler()
    scaled_variable = scaler.fit_transform(variable_data)
    
    # Perform KMeans clustering
    kmeans = KMeans(n_clusters=4, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_variable)
    
    # Replace the original variable with its cluster labels
    clustered_data[col] = cluster_labels

# Display the updated DataFrame
print(clustered_data.head())


   id  Age  Annual Income  Number of Dependents  Health Score  \
0   0    0              1                     1             3   
1   1    2              3                     3             1   
2   2    0              3                     3             2   
3   3    0              0                     2             1   
4   4    0              3                     1             3   

   Previous Claims  Vehicle Age  Credit Score  Insurance Duration  \
0                3            1             0                   2   
1                2            3             3                   0   
2                2            1             3                   2   
3                2            2             0                   0   
4                1            0             3                   2   

   Premium Amount  Cluster_Label  
0          2869.0              3  
1          1483.0              1  
2           567.0              1  
3           765.0              2  
4          2022.0  

In [8]:
import pandas as pd
from scipy.stats import f_oneway

# Prepare data for ANOVAs
continuous_columns = [col for col in data.columns if col != 'Cluster_Label']  # Exclude the cluster label column

# Perform ANOVA for each continuous variable
anova_results = []

for col in continuous_columns:
    # Group data by Cluster_Label
    groups = [data[data['Cluster_Label'] == cluster][col] for cluster in data['Cluster_Label'].unique()]
    
    # Perform ANOVA
    f_stat, p_value = f_oneway(*groups)
    
    # Determine significance
    significance = "Significant" if p_value < 0.05 else "Not Significant"
    
    # Append results
    anova_results.append({
        'Variable': col,
        'F-Statistic': f_stat,
        'P-Value': p_value,
        'Significance': significance
    })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

# Display the ANOVA results
# import ace_tools as tools; tools.display_dataframe_to_user(name="ANOVA Results After Clustering", dataframe=anova_df)

               Variable    F-Statistic       P-Value Significance
0                    id      25.814679  1.994393e-21  Significant
1                   Age      75.232938  6.948634e-64  Significant
2         Annual Income  430212.690303  0.000000e+00  Significant
3  Number of Dependents  362266.717930  0.000000e+00  Significant
4          Health Score    6524.890462  0.000000e+00  Significant
5       Previous Claims    1172.774594  0.000000e+00  Significant
6           Vehicle Age    1143.843928  0.000000e+00  Significant
7          Credit Score   34655.297909  0.000000e+00  Significant
8    Insurance Duration  117922.294827  0.000000e+00  Significant
9        Premium Amount  414291.510721  0.000000e+00  Significant


In [6]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Drop the variables that already have ANOVAs done 
data = data.drop(columns=data.select_dtypes(include=['object', 'category']).columns)

# Standardize the continuous variables
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit KMeans with a predefined number of clusters
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10, max_iter=300)
cluster_labels = kmeans.fit_predict(scaled_data)

# Assign cluster labels to the DataFrame
data['Cluster_Label'] = cluster_labels

In [4]:
import pandas as pd
from scipy.stats import f_oneway

# Prepare data for ANOVAs
continuous_columns = [col for col in data.columns if col != 'Cluster_Label']  # Exclude the cluster label column

# Perform ANOVA for each continuous variable
anova_results = []

for col in continuous_columns:
    # Group data by Cluster_Label
    groups = [data[data['Cluster_Label'] == cluster][col] for cluster in data['Cluster_Label'].unique()]
    
    # Perform ANOVA
    f_stat, p_value = f_oneway(*groups)
    
    # Determine significance
    significance = "Significant" if p_value < 0.05 else "Not Significant"
    
    # Append results
    anova_results.append({
        'Variable': col,
        'F-Statistic': f_stat,
        'P-Value': p_value,
        'Significance': significance
    })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)

# Display the ANOVA results
# import ace_tools as tools; tools.display_dataframe_to_user(name="ANOVA Results After Clustering", dataframe=anova_df)

               Variable    F-Statistic       P-Value Significance
0                    id      30.363142  1.280213e-19  Significant
1                   Age      99.464197  2.257642e-64  Significant
2         Annual Income  576209.960165  0.000000e+00  Significant
3  Number of Dependents  470801.276751  0.000000e+00  Significant
4          Health Score    8737.197879  0.000000e+00  Significant
5       Previous Claims    1555.744931  0.000000e+00  Significant
6           Vehicle Age      11.590193  1.362402e-07  Significant
7          Credit Score   47941.470166  0.000000e+00  Significant
8    Insurance Duration       4.627809  3.068307e-03  Significant
9        Premium Amount  567918.659059  0.000000e+00  Significant


In [5]:
cluster_means = data.groupby('Cluster_Label').mean()
print(cluster_means)

                          id        Age  Annual Income  Number of Dependents  \
Cluster_Label                                                                  
0              598313.670080  41.031715   29317.943464              2.005068   
1              597541.008099  40.940272   20780.880016              0.784877   
2              598011.748538  41.098077   90941.336472              1.998024   
3              604160.403105  41.432965   21000.273779              3.281481   

               Health Score  Previous Claims  Vehicle Age  Credit Score  \
Cluster_Label                                                             
0                 25.982940         1.103805     9.560442    602.254693   
1                 23.322947         0.969866     9.539014    617.590372   
2                 26.393259         1.085903     9.557416    477.229874   
3                 27.492279         0.955336     9.611425    614.341594   

               Insurance Duration  Premium Amount  
Cluster_Label   

In [None]:
# Run ANOVA with Cluster_Label as the categorical variable
anova_results = []
target_variable = 'Premium Amount'

for col in continuous_columns:
    # Group data by clusters
    groups = [data[data['Cluster_Label'] == cluster][col] for cluster in range(optimal_clusters)]
    
    # Perform ANOVA
    f_stat, p_value = f_oneway(*groups)
    
    # Determine significance
    significance = "Significant" if p_value < 0.05 else "Not Significant"
    
    # Append results
    anova_results.append({
        'Variable': col,
        'F-Statistic': f_stat,
        'P-Value': p_value,
        'Significance': significance
    })

# Convert results to a DataFrame
anova_df = pd.DataFrame(anova_results)

print(anova_df)