In [None]:
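# 1. Remove all the entries that have null values.
# NOTE: the analysis cell below forward-fills missing feature values rather than dropping those rows.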


In [2]:
import pandas as pd
import scipy.stats as stats

# Define target variable
target_variable = 'Premium Amount'


def prepare_anova_frame(raw, target):
    """Return a cleaned copy of ``raw`` ready for the per-column ANOVA.

    Missing feature values are forward-filled (kept for simplicity; adjust as
    needed). The target column is deliberately NOT forward-filled: it is
    coerced to numeric from the raw values and rows whose target is missing or
    non-numeric are dropped, so no target value is invented from a neighbour.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as loaded from disk; it is not modified.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        New frame with forward-filled features and a numeric, non-null target.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    prepared = raw.ffill()
    # Overwrite the forward-filled target with the coerced raw values so that
    # the dropna below really removes rows that had no usable target.
    prepared[target] = pd.to_numeric(raw[target], errors='coerce')
    return prepared.dropna(subset=[target])


def anova_by_category(frame, target, columns, alpha=0.05):
    """Run a one-way ANOVA of ``target`` against each column in ``columns``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data containing ``target`` and every column in ``columns``.
    target : str
        Numeric column whose group means are compared.
    columns : iterable of str
        Grouping columns to test, one ANOVA per column.
    alpha : float, default 0.05
        A p-value strictly below this is reported as "Significant".

    Returns
    -------
    list of dict
        One record per column with the keys 'Categorical Variable',
        'F-Statistic', 'P-Value' and 'Significance'. Columns that cannot be
        tested get ``None`` statistics and a Significance starting with
        "Error:".
    """
    records = []
    for cat_col in columns:
        try:
            # One sample of target values per level; rows with a missing
            # level are left out by groupby.
            groups = [grp[target].dropna() for _, grp in frame.groupby(cat_col)]
            groups = [sample for sample in groups if len(sample) > 0]
            if len(groups) < 2:
                raise ValueError("fewer than two groups with target values")
            f_stat, p_value = stats.f_oneway(*groups)
        except (KeyError, TypeError, ValueError) as exc:
            # Record the failure for this column and keep testing the others.
            records.append({
                'Categorical Variable': cat_col,
                'F-Statistic': None,
                'P-Value': None,
                'Significance': f"Error: {str(exc)}"
            })
            continue

        if pd.isna(p_value):
            # f_oneway produced no usable p-value; do not report it as a
            # negative finding.
            significance = "Undetermined"
        elif p_value < alpha:
            significance = "Significant"
        else:
            significance = "Not Significant"

        records.append({
            'Categorical Variable': cat_col,
            'F-Statistic': f_stat,
            'P-Value': p_value,
            'Significance': significance
        })
    return records


# Inside Jupyter __name__ is "__main__", so this cell still runs as before;
# the guard only lets the helpers above be imported without reading the CSV.
if __name__ == "__main__":
    # Load dataset
    data = pd.read_csv("data/train.csv")  # Replace with your dataset's file name

    # Handle missing values and make sure the target is numeric and present
    data = prepare_anova_frame(data, target_variable)

    # Identify categorical columns (never the target itself)
    categorical_columns = [
        col
        for col in data.select_dtypes(include=['object', 'category']).columns
        if col != target_variable
    ]

    # NOTE(review): columns with a very large number of distinct values (for
    # example a timestamp read as text) are still tested as categories; read
    # their p-values with care.
    anova_results = anova_by_category(data, target_variable, categorical_columns)

    # Convert results to a DataFrame
    anova_df = pd.DataFrame(anova_results)

    print(anova_df)

  data = data.fillna(method="ffill")  # Forward fill for simplicity; adjust as needed


   Categorical Variable  F-Statistic       P-Value     Significance
0                Gender     0.031099  8.600207e-01  Not Significant
1        Marital Status     0.337613  7.134715e-01  Not Significant
2       Education Level     1.145105  3.292494e-01  Not Significant
3            Occupation     0.110829  8.950920e-01  Not Significant
4              Location     0.676414  5.084371e-01  Not Significant
5           Policy Type     0.470910  6.244336e-01  Not Significant
6     Policy Start Date     1.078124  1.524729e-92      Significant
7     Customer Feedback     3.038610  4.790182e-02      Significant
8        Smoking Status     0.031783  8.585039e-01  Not Significant
9    Exercise Frequency     0.483057  6.940501e-01  Not Significant
10        Property Type     1.051181  3.495250e-01  Not Significant


In [None]:
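# The ANOVA results show that Policy Start Date and Customer Feedback are statistically significant (p < 0.05) predictors of the Premium Amount.
# Caveat: the F-statistic for Policy Start Date is only about 1.08, so its tiny p-value likely reflects the very large number of distinct dates rather than a strong effect; Customer Feedback (p ≈ 0.048) is borderline.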

In [2]:
# Linear Regression

!pip install scikit-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Encode Customer Feedback as ordinal values
feedback_mapping = {"Poor": 1, "Average": 2, "Good": 3}

# Select features and target
features = ['Policy Days Since Start', 'Customer Feedback']
target = 'Premium Amount'


def build_regression_frame(raw, reference_date=None):
    """Return a model-ready copy of ``raw`` with the two engineered features.

    Steps:
    1. Forward-fill missing feature values.
    2. Coerce the target to numeric from the raw (un-filled) values.
    3. Convert 'Policy Start Date' to days elapsed since ``reference_date``.
    4. Map 'Customer Feedback' through ``feedback_mapping``.
    5. Drop every row that still lacks a feature or the target.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with 'Policy Start Date', 'Customer Feedback' and the target
        column; it is not modified.
    reference_date : datetime-like, optional
        Origin for 'Policy Days Since Start'. Defaults to the earliest
        'Policy Start Date' in ``raw``; pass the training value to apply the
        same offset to another dataset.

    Returns
    -------
    pandas.DataFrame
        New frame whose ``features`` and ``target`` columns contain no NaN.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    frame = raw.ffill()
    # Restore the un-filled target so rows without one are dropped below
    # instead of inheriting the previous row's premium.
    frame[target] = pd.to_numeric(raw[target], errors='coerce')

    # Convert Policy Start Date to numerical feature
    frame['Policy Start Date'] = pd.to_datetime(frame['Policy Start Date'])
    if reference_date is None:
        reference_date = frame['Policy Start Date'].min()
    frame['Policy Days Since Start'] = (frame['Policy Start Date'] - reference_date).dt.days

    # Values outside the mapping become NaN here and are removed by the final
    # dropna, so the estimator never receives NaN.
    frame['Customer Feedback'] = frame['Customer Feedback'].map(feedback_mapping)

    return frame.dropna(subset=features + [target])


# Inside Jupyter __name__ is "__main__", so this cell still runs as before;
# the guard only lets build_regression_frame be imported without the CSV.
if __name__ == "__main__":
    # Load dataset
    data = pd.read_csv("data/train.csv")  # Replace with your dataset's filename

    # Kept as a top-level name so the same origin can be reused on other data
    data['Policy Start Date'] = pd.to_datetime(data['Policy Start Date'])
    reference_date = data['Policy Start Date'].min()

    # Handle missing values and engineer features
    data = build_regression_frame(data, reference_date)
    X = data[features]
    y = data[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    # Predictions are clipped at 0 because the log error is undefined for negatives
    rmsle = np.sqrt(mean_squared_log_error(y_test, np.maximum(y_pred, 0)))

    print(f"Baseline Linear Regression RMSE: {rmse}")
    print(f"Baseline Linear Regression RMSLE: {rmsle}")

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.5.2 threadpoolctl-3.5.0


  data = data.fillna(method="ffill")


Baseline Linear Regression RMSE: 864.4064108403377
Baseline Linear Regression RMSLE: 1.1712853742271268
