# Submission - Inquisitive_Turtles

In [None]:
import pandas as pd

In [None]:
# Load the competition training and test tables from the Kaggle input mounts.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/data-insurance/train.csv',
        '/kaggle/input/test-data/test (1).csv',
    )
)

In [None]:
# Print summary statistics of the numeric columns for each raw frame.
for _heading, _frame in (
    ('\ntrain df stats', train_df),
    ('\ntest df stats', test_df),
):
    print(_heading)
    print(_frame.describe())

In [None]:
def check_missing(df):
    """Print how many values are missing in each column that has any.

    Parameters:
        df (pd.DataFrame): Frame to inspect.

    Prints a Series of null counts restricted to the columns with at least
    one null, followed by a blank separator. Returns None.
    """
    null_counts = df.isna().sum()
    has_nulls = null_counts > 0
    print(null_counts[has_nulls])
    # Separator so consecutive reports do not run together.
    print('\n')
# Report missing-value counts for the raw train and test frames.
for _heading, _frame in (
    ('\nchecking missing values for train df', train_df),
    ('\nchecking missing values for test df', test_df),
):
    print(_heading)
    check_missing(_frame)

In [None]:
# Discard training rows that have no 'Premium Amount' value.
# `subset` is passed as a list: a bare string label is only accepted from
# pandas 1.5 onwards, while a list works on every version.
train_df = train_df.dropna(subset=['Premium Amount'])

In [None]:
def check_outliers(df, column='Previous Claims'):
    """Split a frame into inlier and outlier rows using the 1.5 * IQR rule.

    Parameters:
        df (pd.DataFrame): Frame containing `column`.
        column (str): Numeric column the rule is applied to. Defaults to
            'Previous Claims'.

    Returns:
        tuple: (df_cleaned, df_outliers). `df_cleaned` holds the rows whose
        value lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]; `df_outliers` holds
        the rows outside it. Both are independent copies that keep the
        original index labels.

    Rows where `column` is NaN fail every comparison, so they appear in
    neither returned frame.
    """
    # Build the masks from the frame that was passed in, not from a global:
    # the quartiles and both comparisons must refer to the same rows.
    values = df[column]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    within_bounds = (values >= lower_bound) & (values <= upper_bound)
    beyond_bounds = (values < lower_bound) | (values > upper_bound)

    # Copy so later column assignments on the results do not act on a slice
    # of `df` (avoids pandas' SettingWithCopyWarning).
    df_cleaned = df[within_bounds].copy()
    df_outliers = df[beyond_bounds].copy()

    return df_cleaned, df_outliers

# Split both frames into inliers / outliers on 'Previous Claims'.
# NOTE(review): rows filtered out of test_df here are absent from
# test_df_cleaned, which is what the models later predict on - confirm
# that dropping test rows is intended.
train_df_cleaned, train_df_outliers = check_outliers(train_df)
test_df_cleaned, test_df_outliers = check_outliers(test_df)

# Report non-null 'Previous Claims' counts before and after the split.
for _message, _frame in (
    ('number of outliers in train df: ', train_df_outliers),
    ("Original DataFrame with outliers in train df:", train_df),
    ("DataFrame after removing outliers in train df:", train_df_cleaned),
    ('number of outliers in test df: ', test_df_outliers),
    ("Original DataFrame with outliers in test df:", test_df),
    ("DataFrame after removing outliers in test df:", test_df_cleaned),
):
    print(_message, _frame['Previous Claims'].count())

In [None]:
# Print summary statistics for the frames left after outlier removal.
for _heading, _frame in (
    ('\n cleaned train df stats', train_df_cleaned),
    ('\n cleaned test df stats', test_df_cleaned),
):
    print(_heading)
    print(_frame.describe())

In [None]:
# Columns whose missing values are filled by mean imputation.
# Each column is listed exactly once: a repeated label makes the imputer
# process (and the frame assignment write) the same column twice.
mean_imputation_cols = ['Age', 'Number of Dependents', 'Credit Score']

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Learn the per-column means on the training frame only, then fill the
# missing values of both frames with those training means.
imputer = SimpleImputer(strategy='mean').fit(train_df_cleaned[mean_imputation_cols])
train_df_cleaned[mean_imputation_cols] = imputer.transform(train_df_cleaned[mean_imputation_cols])
test_df_cleaned[mean_imputation_cols] = imputer.transform(test_df_cleaned[mean_imputation_cols])

# Preview the first rows of each frame.
for _frame in (train_df_cleaned, test_df_cleaned):
    print(_frame.head(5))


In [None]:
# Re-check which columns still contain missing values after mean imputation.
for _heading, _frame in (
    ('\nchecking missing values for cleaned train df', train_df_cleaned),
    ('\nchecking missing values for cleaned test df', test_df_cleaned),
):
    print(_heading)
    check_missing(_frame)

In [None]:
# Columns whose missing values are filled with the most frequent value.
mode_imputation_cols = [
    'Annual Income',
    'Health Score',
    'Marital Status',
    'Occupation',
    'Customer Feedback',
]

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Learn each column's most frequent value on the training frame only, then
# fill the missing values of both frames with it.
imputer = SimpleImputer(strategy='most_frequent').fit(train_df_cleaned[mode_imputation_cols])
train_df_cleaned[mode_imputation_cols] = imputer.transform(train_df_cleaned[mode_imputation_cols])
test_df_cleaned[mode_imputation_cols] = imputer.transform(test_df_cleaned[mode_imputation_cols])

# Preview the first rows of each frame.
for _frame in (train_df_cleaned, test_df_cleaned):
    print(_frame.head(5))


In [None]:
# Re-check which columns still contain missing values after mode imputation.
for _heading, _frame in (
    ('\nchecking missing values for cleaned train df', train_df_cleaned),
    ('\nchecking missing values for cleaned test df', test_df_cleaned),
):
    print(_heading)
    check_missing(_frame)

In [None]:
# Remove every row that lacks a 'Vehicle Age' or an 'Insurance Duration'.
train_df_cleaned = train_df_cleaned.dropna(
    axis=0, how='any', subset=['Vehicle Age', 'Insurance Duration']
)
test_df_cleaned = test_df_cleaned.dropna(
    axis=0, how='any', subset=['Vehicle Age', 'Insurance Duration']
)


In [None]:
# Re-check which columns still contain missing values after the row drops.
for _heading, _frame in (
    ('\nchecking missing values for cleaned train df', train_df_cleaned),
    ('\nchecking missing values for cleaned test df', test_df_cleaned),
):
    print(_heading)
    check_missing(_frame)

In [None]:
# Notebook display: summary statistics of the current training frame.
train_df_cleaned.describe()

In [None]:
# Notebook display: dtype of every column in the current training frame.
train_df_cleaned.dtypes

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Categorical columns that are replaced by one-hot indicator columns.
encoding_cols=['Customer Feedback','Occupation','Education Level','Marital Status',
               'Property Type','Exercise Frequency','Smoking Status','Policy Type','Gender','Location']

# Request a dense array. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4,
# so try the new spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit the categories on the training frame only; reuse them for the test frame.
train_encoded_array = encoder.fit_transform(train_df_cleaned[encoding_cols])
test_encoded_array = encoder.transform(test_df_cleaned[encoding_cols])

# Give the encoded frames the SAME index as their source frames. The source
# frames have gaps in their index after the earlier row filtering; a default
# RangeIndex here would make the column-wise concat below pair encoded rows
# with the wrong source rows and pad the rest with NaN.
train_encoded_df = pd.DataFrame(train_encoded_array,
                                columns=encoder.get_feature_names_out(encoding_cols),
                                index=train_df_cleaned.index)
test_encoded_df = pd.DataFrame(test_encoded_array,
                               columns=encoder.get_feature_names_out(encoding_cols),
                               index=test_df_cleaned.index)

# Swap the original categorical columns for their one-hot encoded versions.
train_df_cleaned = pd.concat([train_df_cleaned.drop(columns=encoding_cols), train_encoded_df], axis=1)
test_df_cleaned = pd.concat([test_df_cleaned.drop(columns=encoding_cols), test_encoded_df], axis=1)

print(train_df_cleaned.columns)
print(test_df_cleaned.columns)

In [None]:
# Notebook display: first five rows of the training frame.
train_df_cleaned.head(5)

In [None]:
# Notebook display: first five rows of the test frame.
test_df_cleaned.head(5)

In [None]:
# Notebook display: dtype of every column in the training frame.
train_df_cleaned.dtypes

In [None]:
# Discard every training row that still has a missing value in any column.
train_df_cleaned = train_df_cleaned.dropna(axis=0, how='any')

In [None]:
# Notebook display: summary statistics of the current training frame.
train_df_cleaned.describe()

In [None]:
# Discard every test row that still has a missing value in any column.
test_df_cleaned = test_df_cleaned.dropna(axis=0, how='any')

In [None]:
# Notebook display: summary statistics of the current test frame.
test_df_cleaned.describe()

In [None]:
# Store 'Number of Dependents' as 32-bit integers in both frames.
# NOTE(review): astype truncates any fractional value left by mean
# imputation - confirm truncation (rather than rounding) is intended.
for _frame in (train_df_cleaned, test_df_cleaned):
    _frame['Number of Dependents'] = _frame['Number of Dependents'].astype('int32')

In [None]:
# Convert 'Policy Start Date' to nanosecond-resolution datetimes in both frames.
for _frame in (train_df_cleaned, test_df_cleaned):
    _frame['Policy Start Date'] = _frame['Policy Start Date'].astype('datetime64[ns]')

In [None]:
# Convert 'Annual Income' to float64 in both frames.
for _frame in (train_df_cleaned, test_df_cleaned):
    _frame['Annual Income'] = _frame['Annual Income'].astype('float64')

In [None]:
# Convert 'Health Score' to float64 in both frames.
for _frame in (train_df_cleaned, test_df_cleaned):
    _frame['Health Score'] = _frame['Health Score'].astype('float64')

In [None]:
# Notebook display: column dtypes of the training frame.
train_df_cleaned.dtypes

In [None]:
# Notebook display: column dtypes of the test frame.
test_df_cleaned.dtypes

In [None]:
# Columns that the skewness-reducing transformations are tried on.
skewed_cols = [
    'Annual Income',
    'Health Score',
    'Premium Amount',
]

# Notebook display: summary statistics of one of the skewed columns.
train_df_cleaned['Annual Income'].describe()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox, yeojohnson
from scipy.stats import shapiro

# Function to visualize the distribution
def plot_hist(df, col):
    """Show a 10-bin histogram of one column.

    Parameters:
        df (pd.DataFrame): Frame holding the data.
        col (str): Name of the column to plot; also used in the title.
    """
    series = df[col]
    bar_style = dict(bins=10, color='blue', edgecolor='black', alpha=0.7)
    plt.hist(series, **bar_style)

    # Labelling.
    plt.title(f'Distribution of {col}')
    plt.xlabel('Value')
    plt.ylabel('Frequency')

    plt.show()

# Applying transformations
def applying_transformations(df, columns=None):
    """Add skew-reducing transforms of each column and report their skewness.

    For every requested column present in `df`, new columns
    `<col>_sqrt`, `<col>_cbrt`, `<col>_log` (log1p), `<col>_yeojohnson`
    and - only when every value is strictly positive - `<col>_boxcox`
    are added. The skewness before and after each transform is printed and
    a histogram of every transformed column is shown.

    Parameters:
        df (pd.DataFrame): Frame to transform. It is modified IN PLACE.
        columns (iterable of str, optional): Columns to transform. Defaults
            to the module-level `skewed_cols` list. Names that are not in
            `df` are skipped silently.

    Returns:
        pd.DataFrame: The same `df` object, with the new columns added.

    Note:
        The Box-Cox and Yeo-Johnson lambdas are estimated from the frame
        passed in and then discarded, so two separate calls (e.g. one for a
        train frame and one for a test frame) each fit their own lambda.
    """
    if columns is None:
        columns = skewed_cols

    for col in columns:
        if col not in df.columns:
            continue

        # Skewness of every transformed column, keyed by its new column name.
        transformations = {}

        # Check initial skewness
        print(f"Initial Skewness of {col}: {df[col].skew()}")

        # Apply square root transformation
        df[f'{col}_sqrt'] = np.sqrt(df[col])
        transformations[f'{col}_sqrt'] = df[f'{col}_sqrt'].skew()

        # Apply cube root transformation
        df[f'{col}_cbrt'] = np.cbrt(df[col])
        transformations[f'{col}_cbrt'] = df[f'{col}_cbrt'].skew()

        # Apply log(1 + x) transformation
        df[f'{col}_log'] = np.log1p(df[col])
        transformations[f'{col}_log'] = df[f'{col}_log'].skew()

        # Apply Box-Cox transformation (requires strictly positive values).
        # The tiny offset is kept so results match earlier runs.
        if (df[col] > 0).all():
            df[f'{col}_boxcox'], _ = boxcox(df[col] + 1e-6)
            transformations[f'{col}_boxcox'] = df[f'{col}_boxcox'].skew()

        # Apply Yeo-Johnson transformation (no positivity requirement)
        df[f'{col}_yeojohnson'], _ = yeojohnson(df[col])
        transformations[f'{col}_yeojohnson'] = df[f'{col}_yeojohnson'].skew()

        # Report and plot every transformed distribution
        print(f"After transformations for {col}:")
        for key, value in transformations.items():
            print(f"{key}: Skewness = {value}")
            plot_hist(df, key)

    return df

# Apply transformations to train and test sets
# applying_transformations adds the new columns to the frame it is given and
# returns that same frame, so each name is rebound to its own (now wider) frame.
# Each call fits its own Box-Cox / Yeo-Johnson lambda on the frame it receives.
print('Applying transformations to train set:')
train_df_cleaned = applying_transformations(train_df_cleaned)

print('Applying transformations to test set:')
test_df_cleaned = applying_transformations(test_df_cleaned)

# Observed best transformation per column:
#annual income - cbrt/boxcox, health score - yeo johnson, premium amount - yeo johnson

Selecting the Yeo-Johnson transformation for all three skewed columns.

In [None]:
# Keep only the Yeo-Johnson version of each skewed column in the training
# frame: drop the raw columns and every other trial transformation.
_train_raw_cols = ['Annual Income', 'Health Score', 'Premium Amount']
_train_rejected_cols = [
    f'{_col}{_suffix}'
    for _col in _train_raw_cols
    for _suffix in ('_sqrt', '_cbrt', '_log', '_boxcox')
]
# errors='ignore': applying_transformations only creates a `_boxcox` column
# when every value of the source column is positive, so it may be absent.
train_df_cleaned = train_df_cleaned.drop(
    columns=_train_rejected_cols + _train_raw_cols, errors='ignore'
)

In [None]:
# Keep only the Yeo-Johnson version of each skewed feature in the test
# frame: drop the raw columns and every other trial transformation.
_test_raw_cols = ['Annual Income', 'Health Score']
_test_rejected_cols = [
    f'{_col}{_suffix}'
    for _col in _test_raw_cols
    for _suffix in ('_sqrt', '_cbrt', '_log', '_boxcox')
]
# errors='ignore': applying_transformations only creates a `_boxcox` column
# when every value of the source column is positive, so it may be absent.
test_df_cleaned = test_df_cleaned.drop(
    columns=_test_rejected_cols + _test_raw_cols, errors='ignore'
)

In [None]:
# Show which columns remain in each frame.
for _frame in (train_df_cleaned, test_df_cleaned):
    print(_frame.columns)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(df, figsize=(10, 8), save_plot=False, filename="correlation_matrix.png"):
    """
    Plot the correlation matrix of a DataFrame's numeric columns as a heatmap.
    Parameters:
        df (pd.DataFrame): The DataFrame containing the features.
        figsize (tuple): Size of the heatmap figure.
        save_plot (bool): Whether to save the plot as a file.
        filename (str): Name of the file to save the plot if save_plot is True.
    Returns:
        None. The figure is either written to `filename` (save_plot=True)
        or shown with plt.show() (save_plot=False).
    """
    # Compute the correlation matrix on numeric / boolean columns only.
    # Other dtypes (e.g. a datetime column) have no meaningful Pearson
    # correlation, and recent pandas versions no longer drop them silently
    # inside DataFrame.corr().
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    correlation_matrix = numeric_df.corr()

    # Create the heatmap
    plt.figure(figsize=figsize)
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        cbar=True,
        square=True,
        linewidths=0.5,
        annot_kws={"size": 8}
    )
    plt.title("Feature Correlation Matrix", fontsize=16)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)

    if save_plot:
        # bbox_inches="tight" keeps the rotated tick labels inside the image.
        plt.savefig(filename, bbox_inches="tight")
        print(f"Correlation matrix saved as {filename}")
    else:
        plt.show()

# Example: Visualize correlation for train and test DataFrames
# The train heatmap is shown inline; the test heatmap is written to a PNG file.
print("Correlation Matrix for Train DataFrame")
plot_correlation_matrix(df=train_df_cleaned, figsize=(12, 10))

print("Correlation Matrix for Test DataFrame")
plot_correlation_matrix(
    df=test_df_cleaned,
    figsize=(12, 10),
    save_plot=True,
    filename="test_correlation_matrix.png",
)


In [None]:
# Notebook display: column labels of the training frame.
train_df_cleaned.columns

In [None]:
# Remove the 'Policy Start Date' column from both frames.
train_df_cleaned = train_df_cleaned.drop(labels=['Policy Start Date'], axis=1)
test_df_cleaned = test_df_cleaned.drop(labels=['Policy Start Date'], axis=1)

In [None]:
pip install tqdm

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # Import tqdm for progress bar

# Define a function to apply the advanced models
def apply_advanced_regression_models(train_df, test_df, target_col, features,
                                     return_predictions=False):
    """Train several boosted-tree regressors and evaluate or preview predictions.

    Parameters:
        train_df (pd.DataFrame): Training data containing `features` and `target_col`.
        test_df (pd.DataFrame): Data to predict on; must contain `features`.
            If it also contains `target_col`, evaluation metrics are computed.
        target_col (str): Name of the target column.
        features (list of str): Feature column names used for fitting and predicting.
        return_predictions (bool): When True, also return the per-model
            predictions instead of discarding them. Defaults to False, which
            keeps the original return value.

    Returns:
        pd.DataFrame or None: One row per model with columns MAE, MSE, RMSE
        and 'R2 Score' when `test_df` contains `target_col`; otherwise None.
        If `return_predictions` is True, a tuple `(results, predictions)` is
        returned instead, where `predictions` maps each model name to its
        predictions for `test_df` (on the same scale as `target_col`).
    """
    X_train = train_df[features]
    y_train = train_df[target_col]
    X_test = test_df[features]

    models = {
        'Gradient Boosting Regressor': GradientBoostingRegressor(),
        'XGBoost': xgb.XGBRegressor(),
        'LightGBM': lgb.LGBMRegressor(),
        'CatBoost': cb.CatBoostRegressor(learning_rate=0.1, depth=6, iterations=100, verbose=0)
    }

    # Models that must be trained on standardized features. None of the
    # models above is in this set, so the (costly) scaling pass is skipped
    # unless such a model is added to `models`.
    models_needing_scaling = {'Support Vector Regressor'}
    X_train_scaled = None
    X_test_scaled = None
    if models_needing_scaling.intersection(models):
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

    # Evaluation is only possible when the test frame carries the target.
    has_target = target_col in test_df.columns

    # Dictionaries to store metrics and raw predictions per model
    results = {}
    predictions = {}

    # Apply each model and track progress using tqdm
    for model_name, model in tqdm(models.items(), desc="Training Models", total=len(models)):
        print(f"\nTraining {model_name}...")

        if model_name in models_needing_scaling:
            # Scale-sensitive models use the standardized data
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
        else:
            # Tree-based models use the original features
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

        predictions[model_name] = y_pred

        if has_target:
            y_test = test_df[target_col]
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)

            # Store the results
            results[model_name] = {
                'MAE': mae,
                'MSE': mse,
                'RMSE': rmse,
                'R2 Score': r2
            }

            # Print the evaluation metrics
            print(f"{model_name} Results:")
            print(f"MAE: {mae:.4f}")
            print(f"MSE: {mse:.4f}")
            print(f"RMSE: {rmse:.4f}")
            print(f"R2 Score: {r2:.4f}")
        else:
            # If no target column in test_df, just print the predictions
            print(f"{model_name} Predictions:")
            print(y_pred[:5])  # Display first 5 predictions

    # Convert results into a DataFrame for better visualization
    results_df = pd.DataFrame(results).T if results else None

    if return_predictions:
        return results_df, predictions
    return results_df

# Example usage
# The target is the Yeo-Johnson-transformed premium; every other column of
# the training frame is used as a feature.

target_col = 'Premium Amount_yeojohnson'
features = train_df_cleaned.columns[train_df_cleaned.columns != target_col].tolist()

# Apply advanced regression models
results_df = apply_advanced_regression_models(
    train_df=train_df_cleaned,
    test_df=test_df_cleaned,
    target_col=target_col,
    features=features,
)

# Display the results if available (None when the test frame has no target column)
if results_df is not None:
    print("\nAdvanced Model Evaluation Results:")
    print(results_df)
