## Imports

In [1]:
!pip install xgboost



In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp39-cp39-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp39-cp39-win_amd64.whl (101.8 MB)
   ---------------------------------------- 101.8/101.8 MB 1.3 MB/s eta 0:00:00
Downloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor

In [4]:
# Read the training set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/train.csv")
df.head(5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [5]:
# Column dtypes and non-null counts of the raw training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [6]:
# Boolean frame with the same shape as `df`: True wherever a value is missing.
nulls = df.isna()
nulls.head(5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# Number of missing values per column.
null_counts = df.isna().sum(axis=0)
print(null_counts)

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64


In [8]:
# --- Numeric columns: mean imputation ---------------------------------------
# `id` and `Premium Amount` are selected here as well, but they contain no
# missing values (see the null counts above), so the fill leaves them as is.
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# --- Categorical columns: label encoding ------------------------------------
object_cols = df.select_dtypes(include=['object']).columns

# `astype(str)` turns a missing value into the literal string "nan", so each
# column that has nulls gets its own "missing" class from the encoder.
# Because of that there is nothing left to impute afterwards: the mode-fill
# loop that used to follow the encoding was a no-op on already-integer columns
# (and relied on chained `fillna(inplace=True)`), so it has been removed.
label_encoders = {}
for col in object_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # keep the fitted encoder so other data can be encoded the same way

# `DataFrame.info()` prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1200000 non-null  float64
 2   Gender                1200000 non-null  int32  
 3   Annual Income         1200000 non-null  float64
 4   Marital Status        1200000 non-null  int32  
 5   Number of Dependents  1200000 non-null  float64
 6   Education Level       1200000 non-null  int32  
 7   Occupation            1200000 non-null  int32  
 8   Health Score          1200000 non-null  float64
 9   Location              1200000 non-null  int32  
 10  Policy Type           1200000 non-null  int32  
 11  Previous Claims       1200000 non-null  float64
 12  Vehicle Age           1200000 non-null  float64
 13  Credit Score          1200000 non-null  float64
 14  Insurance Duration    1200000 non-

## Baseline

In [9]:
# Features are every column except the target.
# NOTE(review): this keeps `id` in X, i.e. the row identifier is used as a
# feature. Consider dropping it here together with the matching column at
# prediction time.
X = df.drop('Premium Amount', axis=1)
y = df['Premium Amount']

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline: ordinary least squares.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# A linear model is unbounded and can predict negative amounts; log1p of a
# value below -1 is NaN, which would make the metric call fail. Clip at 0
# first -- this is a no-op whenever all predictions are already non-negative.
rmsle = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(np.clip(y_pred, 0, None))))

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'RMSLE: {rmsle}')

Original_indicies = X.index

Mean Squared Error: 744887.0419032796
R-squared: 0.0032096754783221826
RMSLE: 1.1682038028661481


In [10]:
# Load the test set (same columns as the training data minus `Premium Amount`).
df1 = pd.read_csv("data/test.csv")

# Check the columns in the DataFrame
print("Columns in DataFrame:", df1.columns)

# --- Numeric columns: impute with the TRAINING means -------------------------
# The test set must be transformed with statistics learned from the training
# set, not its own. `df` has already been mean-imputed, and filling with the
# mean leaves a column's mean unchanged, so `df[...].mean()` is the training mean.
numeric_cols = df1.select_dtypes(include=['float64', 'int64']).columns
df1[numeric_cols] = df1[numeric_cols].fillna(df[numeric_cols].mean())

# --- Categorical columns: reuse the encoders fitted on the training data ----
# Refitting a LabelEncoder on the test set would assign codes independently of
# the ones the model was trained on (and would overwrite `label_encoders`).
# `classes_` is sorted, so searchsorted returns exactly the training code for
# every value seen in training. A value not seen in training (expected for most
# `Policy Start Date` timestamps) gets the position it would take in that
# sorted order, capped to the last valid code.
object_cols = df1.select_dtypes(include=['object']).columns
for col in object_cols:
    le = label_encoders[col]
    raw_values = df1[col].astype(str).to_numpy()  # NaN -> "nan", as in training
    codes = np.searchsorted(le.classes_, raw_values)
    df1[col] = np.minimum(codes, len(le.classes_) - 1)

# `info()` prints the report itself and returns None; no print() needed.
df1.info()

X = df1

len(X)

Columns in DataFrame: Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    800000 non-null  int64  
 1   Age                   800000 non-null  float64
 2   Gender                800000 non-null  int32  
 3   Annual Income         800000 non-null  float64
 4   Marital Status        800000 non-null  int32  
 5   Number of Dependents  800000 non-null  float64
 6   Education Level       800000 non-null  int32  
 7   

800000

In [11]:
# Make predictions
# y_pred = model.predict(X)
# sub1 = pd.DataFrame({"id": X.index, "Premium Amount": y_pred}).reset_index(drop=True)
# sub1

In [12]:
# # Make predictions
# y_pred = model.predict(X)
# sub1 = pd.DataFrame({id: X.index, "Premium Amount": y_pred})
# sub1

In [13]:
# Predict the premium for every row of the preprocessed test set.
y_pred = model.predict(X)

# Use the ids shipped in test.csv. `X.index` is only the 0-based row position
# and does not match the ids expected in the submission.
sub1 = pd.DataFrame({
    "id": X["id"],
    "Premium Amount": y_pred
})

# Save to CSV
sub1.to_csv("submission_files/submission_1.csv", index=False)

## Trying XGBoost

In [14]:
# ---------------------------------------------------------------------------
# Train / validate
# ---------------------------------------------------------------------------
# `df` is the preprocessed training frame (imputed and label-encoded).
X = df.drop('Premium Amount', axis=1)  # Features
y = df['Premium Amount']  # Target variable

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient-boosted trees with a squared-error objective.
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# Clip at 0 before log1p so a negative prediction cannot turn the metric into
# NaN; this is a no-op when all predictions are non-negative.
rmsle = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(np.clip(y_pred, 0, None))))

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'RMSLE: {rmsle}')

# ---------------------------------------------------------------------------
# Preprocess the test set with statistics learned from the TRAINING data
# ---------------------------------------------------------------------------
df1 = pd.read_csv("data/test.csv")

# Numeric columns: `df` is already mean-imputed and filling with the mean does
# not change a column's mean, so `df[...].mean()` is the training mean.
numeric_cols = df1.select_dtypes(include=['float64', 'int64']).columns
df1[numeric_cols] = df1[numeric_cols].fillna(df[numeric_cols].mean())

# Categorical columns: the codes must match the ones the model was trained on.
# `df` no longer holds the raw strings (it was label-encoded in place) and
# `label_encoders` may have been refit on test data by an earlier cell, so the
# encoders are refit here on the raw training categories, exactly the way the
# training frame was encoded (`astype(str)`, missing -> "nan").
object_cols = df1.select_dtypes(include=['object']).columns
train_categories = pd.read_csv("data/train.csv", usecols=list(object_cols))

label_encoders = {}
for col in object_cols:
    le = LabelEncoder().fit(train_categories[col].astype(str))
    # `classes_` is sorted: searchsorted yields the exact training code for a
    # known value and, for an unseen one, its position in that sorted order
    # (capped to the last valid code).
    codes = np.searchsorted(le.classes_, df1[col].astype(str).to_numpy())
    df1[col] = np.minimum(codes, len(le.classes_) - 1)
    label_encoders[col] = le

# Same columns, in the same order, as the frame the model was fitted on.
X_test_final = df1[X.columns]

# ---------------------------------------------------------------------------
# Predict and write the submission
# ---------------------------------------------------------------------------
y_pred_test = model.predict(X_test_final)

# Use the ids shipped in test.csv instead of reconstructing them from the row
# position plus a hard-coded offset.
sub1 = pd.DataFrame({
    "id": df1["id"],
    "Premium Amount": y_pred_test
})

# Save to CSV
sub1.to_csv("submission_files/submission_2.csv", index=False)

Mean Squared Error: 704818.015576021
R-squared: 0.056829104343620984
RMSLE: 1.1415290627822174


## Trying other models

In [None]:
# `df` is the preprocessed training frame: split it into target and features.
y = df['Premium Amount']
X = df.drop(columns=['Premium Amount'])

# 80/20 train/validation split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define a function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and score it on the hold-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place, so the caller can reuse it for later predictions.
    X_train, y_train : training features and target.
    X_test, y_test : hold-out features and target used for scoring.

    Returns
    -------
    tuple of (mse, r2, rmsle)
        Mean squared error, R-squared and root mean squared log error of the
        hold-out predictions.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # log1p of a value below -1 is NaN, which would make the metric call fail.
    # Clip negative predictions to 0 first; this leaves non-negative
    # predictions untouched.
    y_pred_nonneg = np.clip(y_pred, 0, None)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(y_pred_nonneg)))
    return mse, r2, rmsle

# Candidate regressors, keyed by the display name used in the report below and
# in the submission file names.
models = {
    "CatBoost": CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, verbose=0),
    # n_jobs=-1 builds the trees on all available cores; random_state still
    # fixes the seed.
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Elastic Net": ElasticNet(random_state=42),
    "Lasso": Lasso(random_state=42),
    "SVR": SVR(kernel='rbf')
}

# Per-model cap on the number of training rows. Kernel SVR fit time grows more
# than quadratically with the number of samples, so fitting it on the full
# training split (80% of 1.2M rows) does not finish in practical time. It is
# trained on a fixed-seed random subsample instead; every model is still scored
# on the full hold-out split.
MAX_FIT_ROWS = {"SVR": 20_000}

# Evaluate each model
for name, model in models.items():
    fit_X, fit_y = X_train, y_train
    row_cap = MAX_FIT_ROWS.get(name)
    if row_cap is not None and len(X_train) > row_cap:
        fit_X = X_train.sample(n=row_cap, random_state=42)
        fit_y = y_train.loc[fit_X.index]  # keep target aligned with sampled rows
    mse, r2, rmsle = evaluate_model(model, fit_X, fit_y, X_test, y_test)
    print(f'{name} - Mean Squared Error: {mse}, R-squared: {r2}, RMSLE: {rmsle}')

# Load the test data for predictions
df1 = pd.read_csv("data/test.csv")

# Numeric columns: impute with the TRAINING means. `df` is already
# mean-imputed and filling with the mean does not change a column's mean, so
# `df[...].mean()` is the training mean.
numeric_cols = df1.select_dtypes(include=['float64', 'int64']).columns
df1[numeric_cols] = df1[numeric_cols].fillna(df[numeric_cols].mean())

# Categorical columns: the codes must match the ones the models were trained
# on. `df` no longer holds the raw strings (it was label-encoded in place) and
# `label_encoders` may have been refit on test data by an earlier cell, so the
# encoders are refit here on the raw training categories, exactly the way the
# training frame was encoded (`astype(str)`, missing -> "nan").
object_cols = df1.select_dtypes(include=['object']).columns
train_categories = pd.read_csv("data/train.csv", usecols=list(object_cols))

label_encoders = {}
for col in object_cols:
    le = LabelEncoder().fit(train_categories[col].astype(str))
    # `classes_` is sorted: searchsorted yields the exact training code for a
    # known value and, for an unseen one, its position in that sorted order
    # (capped to the last valid code).
    codes = np.searchsorted(le.classes_, df1[col].astype(str).to_numpy())
    df1[col] = np.minimum(codes, len(le.classes_) - 1)
    label_encoders[col] = le

# Same columns, in the same order, as the frame the models were fitted on.
X_test_final = df1[X.columns]

# Make predictions for all models on the test set
for name, model in models.items():
    y_pred_test = model.predict(X_test_final)
    # Use the ids shipped in test.csv instead of reconstructing them from the
    # row position plus a hard-coded offset.
    sub1 = pd.DataFrame({
        "id": df1["id"],
        "Premium Amount": y_pred_test
    })
    # Save to CSV
    sub1.to_csv(f"{name}_submission.csv", index=False)

CatBoost - Mean Squared Error: 701757.1624008886, R-squared: 0.060925066090999125, RMSLE: 1.136832991894153
