In [1]:
import warnings
warnings.filterwarnings("ignore")

import time
import glob

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

from sklearn.metrics import jaccard_score


tqdm.pandas()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
train_df = pd.read_csv('train.csv')
print("Training Data:" , train_df.shape)

test_df = pd.read_csv('test.csv')
print("Test Data:" , test_df.shape)

train_df.head()

Training Data: (1200000, 21)
Test Data: (800000, 20)


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [3]:
#Policy Start Date drop 
train_df.drop('Policy Start Date', axis=1, inplace=True)
test_df.drop('Policy Start Date', axis=1, inplace=True)

#Drop ID
train_df.drop('id', axis=1, inplace=True)
test_df.drop('id', axis=1, inplace=True)

In [4]:
print(train_df['Credit Score'].isnull().sum())# 109672
print(test_df['Credit Score'].isnull().sum())# 73130

137882
91451


In [5]:
print(train_df['Occupation'].isnull().sum())# 358075
print(test_df['Occupation'].isnull().sum())# 239125

358075
239125


In [6]:
#Age nan values
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())

#Annual Income nan values
train_df['Annual Income'] = train_df['Annual Income'].fillna(train_df['Annual Income'].mean())
test_df['Annual Income'] = test_df['Annual Income'].fillna(test_df['Annual Income'].mean())

#marital status nan values
train_df['Marital Status'] = train_df['Marital Status'].fillna('other')
test_df['Marital Status'] = test_df['Marital Status'].fillna('other')

#Number of Dependents nan values
train_df['Number of Dependents'] = train_df['Number of Dependents'].fillna(0)
test_df['Number of Dependents'] = test_df['Number of Dependents'].fillna(0)

#Occupation nan values
train_df['Occupation'] = train_df['Occupation'].fillna('other')
test_df['Occupation'] = test_df['Occupation'].fillna('other')

#Health Score nan values
train_df['Health Score'] = train_df['Health Score'].fillna(train_df['Health Score'].mode()[0])
test_df['Health Score'] = test_df['Health Score'].fillna(test_df['Health Score'].mode()[0])

#Previous Claims nan values
train_df['Previous Claims'] = train_df['Previous Claims'].fillna(train_df['Previous Claims'].mode()[0])
test_df['Previous Claims'] = test_df['Previous Claims'].fillna(test_df['Previous Claims'].mode()[0])

#Vehicle Age nan values
train_df['Vehicle Age'] = train_df['Vehicle Age'].fillna(train_df['Vehicle Age'].mode()[0])
test_df['Vehicle Age'] = test_df['Vehicle Age'].fillna(test_df['Vehicle Age'].mode()[0])

#Credit Score nan values
train_df['Credit Score'] = train_df['Credit Score'].fillna(train_df['Credit Score'].mean())
test_df['Credit Score'] = test_df['Credit Score'].fillna(test_df['Credit Score'].mean())

#Insurance Duration nan values
train_df['Insurance Duration'] = train_df['Insurance Duration'].fillna(train_df['Insurance Duration'].mode()[0])
test_df['Insurance Duration'] = test_df['Insurance Duration'].fillna(test_df['Insurance Duration'].mode()[0])

#Customer Feedback nan values
train_df['Customer Feedback'] = train_df['Customer Feedback'].fillna('other')
test_df['Customer Feedback'] = test_df['Customer Feedback'].fillna('other')

train_df.head()
test_df.head()



Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,28.0,Female,2310.0,other,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,0.0,19.0,592.904749,1.0,Poor,Yes,Weekly,House
1,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,0.0,14.0,372.0,8.0,Good,Yes,Rarely,Apartment
2,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,0.0,16.0,819.0,9.0,Average,Yes,Monthly,Condo
3,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,Poor,Yes,Daily,House
4,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,0.0,14.0,755.0,7.0,Average,No,Weekly,House


In [7]:
columns = [
    'Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 
    'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 
    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 
    'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type'
]

for column in columns:
    print(train_df[column].isnull().sum())  # 0
    print(test_df[column].isnull().sum())  # 0

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [8]:
#Age Min Max Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_df['Age'] = scaler.fit_transform(train_df['Age'].values.reshape(-1, 1))
test_df['Age'] = scaler.transform(test_df['Age'].values.reshape(-1, 1))

In [9]:
#Annual Income Min Max Scaling
scaler = MinMaxScaler()
train_df['Annual Income'] = scaler.fit_transform(train_df['Annual Income'].values.reshape(-1, 1))
test_df['Annual Income'] = scaler.transform(test_df['Annual Income'].values.reshape(-1, 1))

In [10]:
#Number of Dependents Min Max Scaling
scaler = MinMaxScaler()
train_df['Number of Dependents'] = scaler.fit_transform(train_df['Number of Dependents'].values.reshape(-1, 1))
test_df['Number of Dependents'] = scaler.transform(test_df['Number of Dependents'].values.reshape(-1, 1))

In [11]:
#Health Score Min Max Scaling
scaler = MinMaxScaler()
train_df['Health Score'] = scaler.fit_transform(train_df['Health Score'].values.reshape(-1, 1))
test_df['Health Score'] = scaler.transform(test_df['Health Score'].values.reshape(-1, 1))

In [12]:
#Vehicle Age Min Max Scaling
scaler = MinMaxScaler()
train_df['Vehicle Age'] = scaler.fit_transform(train_df['Vehicle Age'].values.reshape(-1, 1))
test_df['Vehicle Age'] = scaler.transform(test_df['Vehicle Age'].values.reshape(-1, 1))

In [13]:
#Previous Claims Min Max Scaling
scaler = MinMaxScaler()
train_df['Previous Claims'] = scaler.fit_transform(train_df['Previous Claims'].values.reshape(-1, 1))
test_df['Previous Claims'] = scaler.transform(test_df['Previous Claims'].values.reshape(-1, 1))

In [14]:
#Credit Score Min Max Scaling
scaler = MinMaxScaler()
train_df['Credit Score'] = scaler.fit_transform(train_df['Credit Score'].values.reshape(-1, 1))
test_df['Credit Score'] = scaler.transform(test_df['Credit Score'].values.reshape(-1, 1))

In [15]:
#Insurance Duration Min Max Scaling
scaler = MinMaxScaler()
train_df['Insurance Duration'] = scaler.fit_transform(train_df['Insurance Duration'].values.reshape(-1, 1))
test_df['Insurance Duration'] = scaler.transform(test_df['Insurance Duration'].values.reshape(-1, 1))

In [16]:
#Smoking Status One Hot Encoding
train_df = pd.get_dummies(train_df, columns=['Smoking Status'])
test_df = pd.get_dummies(test_df, columns=['Smoking Status'])

In [17]:
#Gender ONE HOT ENCODING
train_df = pd.get_dummies(train_df, columns=['Gender'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Gender'], drop_first=True)

In [18]:
#Martial Status ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Marital Status']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Marital Status']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Marital Status'], inplace=True)

test_encoded = encoder.transform(test_df[['Marital Status']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Marital Status']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Marital Status'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other
0,0.021739,0.066988,0.25,Bachelor's,Self-Employed,0.361397,Urban,Premium,0.222222,0.894737,0.131148,0.5,Poor,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0
1,0.456522,0.211186,0.75,Master's,other,0.238002,Rural,Comprehensive,0.111111,0.631579,0.717668,0.125,Average,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0
2,0.108696,0.170678,0.75,High School,Self-Employed,0.792879,Suburban,Premium,0.111111,0.736842,0.53356,0.25,Good,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0
3,0.065217,0.945719,0.5,Bachelor's,other,0.156695,Rural,Basic,0.111111,0.0,0.12204,0.0,Poor,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0
4,0.065217,0.26434,0.25,Bachelor's,Self-Employed,0.322378,Rural,Premium,0.0,0.421053,0.542805,0.375,Poor,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0


In [19]:
#Occupation ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Occupation']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Occupation']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Occupation'], inplace=True)

test_encoded = encoder.transform(test_df[['Occupation']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Occupation']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Occupation'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Education Level,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other
0,0.021739,0.066988,0.25,Bachelor's,0.361397,Urban,Premium,0.222222,0.894737,0.131148,0.5,Poor,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.456522,0.211186,0.75,Master's,0.238002,Rural,Comprehensive,0.111111,0.631579,0.717668,0.125,Average,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.108696,0.170678,0.75,High School,0.792879,Suburban,Premium,0.111111,0.736842,0.53356,0.25,Good,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.065217,0.945719,0.5,Bachelor's,0.156695,Rural,Basic,0.111111,0.0,0.12204,0.0,Poor,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.065217,0.26434,0.25,Bachelor's,0.322378,Rural,Premium,0.0,0.421053,0.542805,0.375,Poor,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [20]:
#Education Level ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Education Level']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Education Level']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Education Level'], inplace=True)

test_encoded = encoder.transform(test_df[['Education Level']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Education Level']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Education Level'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD
0,0.021739,0.066988,0.25,0.361397,Urban,Premium,0.222222,0.894737,0.131148,0.5,Poor,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.456522,0.211186,0.75,0.238002,Rural,Comprehensive,0.111111,0.631579,0.717668,0.125,Average,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.108696,0.170678,0.75,0.792879,Suburban,Premium,0.111111,0.736842,0.53356,0.25,Good,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.065217,0.945719,0.5,0.156695,Rural,Basic,0.111111,0.0,0.12204,0.0,Poor,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.065217,0.26434,0.25,0.322378,Rural,Premium,0.0,0.421053,0.542805,0.375,Poor,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [21]:
#Location ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Location']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Location']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Location'], inplace=True)

test_encoded = encoder.transform(test_df[['Location']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Location']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Location'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Location_Rural,Location_Suburban,Location_Urban
0,0.021739,0.066988,0.25,0.361397,Premium,0.222222,0.894737,0.131148,0.5,Poor,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.456522,0.211186,0.75,0.238002,Comprehensive,0.111111,0.631579,0.717668,0.125,Average,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.108696,0.170678,0.75,0.792879,Premium,0.111111,0.736842,0.53356,0.25,Good,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.065217,0.945719,0.5,0.156695,Basic,0.111111,0.0,0.12204,0.0,Poor,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.065217,0.26434,0.25,0.322378,Premium,0.0,0.421053,0.542805,0.375,Poor,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [22]:
#Policy Type ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Policy Type']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Policy Type']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Policy Type'], inplace=True)

test_encoded = encoder.transform(test_df[['Policy Type']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Policy Type']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Policy Type'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Location_Rural,Location_Suburban,Location_Urban,Policy Type_Basic,Policy Type_Comprehensive,Policy Type_Premium
0,0.021739,0.066988,0.25,0.361397,0.222222,0.894737,0.131148,0.5,Poor,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.456522,0.211186,0.75,0.238002,0.111111,0.631579,0.717668,0.125,Average,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.108696,0.170678,0.75,0.792879,0.111111,0.736842,0.53356,0.25,Good,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.065217,0.945719,0.5,0.156695,0.111111,0.0,0.12204,0.0,Poor,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.065217,0.26434,0.25,0.322378,0.0,0.421053,0.542805,0.375,Poor,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [23]:
#Customer Feedback ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Customer Feedback']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Customer Feedback']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Customer Feedback'], inplace=True)

test_encoded = encoder.transform(test_df[['Customer Feedback']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Customer Feedback']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Customer Feedback'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Exercise Frequency,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Location_Rural,Location_Suburban,Location_Urban,Policy Type_Basic,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Average,Customer Feedback_Good,Customer Feedback_Poor,Customer Feedback_other
0,0.021739,0.066988,0.25,0.361397,0.222222,0.894737,0.131148,0.5,Weekly,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.456522,0.211186,0.75,0.238002,0.111111,0.631579,0.717668,0.125,Monthly,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.108696,0.170678,0.75,0.792879,0.111111,0.736842,0.53356,0.25,Weekly,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.065217,0.945719,0.5,0.156695,0.111111,0.0,0.12204,0.0,Daily,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.065217,0.26434,0.25,0.322378,0.0,0.421053,0.542805,0.375,Weekly,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [24]:
#Exercise Frequency ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Exercise Frequency']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Exercise Frequency']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Exercise Frequency'], inplace=True)

test_encoded = encoder.transform(test_df[['Exercise Frequency']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Exercise Frequency']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Exercise Frequency'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Property Type,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Location_Rural,Location_Suburban,Location_Urban,Policy Type_Basic,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Average,Customer Feedback_Good,Customer Feedback_Poor,Customer Feedback_other,Exercise Frequency_Daily,Exercise Frequency_Monthly,Exercise Frequency_Rarely,Exercise Frequency_Weekly
0,0.021739,0.066988,0.25,0.361397,0.222222,0.894737,0.131148,0.5,House,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.456522,0.211186,0.75,0.238002,0.111111,0.631579,0.717668,0.125,House,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.108696,0.170678,0.75,0.792879,0.111111,0.736842,0.53356,0.25,House,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.065217,0.945719,0.5,0.156695,0.111111,0.0,0.12204,0.0,Apartment,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.065217,0.26434,0.25,0.322378,0.0,0.421053,0.542805,0.375,House,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [25]:
#Property Type ONE HOT ENCODING
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
train_encoded = encoder.fit_transform(train_df[['Property Type']])

train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['Property Type']))
train_df = pd.concat([train_df, train_encoded_df], axis=1)
train_df.drop(columns=['Property Type'], inplace=True)

test_encoded = encoder.transform(test_df[['Property Type']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['Property Type']))
test_df = pd.concat([test_df, test_encoded_df], axis=1)
test_df.drop(columns=['Property Type'], inplace=True)
train_df.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount,Smoking Status_No,Smoking Status_Yes,Gender_Male,Marital Status_Divorced,Marital Status_Married,Marital Status_Single,Marital Status_other,Occupation_Employed,Occupation_Self-Employed,Occupation_Unemployed,Occupation_other,Education Level_Bachelor's,Education Level_High School,Education Level_Master's,Education Level_PhD,Location_Rural,Location_Suburban,Location_Urban,Policy Type_Basic,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Average,Customer Feedback_Good,Customer Feedback_Poor,Customer Feedback_other,Exercise Frequency_Daily,Exercise Frequency_Monthly,Exercise Frequency_Rarely,Exercise Frequency_Weekly,Property Type_Apartment,Property Type_Condo,Property Type_House
0,0.021739,0.066988,0.25,0.361397,0.222222,0.894737,0.131148,0.5,2869.0,True,False,False,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.456522,0.211186,0.75,0.238002,0.111111,0.631579,0.717668,0.125,1483.0,False,True,False,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.108696,0.170678,0.75,0.792879,0.111111,0.736842,0.53356,0.25,567.0,False,True,True,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.065217,0.945719,0.5,0.156695,0.111111,0.0,0.12204,0.0,765.0,False,True,True,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.065217,0.26434,0.25,0.322378,0.0,0.421053,0.542805,0.375,2022.0,False,True,True,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [26]:
y_data = train_df['Premium Amount']
X_data = train_df.drop('Premium Amount', axis=1)  # 'Depression' sütunu hedef değişken

print(X_data.head())
print(y_data.head())


        Age  Annual Income  Number of Dependents  Health Score  Previous Claims  Vehicle Age  Credit Score  Insurance Duration  Smoking Status_No  Smoking Status_Yes  Gender_Male  Marital Status_Divorced  Marital Status_Married  Marital Status_Single  Marital Status_other  Occupation_Employed  Occupation_Self-Employed  Occupation_Unemployed  Occupation_other  Education Level_Bachelor's  Education Level_High School  Education Level_Master's  Education Level_PhD  Location_Rural  Location_Suburban  Location_Urban  Policy Type_Basic  Policy Type_Comprehensive  Policy Type_Premium  Customer Feedback_Average  Customer Feedback_Good  Customer Feedback_Poor  Customer Feedback_other  Exercise Frequency_Daily  Exercise Frequency_Monthly  Exercise Frequency_Rarely  Exercise Frequency_Weekly  Property Type_Apartment  Property Type_Condo  Property Type_House
0  0.021739       0.066988                  0.25      0.361397         0.222222     0.894737      0.131148               0.500               T

In [27]:
x_data_test = test_df

In [28]:
train_df.info()
test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 41 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   Age                          1200000 non-null  float64
 1   Annual Income                1200000 non-null  float64
 2   Number of Dependents         1200000 non-null  float64
 3   Health Score                 1200000 non-null  float64
 4   Previous Claims              1200000 non-null  float64
 5   Vehicle Age                  1200000 non-null  float64
 6   Credit Score                 1200000 non-null  float64
 7   Insurance Duration           1200000 non-null  float64
 8   Premium Amount               1200000 non-null  float64
 9   Smoking Status_No            1200000 non-null  bool   
 10  Smoking Status_Yes           1200000 non-null  bool   
 11  Gender_Male                  1200000 non-null  bool   
 12  Marital Status_Divorced      1200000 non-n

In [29]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Veriyi eğitim ve test setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

# Modelleri tanımlama
models = {
    "LightGBM": lgb.LGBMRegressor(),
    
}

# Modelleri eğitme ve değerlendirme
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_test), np.log1p(y_pred)))
    print(f"{name} - MSE: {mse}, RMSLE: {rmsle}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 916
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 40
[LightGBM] [Info] Start training from score 1102.505529
LightGBM - MSE: 705970.3358667495, RMSLE: 1.1408407266084395


In [31]:
y_pred_test = models['LightGBM'].predict(x_data_test)

submission_df = pd.read_csv('sample_submission.csv')
submission_df['Premium Amount'] = y_pred_test
submission_df.to_csv('submission.csv', index=False)