In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import f1_score, mean_squared_error,make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.utils.class_weight import compute_class_weight

In [75]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
# df_train.drop(["fuel_type","is_esc","is_adjustable_steering","is_tpms",
#                        "is_parking_sensors","is_parking_camera","rear_brakes_type",
#                        "steering_type",'is_front_fog_lights','is_rear_window_wiper',
#                        'is_rear_window_washer','is_rear_window_defogger','is_brake_assist',
#                        'is_power_door_locks','is_central_locking', 'is_power_steering',
#                        'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
#                        'is_ecw', 'is_speed_alert'],axis=1,inplace=True)


# df_test.drop(["fuel_type","is_esc","is_adjustable_steering","is_tpms",
#                        "is_parking_sensors","is_parking_camera","rear_brakes_type",
#                        "steering_type",'is_front_fog_lights','is_rear_window_wiper',
#                        'is_rear_window_washer','is_rear_window_defogger','is_brake_assist',
#                        'is_power_door_locks','is_central_locking', 'is_power_steering',
#                        'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
#                        'is_ecw', 'is_speed_alert'],axis=1,inplace=True)

In [4]:
df_train['make'] = df_train['make'].astype('str')
df_test['make'] = df_test['make'].astype('str')

In [5]:
def get_months(x):
    ym = x.split('and')
    if len(ym) == 2:
        years = int(ym[0].split()[0])
        months = int(ym[1].split()[0])
    elif len(ym) == 1 and 'month' in ym[0]:
        months = int(ym[0].split()[0])
        years = 0
    elif len(ym) == 1 and 'year' in ym[0]:
        years = int(ym[0].split()[0])
        months = 0
    return years*12 + months

In [6]:
def preprocess(df):
    df.drop(columns=['Unnamed: 0','policy_id'],axis=1,inplace=True)
    df['age_of_car']  = df['age_of_car'].apply(get_months)
    for col in df:
        df[col] = df[col].replace({'Yes':1,'No':0})
    return df

In [7]:
df_train = preprocess(df_train)
df_test = preprocess(df_test)

In [8]:
df_train['area'] = df_train['length']*df_train['width']*df_train['height']
df_train.drop(columns=['length','width','height'],inplace=True)

df_test['area'] = df_test['length']*df_test['width']*df_test['height']
df_test.drop(columns=['length','width','height'],inplace=True)

df_train[['max_torque_Nm', 'max_torque_rpm']] = df_train['max_torque'].str.split('@', expand=True)
df_train['max_torque_Nm'] = df_train['max_torque_Nm'].str.extract(r'(\d+\.?\d*)').astype(float)
df_train['max_torque_rpm'] = df_train['max_torque_rpm'].str.extract(r'(\d+)').astype(int)
df_train.drop(columns=['max_torque'], inplace=True)

df_test[['max_torque_Nm', 'max_torque_rpm']] = df_test['max_torque'].str.split('@', expand=True)
df_test['max_torque_Nm'] = df_test['max_torque_Nm'].str.extract(r'(\d+\.?\d*)').astype(float)
df_test['max_torque_rpm'] = df_test['max_torque_rpm'].str.extract(r'(\d+)').astype(int)
df_test.drop(columns=['max_torque'], inplace=True)

df_train[['max_power_bhp', 'max_power_rpm']] = df_train['max_power'].str.split('@', expand=True)
df_train['max_power_bhp'] = df_train['max_power_bhp'].str.extract(r'(\d+\.?\d*)').astype(float)
df_train['max_power_rpm'] = df_train['max_power_rpm'].str.extract(r'(\d+)').astype(int)
df_train.drop(columns=['max_power'], inplace=True)

df_test[['max_power_bhp', 'max_power_rpm']] = df_test['max_power'].str.split('@', expand=True)
df_test['max_power_bhp'] = df_test['max_power_bhp'].str.extract(r'(\d+\.?\d*)').astype(float)
df_test['max_power_rpm'] = df_test['max_power_rpm'].str.extract(r'(\d+)').astype(int)
df_test.drop(columns=['max_power'], inplace=True)

In [9]:
# Create correlation matrix
corr_matrix = df_train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.97)] #96

# Drop features 
df_train.drop(to_drop, axis=1, inplace=True)
df_test.drop(to_drop, axis=1, inplace=True)

In [10]:
X_train = df_train.drop('age_of_policyholder',axis=1)
y_train = df_train['age_of_policyholder']

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Columns to apply StandardScaler and OneHotEncoder
numeric_cols = list(set(X_train) - set(X_train.select_dtypes('O')))
categorical_cols = list(set(X_train.select_dtypes('O')))

# Define preprocessing steps for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
    ('scaler', StandardScaler())  # Standard scaling
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Apply preprocessing to the entire dataset
X_train = preprocessor.fit_transform(X_train)

In [60]:
preprocessor.transformers_[0][2]

['area',
 'cylinder',
 'gross_weight',
 'age_of_car',
 'is_day_night_rear_view_mirror',
 'is_speed_alert',
 'is_front_fog_lights',
 'turning_radius',
 'age_of_policyholder',
 'max_torque_rpm',
 'policy_tenure',
 'population_density',
 'is_rear_window_defogger',
 'is_tpms',
 'max_power_bhp',
 'max_power_rpm',
 'is_parking_sensors',
 'is_parking_camera',
 'is_power_steering',
 'airbags',
 'max_torque_Nm',
 'is_esc',
 'is_brake_assist',
 'is_power_door_locks',
 'ncap_rating',
 'displacement',
 'is_adjustable_steering']

In [None]:
X_train_df = pd.DataFrame(X_train_transformed, columns=np.concatenate([feature_names, encoder.transformers_[0][2]]))

In [64]:
a = preprocessor.named_transformers_['num'].get_feature_names_out().tolist() + preprocessor.named_transformers_['cat'].get_feature_names_out().tolist()

In [70]:
np.array(a)

array(['area', 'cylinder', 'gross_weight', 'age_of_car',
       'is_day_night_rear_view_mirror', 'is_speed_alert',
       'is_front_fog_lights', 'turning_radius', 'age_of_policyholder',
       'max_torque_rpm', 'policy_tenure', 'population_density',
       'is_rear_window_defogger', 'is_tpms', 'max_power_bhp',
       'max_power_rpm', 'is_parking_sensors', 'is_parking_camera',
       'is_power_steering', 'airbags', 'max_torque_Nm', 'is_esc',
       'is_brake_assist', 'is_power_door_locks', 'ncap_rating',
       'displacement', 'is_adjustable_steering', 'engine_type_1.0 SCe',
       'engine_type_1.2 L K Series Engine',
       'engine_type_1.2 L K12N Dualjet', 'engine_type_1.5 L U2 CRDi',
       'engine_type_1.5 Turbocharged Revotorq',
       'engine_type_1.5 Turbocharged Revotron',
       'engine_type_F8D Petrol Engine', 'engine_type_G12B',
       'engine_type_K Series Dual jet', 'engine_type_K10C',
       'engine_type_i-DTEC', 'steering_type_Electric',
       'steering_type_Manual', 'st

In [12]:
# # Define the parameter grid
# param_grid = {
#     'n_estimators': [50, 100, 150,200,250,300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': range(3,40,2),
#     'min_samples_split': range(2,40,2),
#     'min_samples_leaf': range(1,40,2),
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# reg = GradientBoostingRegressor(random_state=42)

# grid_search = RandomizedSearchCV(estimator=reg, 
#                            n_jobs=-1,
#                            n_iter=10,
#                            scoring= 'neg_mean_squared_error',
#                            param_distributions=param_grid, 
#                            cv=3, 
#                            verbose=2)
# grid_search.fit(X_train, y_train)

In [13]:
# print(grid_search.best_params_)

In [14]:
# f1_macro

In [15]:
reg = GradientBoostingRegressor(random_state=42)
reg.fit(X_train,y_train)

In [16]:
X_test = df_test.drop('age_of_policyholder',axis=1)
y_test = df_test['age_of_policyholder']

X_test = preprocessor.transform(X_test)

In [17]:
y_pred = reg.predict(X_test)

In [18]:
mse = mean_squared_error(y_test,y_pred)
mse

90.94198192644126

# Classification

In [32]:
def macro_f1(y_test, y_pred):
    score = f1_score(y_test, y_pred,average='macro')
    return score

In [33]:
macro_f1_score = make_scorer(
    macro_f1,
    greater_is_better=True
)

In [34]:
class_weights = compute_class_weight(
    class_weight='balanced', 
    classes = np.unique(y_train), 
    y = y_train
)

In [35]:
# df_train = df_train.sample(frac=1,random_state=42)
X_train = df_train.drop('is_claim',axis=1)
y_train = df_train['is_claim']

X_test = df_test.drop('is_claim',axis=1)
y_test = df_test['is_claim']

# Columns to apply StandardScaler and OneHotEncoder
numeric_cols = list(set(X_train) - set(X_train.select_dtypes('O')))
categorical_cols = list(set(X_train.select_dtypes('O')))

# Define preprocessing steps for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
    ('scaler', StandardScaler())  # Standard scaling
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.combine import SMOTEENN 
sampler = SMOTEENN(random_state=42)
X_train, y_train = sampler.fit_resample(X_train,y_train)

In [36]:
y_train.value_counts()

1    30025
0    24944
Name: is_claim, dtype: int64

In [37]:
grid_param={
    "criterion": ['gini'],
    "splitter":['best'],
#     'random_state': range(1,100,2),
    "max_depth":range(2,40,1),
    'max_features': range(40,92,2),
    "min_samples_split" : range(2,10,1),
    "min_samples_leaf" : range(1,10,1),
}

tree_clas = DecisionTreeClassifier()
grid_search = RandomizedSearchCV(estimator=tree_clas, 
                           n_jobs=-1,
                           n_iter=3000,
                           scoring= macro_f1_score,
                           param_distributions=grid_param, 
                           cv=3, 
                           verbose=2)
grid_search.fit(X_train, y_train)

In [38]:
# grid_search.best_params_

In [39]:
# {'criterion': 'gini',
#  'max_depth': 50,
#  'max_features': 70,
#  'min_samples_leaf': 1,
#  'min_samples_split': 6,
#  'splitter': 'best'} #51.9

In [40]:
# clf = DecisionTreeClassifier(random_state=42,
                             
#                             )
# clf.fit(X_train,y_train)
# y_pred = clf.predict(X_test)

# score = f1_score(y_test, y_pred,average='macro')
# score # off and .97

In [49]:
float('nan')

nan

In [50]:
param = {'criterion': 'gini',
 'max_depth': 50,
 'max_features': 70,
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'splitter': 'best'}


param1 = {'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 0.8267970059231334,
 'enable_categorical': False,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.15117171590296455,
 'max_delta_step': 0,
 'max_depth': 11,
 'min_child_weight': 1,
 'missing': float('nan'),
 'monotone_constraints': '()',
 'n_estimators': 300,
 'n_jobs': -1,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 42,
 'reg_alpha': 1e-10,
 'reg_lambda': 0.3295926053124784,
 'scale_pos_weight': 1.0,
 'subsample': 1.0,
 'tree_method': 'auto',
 'validate_parameters': 1,
 'verbosity': 0}

In [45]:
# import xgboost as xgb
from xgboost import XGBClassifier

In [52]:
# tree_clas = cc

# tree_clas.fit(X_train,y_train)

# for i in range(1,100):
clf = XGBClassifier(**param1)
clf.fit(X_train,y_train)
# cc = grid_search.best_estimator_
y_pred = clf.predict(X_test)

score = f1_score(y_test, y_pred,average='macro')
#     score # off and .97
print(score) # 0.5233373470773133

0.5251134925394796


In [30]:
# policy_ids = test_df_pre["policy_id"]
# submission_df = pd.DataFrame({'policy_id': policy_ids, 'age': predictions})
# submission_df.to_csv('submission.csv', index=False)

In [31]:
# predictions = model_xgb.predict(test_df)
# policy_ids = test_df_pre["policy_id"]
# submission_df = pd.DataFrame({'policy_id': policy_ids, 'prediction': predictions})
# submission_df.to_csv('submission.csv', index=False)

In [81]:
cols = ['max_power',
'age_of_car',
'gross_weight',
'area_cluster',
'population_density',
'turning_radius',
'area_cluster',
'policy_tenure',
'max_torque',
'engine_type',
'age_of_policyholder',
'ncap_rating',
'steering_type',
'segment',
'cylinder',
'is_parking_camera',
'is_brake_assist',
'is_day_night_rear_view_mirror',
'is_rear_window_washer',
'is_rear_window_defogger',
'max_power',
'displacement',
'transmission_type',
'is_front_fog_lights',
'is_parking_sensors',
'is_adjustable_steering',
'is_esc']

In [82]:
df_train[cols]

Unnamed: 0,max_power,age_of_car,gross_weight,area_cluster,population_density,turning_radius,area_cluster.1,policy_tenure,max_torque,engine_type,age_of_policyholder,ncap_rating,steering_type,segment,cylinder,is_parking_camera,is_brake_assist,is_day_night_rear_view_mirror,is_rear_window_washer,is_rear_window_defogger,max_power.1,displacement,transmission_type,is_front_fog_lights,is_parking_sensors,is_adjustable_steering,is_esc
0,113.45bhp@4000rpm,5 years and 3 months,1720,C9,17804,5.2,C9,54,250Nm@2750rpm,1.5 L U2 CRDi,37,3,Power,C2,4,Yes,Yes,No,Yes,Yes,113.45bhp@4000rpm,1493,Automatic,Yes,Yes,Yes,Yes
1,40.36bhp@6000rpm,8 months,1185,C3,4076,4.6,C3,36,60Nm@3500rpm,F8D Petrol Engine,42,0,Power,A,3,No,No,No,No,No,40.36bhp@6000rpm,796,Manual,No,Yes,No,No
2,40.36bhp@6000rpm,4 months,1185,C5,34738,4.6,C5,79,60Nm@3500rpm,F8D Petrol Engine,37,0,Power,A,3,No,No,No,No,No,40.36bhp@6000rpm,796,Manual,No,Yes,No,No
3,55.92bhp@5300rpm,0 months,1340,C9,17804,4.7,C9,46,82.1Nm@3400rpm,K10C,37,2,Power,B1,3,No,No,No,No,No,55.92bhp@5300rpm,998,Manual,No,Yes,No,No
4,113.45bhp@4000rpm,5 years and 3 months,1720,C10,73430,5.2,C10,66,250Nm@2750rpm,1.5 L U2 CRDi,36,3,Power,C2,4,Yes,Yes,No,Yes,Yes,113.45bhp@4000rpm,1493,Automatic,Yes,Yes,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,40.36bhp@6000rpm,1 years and 0 months,1185,C10,73430,4.6,C10,25,60Nm@3500rpm,F8D Petrol Engine,34,0,Power,A,3,No,No,No,No,No,40.36bhp@6000rpm,796,Manual,No,Yes,No,No
39996,88.50bhp@6000rpm,3 years and 1 months,1335,C8,8794,4.8,C8,11,113Nm@4400rpm,K Series Dual jet,30,2,Electric,B2,4,No,Yes,Yes,No,No,88.50bhp@6000rpm,1197,Manual,Yes,Yes,Yes,No
39997,113.45bhp@4000rpm,8 years and 0 months,1720,C8,8794,5.2,C8,96,250Nm@2750rpm,1.5 L U2 CRDi,44,3,Power,C2,4,Yes,Yes,No,Yes,Yes,113.45bhp@4000rpm,1493,Automatic,Yes,Yes,Yes,Yes
39998,88.50bhp@6000rpm,2 years and 1 months,1335,C7,6112,4.8,C7,61,113Nm@4400rpm,K Series Dual jet,60,2,Electric,B2,4,No,Yes,Yes,No,No,88.50bhp@6000rpm,1197,Manual,Yes,Yes,Yes,No


In [76]:
df_train['max']

Index(['Unnamed: 0', 'policy_id', 'policy_tenure', 'age_of_car',
       'age_of_policyholder', 'area_cluster', 'population_density', 'make',
       'segment', 'model', 'fuel_type', 'max_torque', 'max_power',
       'engine_type', 'airbags', 'is_esc', 'is_adjustable_steering', 'is_tpms',
       'is_parking_sensors', 'is_parking_camera', 'rear_brakes_type',
       'displacement', 'cylinder', 'transmission_type', 'gear_box',
       'steering_type', 'turning_radius', 'length', 'width', 'height',
       'gross_weight', 'is_front_fog_lights', 'is_rear_window_wiper',
       'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist',
       'is_power_door_locks', 'is_central_locking', 'is_power_steering',
       'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
       'is_ecw', 'is_speed_alert', 'ncap_rating', 'is_claim'],
      dtype='object')