In [34]:
import pandas as pd

In [35]:
df = pd.read_csv('train.csv')

In [36]:
df.drop(columns=['Unnamed: 0','policy_id'],axis=1,inplace=True)

In [37]:
def get_months(x):
    ym = x.split('and')
    if len(ym) == 2:
        years = int(ym[0].split()[0])
        months = int(ym[1].split()[0])
    elif len(ym) == 1 and 'month' in ym[0]:
        months = int(ym[0].split()[0])
        years = 0
    elif len(ym) == 1 and 'year' in ym[0]:
        years = int(ym[0].split()[0])
        months = 0
    return years*12 + months

In [38]:
df['age_of_car']  = df['age_of_car'].apply(get_months)

In [39]:
for col in df:
    df[col] = df[col].replace({'Yes':1,'No':0})

In [49]:
X_train = df.drop('age_of_policyholder',axis=1)
y_train = df['age_of_policyholder']

In [53]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns to apply StandardScaler and OneHotEncoder
numeric_cols = list(set(X_train) - set(X_train.select_dtypes('O')))
categorical_cols = list(set(X_train.select_dtypes('O')))

# Define preprocessing steps for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values
    ('scaler', StandardScaler())  # Standard scaling
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

# Combine preprocessing steps using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

# Apply preprocessing to the entire dataset
X_train = preprocessor.fit_transform(X_train)

In [54]:
X_train.shape

(40000, 110)

In [55]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [72]:
param = {'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.04264031156015375,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': 0.7588282465072769,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.00962114482009193,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 229,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 0.8981042826479537,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

# this param does not makes much difference
# Use decision tree for classification

reg = GradientBoostingRegressor(**param)


reg.fit(X_train,y_train)

In [57]:
df_test = pd.read_csv('test.csv')

In [58]:
df_test.drop(columns=['Unnamed: 0','policy_id'],axis=1,inplace=True)

In [59]:
df_test['age_of_car']  = df_test['age_of_car'].apply(get_months)

for col in df_test:
    df_test[col] = df_test[col].replace({'Yes':1,'No':0})

X_test = df_test.drop('age_of_policyholder',axis=1)
y_test = df_test['age_of_policyholder']

X_test = preprocessor.transform(X_test)

In [73]:
y_pred = reg.predict(X_test)

In [74]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,y_pred)
mse

90.82739469501759

In [242]:
# 1) check correlation
# try pycaret 

In [243]:
from sklearn.linear_model import LogisticRegression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from category_encoders import OrdinalEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay,f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.drop(columns=['Unnamed: 0','policy_id'],axis=1,inplace=True)
df_test.drop(columns=['Unnamed: 0','policy_id'],axis=1,inplace=True)

In [4]:
def get_months(x):
    ym = x.split('and')
    if len(ym) == 2:
        years = int(ym[0].split()[0])
        months = int(ym[1].split()[0])
    elif len(ym) == 1 and 'month' in ym[0]:
        months = int(ym[0].split()[0])
        years = 0
    elif len(ym) == 1 and 'year' in ym[0]:
        years = int(ym[0].split()[0])
        months = 0
    return years*12 + months

In [5]:
df_train['age_of_car']  = df_train['age_of_car'].apply(get_months)
df_test['age_of_car']  = df_test['age_of_car'].apply(get_months)

In [7]:
for col in df_train:
    df_train[col] = df_train[col].replace({'Yes':1,'No':0})
    df_test[col] = df_test[col].replace({'Yes':1,'No':0})

In [8]:
X_train = df_train.drop('is_claim',axis=1)
y_train = df_train['is_claim']

In [9]:
X_test = df_test.drop('is_claim',axis=1)
y_test = df_test['is_claim']

In [10]:
from imblearn.over_sampling import RandomOverSampler
sampler = RandomOverSampler(random_state=1)
X_train_over, y_train_over = sampler.fit_resample(X_train,y_train)

In [11]:
X_train_over['is_claim'] = y_train_over

In [255]:
lr = make_pipeline(OneHotEncoder(),
                   LogisticRegression())
dt = make_pipeline(OrdinalEncoder(),
                  DecisionTreeClassifier(random_state=1))
rf = make_pipeline(OrdinalEncoder(),
                  RandomForestClassifier(random_state=1))
gb = make_pipeline(OrdinalEncoder(),
                  GradientBoostingClassifier(random_state=1))

In [256]:
model_list = [("lr", lr),("dt", dt),("rf", rf),("gb", gb)]

In [261]:
for name, model in model_list:
    
    model.fit(X_train_over, y_train_over)
    
    y_pred = model.predict(X_test)
    
    score = f1_score(y_test, y_pred,average='macro')
    print(f"The test accuracy score of {name} is {score}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The test accuracy score of lr is 0.417769709840323
The test accuracy score of dt is 0.5072867175377134
The test accuracy score of rf is 0.5042491766705003
The test accuracy score of gb is 0.4420553641615713


In [92]:
import pycaret
from pycaret.classification import *
from pycaret.classification import predict_model

In [93]:
df_train = pd.read_csv('train1.csv')
df_test = pd.read_csv('test1.csv')

In [99]:
s = setup(df_train, target = 'target', session_id = 42, fold=5,n_jobs=-1,
          verbose=False)

# best = compare_models(verbose=1)
dt = create_model('dt')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8648,0.5081,0.1003,0.0732,0.0846,0.0136,0.0138
1,0.8752,0.5302,0.1375,0.1076,0.1208,0.0547,0.0551
2,0.8577,0.5043,0.1003,0.0676,0.0807,0.0068,0.0069
3,0.8664,0.4996,0.0714,0.0558,0.0627,-0.0081,-0.0082
4,0.8707,0.5179,0.1171,0.0899,0.1017,0.0334,0.0337
Mean,0.867,0.512,0.1053,0.0788,0.0901,0.0201,0.0203
Std,0.0059,0.0109,0.0218,0.0181,0.0197,0.0218,0.022


In [101]:
dt

In [106]:
s = predict_model(tuned_dt,data=df_test)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.8694,0.5081,0.0909,0.074,0.0816,0.0121,0.0121


In [107]:
score = f1_score(s['target'], s['prediction_label'],average='macro')
score

0.5056385219787966

In [316]:
# pip install tune-sklearn ray[tune]

In [27]:
# tune model tune-sklearn
# tuned_dt = tune_model(best, search_library = 'tune-sklearn', search_algorithm = 'hyperopt')

In [108]:
tuned_dt = tune_model(dt, search_library = 'scikit-optimize',early_stopping=True,
                     optimize='f1',n_iter=100)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9129,0.548,0.0487,0.0983,0.0651,0.0249,0.0265
1,0.9195,0.5517,0.0344,0.0952,0.0505,0.0181,0.0207
2,0.9195,0.5763,0.0372,0.1016,0.0545,0.0218,0.0248
3,0.9145,0.5475,0.0286,0.0671,0.0401,0.0029,0.0032
4,0.9141,0.5424,0.0486,0.103,0.066,0.0271,0.0292
Mean,0.9161,0.5532,0.0395,0.093,0.0553,0.0189,0.0209
Std,0.0028,0.0119,0.008,0.0132,0.0097,0.0086,0.0093


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [322]:
pip install scikit-optimize

Note: you may need to restart the kernel to use updated packages.


In [30]:
# tuned_dt = tune_model(best, search_library = 'optuna')

In [312]:
pip install pycaret[tuners] --user

Collecting hyperopt>=0.2.7 (from pycaret[tuners])
  Obtaining dependency information for hyperopt>=0.2.7 from https://files.pythonhosted.org/packages/b6/cd/5b3334d39276067f54618ce0d0b48ed69d91352fbf137468c7095170d0e5/hyperopt-0.2.7-py2.py3-none-any.whl.metadata
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting optuna>=3.0.0 (from pycaret[tuners])
  Obtaining dependency information for optuna>=3.0.0 from https://files.pythonhosted.org/packages/15/da/68883911855d8b4d521f9a370e4e6aab8232b91c1d8d5a8348c4680c6642/optuna-3.6.1-py3-none-any.whl.metadata
  Using cached optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting scikit-optimize>=0.9.0 (from pycaret[tuners])
  Obtaining dependency information for scikit-optimize>=0.9.0 from https://files.pythonhosted.org/packages/90/0e/15deb91b3db0003843e34e72fa865e1d92013781d986fdc65483c99a9f69/scikit_optimize-0.10.1-py2.py3-none-any.whl.metadata
  Using cached scikit_optimize-0.10.1-py2.py3-none-any.whl.metadata (9.



In [306]:
# !pip install optuna

In [47]:
best

In [None]:
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='squared_error',
                          max_depth=3, max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          n_estimators=100, n_iter_no_change=None,
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [63]:
s = predict_model(tuned_dt,data=df_test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Gradient Boosting Regressor,7.7689,91.0181,9.5403,0.0601,0.2477,0.2209


In [69]:
tuned_dt.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.04264031156015375,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': 0.7588282465072769,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.00962114482009193,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 229,
 'n_iter_no_change': None,
 'random_state': 42,
 'subsample': 0.8981042826479537,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [32]:
score = f1_score(s['is_claim'], s['prediction_label'],average='macro')
score

0.49907075577802096

In [33]:
df_train

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,max_torque,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,54,63,37,C9,17804,3,C2,M4,Diesel,250Nm@2750rpm,...,1,1,1,1,1,0,1,1,3,0
1,36,8,42,C3,4076,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0
2,79,4,37,C5,34738,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0
3,46,0,37,C9,17804,1,B1,M8,CNG,82.1Nm@3400rpm,...,0,1,1,1,0,0,1,1,2,0
4,66,63,36,C10,73430,3,C2,M4,Diesel,250Nm@2750rpm,...,1,1,1,1,1,0,1,1,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,25,12,34,C10,73430,1,A,M1,CNG,60Nm@3500rpm,...,0,0,0,1,0,0,0,1,0,0
39996,11,37,30,C8,8794,1,B2,M6,Petrol,113Nm@4400rpm,...,1,1,1,1,1,1,1,1,2,0
39997,96,96,44,C8,8794,3,C2,M4,Diesel,250Nm@2750rpm,...,1,1,1,1,1,0,1,1,3,0
39998,61,25,60,C7,6112,1,B2,M6,Petrol,113Nm@4400rpm,...,1,1,1,1,1,1,1,1,2,0


In [None]:
param = {'criterion': 'gini',
     'max_depth': 50,
     'max_features': 70,
     'min_samples_leaf': 1,
     'min_samples_split': 6,
     'splitter': 'best'}
    
    param1 = {'splitter': 'best',
     'min_samples_split': 4,
     'min_samples_leaf': 1,
     'max_features': 86,
     'max_depth': 39,
     'criterion': 'gini'}
    
    param2 = {'objective': 'binary:logistic',
     'use_label_encoder': True,
     'base_score': 0.5,
     'booster': 'gbtree',
     'colsample_bylevel': 1,
     'colsample_bynode': 1,
     'colsample_bytree': 0.8267970059231334,
     'enable_categorical': False,
     'gamma': 0,
     'gpu_id': -1,
     'importance_type': None,
     'interaction_constraints': '',
     'learning_rate': 0.15117171590296455,
     'max_delta_step': 0,
     'max_depth': 11,
     'min_child_weight': 1,
     'missing': float('nan'),
     'monotone_constraints': '()',
     'n_estimators': 300,
     'n_jobs': -1,
     'num_parallel_tree': 1,
     'predictor': 'auto',
     'random_state': 42,
     'reg_alpha': 1e-10,
     'reg_lambda': 0.3295926053124784,
     'scale_pos_weight': 1.0,
     'subsample': 1.0,
     'tree_method': 'auto',
     'validate_parameters': 1,
     'verbosity': 0}