In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,CustomerID,State,Customer Lifetime Value,Response,Coverage,Coverage Index,Education,Education Index,Effective To Date,Employment Status,...,Policy Type,Policy Type Index,Policy,Policy Index,Renew Offer Type,Sales Channel,Sales Channel Index,Vehicle Size,Vehicle Size Index,Claim over 1k
0,QC35222,California,3622.69,No,Basic,0,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L2,4,3,Web,0,Medsize,1,0
1,AE98193,Washington,10610.21,No,Basic,0,High School or Below,0,1/1/2024,Unemployed,...,Personal Auto,0,Personal L1,0,1,Branch,1,Medsize,1,1
2,TM23514,Oregon,13868.02,No,Extended,1,College,1,1/1/2024,Employed,...,Personal Auto,0,Personal L3,2,3,Web,0,Medsize,1,0
3,QZ42725,Washington,3119.69,No,Basic,0,Bachelor,2,1/1/2024,Unemployed,...,Personal Auto,0,Personal L3,2,2,Agent,2,Medsize,1,0
4,SG81493,Arizona,5999.04,No,Premium,2,Bachelor,2,1/1/2024,Employed,...,Corporate Auto,1,Corporate L1,3,2,Web,0,Medsize,1,0


In [4]:

df_clean = data.drop(columns=['CustomerID'])

df_encoded = pd.get_dummies(df_clean, drop_first=True)

X = df_encoded.drop(columns=['Claim over 1k'])
y = df_encoded['Claim over 1k']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

sfm = SelectFromModel(rf, threshold='mean')
sfm.fit(X_train, y_train)

selected_features = X_train.columns[sfm.get_support()]

from sklearn.feature_selection import RFE

rfe = RFE(estimator=rf, n_features_to_select=10)
rfe.fit(X_train, y_train)

rfe_selected_features = X_train.columns[rfe.get_support()]

rfe_selected_features

Index(['Customer Lifetime Value', 'Coverage Index', 'Employment Status Index',
       'Income', 'Marital Status Index', 'Months Since Last Claim',
       'Months Since Policy Inception', 'Number of Policies', 'Policy Index',
       'Sales Channel Index'],
      dtype='object')

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

data = pd.read_csv('train.csv')  

data['Effective to Date'] = pd.to_datetime(data['Effective To Date'], format='%m/%d/%Y', errors='coerce')

reference_date = pd.Timestamp('2023-01-01') 
data['Days Since Effective Date'] = (data['Effective to Date'] - reference_date).dt.days

data['Days Since Effective Date'].fillna(data['Days Since Effective Date'].median(), inplace=True)

features = ['Sales Channel Index','Marital Status Index',
             'Customer Lifetime Value','Income','Number of Policies','Days Since Effective Date','State','Policy Index']  

X = data[features]
y = data['Claim over 1k']

categorical_cols = ['State']  

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols) 
    ],
    remainder='passthrough') 

X_preprocessed = preprocessor.fit_transform(X)

encoded_state_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(['State'])

X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=list(encoded_state_columns) + features[1:], index=X.index)

states_to_drop = ['State_California', 'State_Nevada', 'State_Arizona', 'State_Washington', 'State_Oregon']
states_to_keep = [col for col in encoded_state_columns if col not in states_to_drop]

X_final = X_preprocessed_df[states_to_keep + features[1:]] 

X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, 
                            min_samples_leaf=2, bootstrap=False, random_state=42)

rf.fit(X_train, y_train)

importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_final.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print("Feature Importances:\n", feature_importance_df)

top_features = feature_importance_df['Feature'][:6].values  
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

rf_top = RandomForestClassifier(n_estimators=300, max_depth=10, min_samples_split=5, 
                                min_samples_leaf=2, bootstrap=False, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=rf_top, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

best_rf.fit(X_train_top, y_train)

y_pred = best_rf.predict(X_test_top)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with top features: {accuracy}")

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Feature Importances:
                      Feature  Importance
2                     Income    0.373407
3         Number of Policies    0.196834
4  Days Since Effective Date    0.180539
1    Customer Lifetime Value    0.114869
5                      State    0.077431
6               Policy Index    0.033034
0       Marital Status Index    0.023886
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; t

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

test_data = pd.read_csv('test.csv')  
test_data['Effective to Date'] = pd.to_datetime(test_data['Effective To Date'], format='%m/%d/%Y', errors='coerce')
test_data['Days Since Effective Date'] = (test_data['Effective to Date'] - reference_date).dt.days
test_data['Days Since Effective Date'].fillna(test_data['Days Since Effective Date'].median(), inplace=True)

X_test_new = test_data[features]

X_test_preprocessed = preprocessor.transform(X_test_new)

X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=list(encoded_state_columns) + features[1:], 
                                      index=test_data.index)

X_test_final = X_test_preprocessed_df.reindex(columns=X_train.columns, fill_value=0)

predictions = rf.predict(X_test_final)

submission = pd.DataFrame({
    'CustomerID': test_data['CustomerID'], 
    'Claim over 1k': predictions
})

submission.to_csv('submission_predictions.csv', index=False)



Submission file created: submission_predictions.csv
