In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE


# Load the preprocessed claims table from the working directory.
df = pd.read_csv("preprocessed_car_claim.csv")

# Target: 1 where OUTCOME is strictly positive, 0 otherwise.
y = (df['OUTCOME'] > 0).astype(int)
# Predictors: every column except the target.
# NOTE(review): the output recorded for this cell lists 'ID' among the
# RFE-selected features; if that column is only a row identifier it should
# probably be excluded here -- confirm against the dataset description.
X = df.drop('OUTCOME', axis=1)

# Hold out 20% of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Resample the training split only; the test split is left untouched.
# NOTE(review): resampling runs before scaling, so SMOTE sees the raw
# feature scales -- confirm this ordering is intended.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Fit the scaler on the resampled training data, then apply the same
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Recursive feature elimination driven by a random forest: one feature is
# dropped per round (step=1) until at most 10 remain.
rfe_model = RandomForestClassifier(random_state=42, n_estimators=100)
num_features_to_select = min(X_train_scaled.shape[1], 10)
rfe = RFE(
    rfe_model,
    n_features_to_select=num_features_to_select,
    step=1,
    verbose=1,
)
rfe.fit(X_train_scaled, y_train_res)

# Map the boolean support mask back to column names for reporting.
selected_features_mask = rfe.support_
selected_features = X.columns[selected_features_mask]
print("\nTop features selected by RFE:")
print(selected_features.tolist())

# Keep only the selected columns in both splits.
X_train_selected, X_test_selected = (
    rfe.transform(split) for split in (X_train_scaled, X_test_scaled)
)

# Final classifier, trained on the reduced feature set.
# NOTE(review): the training data was already resampled by SMOTE above;
# confirm class_weight='balanced' is still wanted on top of that.
final_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    min_samples_split=5,
    max_depth=5,
    n_estimators=200,
).fit(X_train_selected, y_train_res)

# Score the held-out split: hard labels plus the probability of class 1.
y_proba = final_model.predict_proba(X_test_selected)[:, 1]
y_pred = final_model.predict(X_test_selected)

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

Top features selected by RFE:
['ID', 'AGE', 'DRIVING_EXPERIENCE', 'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'POSTAL_CODE', 'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS', 'PAST_ACCIDENTS']

Final Model Performance:
Accuracy: 0.8232
ROC AUC: 0.8941

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.83      0.87      1369
           1       0.69      0.80      0.74       622

    accuracy                           0.82      1991
   macro avg       0.79      0.82      0.80      1991
weighted avg       0.83      0.82      0.83      1991



In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


# Load the preprocessed claims table from the working directory.
df = pd.read_csv("preprocessed_car_claim.csv")

# Target is 1 for a strictly positive OUTCOME and 0 otherwise; all other
# columns are used as predictors.
y = (df['OUTCOME'] > 0).astype(int)
X = df.drop('OUTCOME', axis=1)

# Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Resample the training split only, then standardise both splits with
# statistics learned from the resampled training data.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

scaler = StandardScaler()
scaler.fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# L1-penalised logistic regression, used here only to rank the features by
# the absolute value of their coefficients.
lasso = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=0.1,
    max_iter=1000,
    random_state=42,
).fit(X_train_scaled, y_train_res)

lasso_importance = (
    pd.DataFrame({
        'features': X.columns.tolist(),
        'importance': np.abs(lasso.coef_[0]),
    })
    .sort_values(by='importance', ascending=False)
)

# Take the 12 highest-ranked feature names.
# NOTE(review): this does not check that the 12 coefficients are non-zero,
# so a feature the L1 penalty zeroed out could still be picked -- confirm.
top_features = lasso_importance['features'].head(12).values
print("Top 12 features selected by LASSO:")
print(top_features)

# Translate the chosen names into column positions of the scaled arrays.
col_indices = list(map(X.columns.get_loc, top_features))
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]

# Final classifier, trained on the reduced feature set.
final_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    min_samples_split=2,
    max_depth=4,
    n_estimators=100,
).fit(X_train_selected, y_train_res)

# Score the held-out split: hard labels plus the probability of class 1.
y_proba = final_model.predict_proba(X_test_selected)[:, 1]
y_pred = final_model.predict(X_test_selected)

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Importances from the final forest, aligned with `top_features` and sorted
# in descending order. The table is built but not printed in this cell.
feature_importance = (
    pd.DataFrame({
        'features': top_features,
        'importance': final_model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)



Top 12 features selected by LASSO:
['DRIVING_EXPERIENCE' 'VEHICLE_OWNERSHIP' 'VEHICLE_YEAR' 'POSTAL_CODE'
 'MARRIED' 'GENDER' 'CHILDREN' 'SPEEDING_VIOLATIONS' 'RACE' 'EDUCATION'
 'PAST_ACCIDENTS' 'VEHICLE_TYPE']

Final Model Performance:
Accuracy: 0.8177
ROC AUC: 0.8834

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.83      0.86      1369
           1       0.68      0.79      0.73       622

    accuracy                           0.82      1991
   macro avg       0.79      0.81      0.80      1991
weighted avg       0.83      0.82      0.82      1991



In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE


# Load the preprocessed claims table from the working directory.
df = pd.read_csv("preprocessed_car_claim.csv")

# Target is 1 for a strictly positive OUTCOME and 0 otherwise; all other
# columns are used as predictors.
y = (df['OUTCOME'] > 0).astype(int)
X = df.drop(columns=['OUTCOME'])

# Stratified 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Resample the training split only; the test split is left untouched.
# NOTE(review): resampling runs before scaling, so SMOTE sees the raw
# feature scales -- confirm this ordering is intended.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Standardise both splits with statistics learned from the resampled
# training data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Never ask for more features than the table actually has; the same bound is
# used for the selector and for the ranking below so the two cannot disagree.
top_n = min(12, X_train_scaled.shape[1])

# Univariate ANOVA F-test of every feature against the target.
selector = SelectKBest(score_func=f_classif, k=top_n)
selector.fit(X_train_scaled, y_train_res)

# One row per feature with its F statistic and p-value, best score first.
anova_scores = pd.DataFrame({
    'features': X.columns.tolist(),
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)

top_features = anova_scores['features'].head(top_n).values
print("Top 12 features selected by ANOVA F-test:")
print(top_features)

# Translate the chosen names into column positions of the scaled arrays.
col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]

# Final classifier, trained on the reduced feature set.
final_model = RandomForestClassifier(
    n_estimators=41,
    max_depth=5,
    min_samples_split=5,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)

# Score the held-out split: hard labels plus the probability of class 1.
y_pred = final_model.predict(X_test_selected)
y_proba = final_model.predict_proba(X_test_selected)[:, 1]

print("\nFinal Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Importances from the final forest, aligned with `top_features` (the column
# order the model was trained on) and listed from most to least important.
# The original row index is kept so each row shows the feature's position in
# the ANOVA ranking. The recorded output of this cell contains this table.
feature_importance = pd.DataFrame({
    'feature': top_features,
    'importance': final_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance from Final Model:")
print(feature_importance)




Top 12 features selected by ANOVA F-test:
['DRIVING_EXPERIENCE' 'AGE' 'VEHICLE_OWNERSHIP' 'MARRIED' 'PAST_ACCIDENTS'
 'CHILDREN' 'SPEEDING_VIOLATIONS' 'CREDIT_SCORE' 'VEHICLE_YEAR' 'DUIS'
 'EDUCATION' 'ANNUAL_MILEAGE']

Final Model Performance:
Accuracy: 0.8112
ROC AUC: 0.8739

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86      1369
           1       0.67      0.77      0.72       622

    accuracy                           0.81      1991
   macro avg       0.78      0.80      0.79      1991
weighted avg       0.82      0.81      0.81      1991


Feature Importance from Final Model:
                feature  importance
0    DRIVING_EXPERIENCE    0.315654
2     VEHICLE_OWNERSHIP    0.232764
1                   AGE    0.191162
4        PAST_ACCIDENTS    0.091896
6   SPEEDING_VIOLATIONS    0.044818
3               MARRIED    0.040041
8          VEHICLE_YEAR    0.036433
5              CHILDREN    0.019514
7        