In [22]:
##RFE
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE


df = pd.read_csv("preprocessed_car_claim.csv")


y = df['OUTCOME'].apply(lambda x: 1 if x > 0 else 0)
X = df.drop(columns=['OUTCOME'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


rfe_model = RandomForestClassifier(n_estimators=100, random_state=42)
num_features_to_select = min(10, X_train_res.shape[1])
rfe = RFE(estimator=rfe_model, 
          n_features_to_select=num_features_to_select, 
          step=1,
          verbose=1)
rfe.fit(X_train_res, y_train_res)

selected_features_mask = rfe.support_
selected_features = X.columns[selected_features_mask]
print("\nTop features selected by RFE:")
print(selected_features.tolist())


X_train_selected = X_train_res.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)


final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=5,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_scaled, y_train_res)


def predict_from_user_input():
    print("\nPlease enter values for the following features:")
    user_data = {}
    for feature in selected_features:
        value = float(input(f"Enter value for {feature}: "))
        user_data[feature] = [value]

    
    user_df = pd.DataFrame(user_data)[selected_features]
    
    
    user_scaled = scaler.transform(user_df)
    
   
    prediction = final_model.predict(user_scaled)
    probability = final_model.predict_proba(user_scaled)[:, 1]

    print("\nPrediction Results:")
    print(f"Predicted Class: {'Claim' if prediction[0] == 1 else 'No Claim'}")
    print(f"Probability of Claim: {probability[0]:.4f}")


predict_from_user_input()

Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.

Top features selected by RFE:
['ID', 'AGE', 'DRIVING_EXPERIENCE', 'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'POSTAL_CODE', 'ANNUAL_MILEAGE', 'SPEEDING_VIOLATIONS', 'PAST_ACCIDENTS']

Please enter values for the following features:


Enter value for ID:  567876
Enter value for AGE:  16
Enter value for DRIVING_EXPERIENCE:  0
Enter value for CREDIT_SCORE:  0.87
Enter value for VEHICLE_OWNERSHIP:  0
Enter value for VEHICLE_YEAR:  2000
Enter value for POSTAL_CODE:  10238
Enter value for ANNUAL_MILEAGE:  0.18
Enter value for SPEEDING_VIOLATIONS:  3
Enter value for PAST_ACCIDENTS:  3



Prediction Results:
Predicted Class: Claim
Probability of Claim: 0.6752


### LASSO

In [26]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt


df = pd.read_csv("preprocessed_car_claim.csv")


y = df['OUTCOME'].apply(lambda x: 1 if x > 0 else 0)
X = df.drop(columns=['OUTCOME'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


lasso = LogisticRegression(penalty='l1', solver='liblinear', random_state=42, C=0.1, max_iter=1000)
lasso.fit(X_train_scaled, y_train_res)


lasso_importance = pd.DataFrame({
    'features': X.columns.tolist(),
    'importance': np.abs(lasso.coef_[0])
}).sort_values('importance', ascending=False)


top_features = lasso_importance.head(12)['features'].values
print("Top 12 features selected by LASSO:")
print(top_features)


col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]


final_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    min_samples_split=2,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)


def predict_from_user_input():
    print("\nPlease enter values for the following features:")
    user_data = {}
    for feature in top_features:  
        value = float(input(f"Enter value for {feature}: "))
        user_data[feature] = [value]

    user_df = pd.DataFrame(user_data)[top_features]  
    
   
    full_features_df = pd.DataFrame(columns=X.columns)
    for col in X.columns:
        full_features_df[col] = [0] 
    
   
    for feature in top_features:
        full_features_df[feature] = user_data[feature]
    
    
    user_scaled_all = scaler.transform(full_features_df)
    
    
    user_scaled = user_scaled_all[:, col_indices]
    
   
    prediction = final_model.predict(user_scaled)
    probability = final_model.predict_proba(user_scaled)[:, 1]

    print("\nPrediction Results:")
    print(f"Predicted Class: {'Claim' if prediction[0] == 1 else 'No Claim'}")
    print(f"Probability of Claim: {probability[0]:.4f}")


predict_from_user_input()

Top 12 features selected by LASSO:
['DRIVING_EXPERIENCE' 'VEHICLE_OWNERSHIP' 'VEHICLE_YEAR' 'POSTAL_CODE'
 'MARRIED' 'GENDER' 'CHILDREN' 'SPEEDING_VIOLATIONS' 'RACE' 'EDUCATION'
 'PAST_ACCIDENTS' 'VEHICLE_TYPE']

Please enter values for the following features:


Enter value for DRIVING_EXPERIENCE:  0
Enter value for VEHICLE_OWNERSHIP:  0
Enter value for VEHICLE_YEAR:  2000
Enter value for POSTAL_CODE:  10238
Enter value for MARRIED:  0
Enter value for GENDER:  1
Enter value for CHILDREN:  0
Enter value for SPEEDING_VIOLATIONS:  3
Enter value for RACE:  1
Enter value for EDUCATION:  2
Enter value for PAST_ACCIDENTS:  3
Enter value for VEHICLE_TYPE:  0



Prediction Results:
Predicted Class: Claim
Probability of Claim: 0.7441


### ANOVA

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE


df = pd.read_csv("preprocessed_car_claim.csv")


y = df['OUTCOME'].apply(lambda x: 1 if x > 0 else 0)
X = df.drop(columns=['OUTCOME'])


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)


k = min(12, X.shape[1])
selector = SelectKBest(score_func=f_classif, k=k)
selector.fit(X_train_scaled, y_train_res)


anova_scores = pd.DataFrame({
    'features': X.columns.tolist(),
    'f_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values('f_score', ascending=False)


top_features = anova_scores['features'].head(k).values
print(f"Top {k} features selected by ANOVA F-test:")
print(top_features)


col_indices = [X.columns.get_loc(col) for col in top_features]
X_train_selected = X_train_scaled[:, col_indices]
X_test_selected = X_test_scaled[:, col_indices]


final_model = RandomForestClassifier(
    n_estimators=41,
    max_depth=5,
    min_samples_split=5,
    random_state=42,
    class_weight='balanced'
)
final_model.fit(X_train_selected, y_train_res)

def predict_from_user_input():
    print("\nPlease enter values for the following features:")
    

    user_data = {col: [0] for col in X.columns} 
    
    
    for feature in top_features:
        value = float(input(f"Enter value for {feature}: "))
        user_data[feature] = [value]
    
    
    user_df = pd.DataFrame(user_data)
    
   
    user_scaled = scaler.transform(user_df)
    
    
    user_selected = user_scaled[:, col_indices]
    
    
    prediction = final_model.predict(user_selected)
    probability = final_model.predict_proba(user_selected)[:, 1]

    print("\nPrediction Results:")
    print(f"Predicted Class: {'Claim' if prediction[0] == 1 else 'No Claim'}")
    print(f"Probability of Claim: {probability[0]:.4f}")


predict_from_user_input()

Top 12 features selected by ANOVA F-test:
['DRIVING_EXPERIENCE' 'AGE' 'VEHICLE_OWNERSHIP' 'MARRIED' 'PAST_ACCIDENTS'
 'CHILDREN' 'SPEEDING_VIOLATIONS' 'CREDIT_SCORE' 'VEHICLE_YEAR' 'DUIS'
 'EDUCATION' 'ANNUAL_MILEAGE']

Please enter values for the following features:


Enter value for DRIVING_EXPERIENCE:  0
Enter value for AGE:  16
Enter value for VEHICLE_OWNERSHIP:  0
Enter value for MARRIED:  0
Enter value for PAST_ACCIDENTS:  3
Enter value for CHILDREN:  0
Enter value for SPEEDING_VIOLATIONS:  3
Enter value for CREDIT_SCORE:  0.87
Enter value for VEHICLE_YEAR:  2000
Enter value for DUIS:  1
Enter value for EDUCATION:  1
Enter value for ANNUAL_MILEAGE:  0.18



Prediction Results:
Predicted Class: Claim
Probability of Claim: 0.6632
