# ****Vehicle Claims Prediction Project****
@Yevheniia-Rudenko

29/04/2025


In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import f1_score

In [2]:
# Load datasets 
labeled_data = pd.read_csv("labeled_data.csv", sep=";")
unlabeled_data = pd.read_csv("unlabeled_data.csv", sep=";")

In [3]:
print(labeled_data.columns)


Index(['ID', 'Maker', ' Genmodel', ' Genmodel_ID', 'Adv_year', 'Adv_month',
       'Color', 'Reg_year', 'Bodytype', 'Runned_Miles', 'Engin_size',
       'Gearbox', 'Fuel_type', 'Price', 'Seat_num', 'Door_num', 'issue',
       'issue_id', 'Adv_day', 'breakdown_date', 'repair_complexity',
       'repair_cost', 'repair_hours', 'repair_date', 'value', 'Label'],
      dtype='object')


In [4]:
id_column = "ID" 
unlabeled_ids = unlabeled_data[id_column] 
drop_cols = ["ID"]

for df in [labeled_data, unlabeled_data]:
    df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [5]:
# Handle anomalies
df["issue"] = df["issue"].str.strip()
df["Engin_size"] = df["Engin_size"].str.replace("L", "").astype(float)
df["Fuel_type"] = df["Fuel_type"].replace("still_Diesel_but_you_found_an_easteregg", "Diesel")

# Handle missing values
numeric_cols = df.select_dtypes(include=["number"]).columns
categorical_cols = df.select_dtypes(include=["object"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
df[categorical_cols] = df[categorical_cols].apply(lambda x: x.fillna(x.mode()[0]))

In [None]:
# Feature engineering: meaningful interactions
df["car_age"] = 2025 - df["Reg_year"]
df["usage_rate"] = df["Runned_Miles"] / (2025 - df["Reg_year"])
df["repair_efficiency"] = df["repair_cost"] / df["repair_hours"]
df["complexity_ratio"] = df["repair_complexity"] / df["repair_hours"]

In [6]:
# Encode categorical variables 
labeled_data = pd.get_dummies(labeled_data)
unlabeled_data = pd.get_dummies(unlabeled_data)

In [7]:
# Remove low-variance features (prevent excessive filtering)
low_variance_cols = labeled_data.var()[labeled_data.var() < 0.001].index
low_variance_cols = [col for col in low_variance_cols if col != 'Label']
labeled_data.drop(columns=low_variance_cols, inplace=True)


In [8]:
# Remove highly correlated features 
corr_matrix = labeled_data.corr()
high_corr_features = [col for col in corr_matrix.columns if any(corr_matrix[col] > 0.98) and col != 'Label']
if len(high_corr_features) < len(labeled_data.columns) - 1:
    labeled_data.drop(columns=high_corr_features, inplace=True)

In [9]:
print("Remaining features after filtering:", labeled_data.columns)
print("Dataset shape:", labeled_data.shape)  


Remaining features after filtering: Index(['ID', 'Adv_year', 'Adv_month', 'Reg_year', 'Runned_Miles', 'Engin_size',
       'Price', 'Seat_num', 'Door_num', 'issue_id',
       ...
       'repair_date_2021-05-19', 'repair_date_2021-05-20',
       'repair_date_2021-05-21', 'repair_date_2021-05-22',
       'repair_date_2021-05-23', 'repair_date_2021-05-24',
       'repair_date_2021-05-25', 'repair_date_2021-05-26',
       'repair_date_2021-05-27', 'repair_date_2021-05-28'],
      dtype='object', length=682)
Dataset shape: (37636, 682)


In [10]:
# Define features & target
X = labeled_data.drop(columns=['Label'])
y = labeled_data['Label']

# Select only numeric features
numeric_features = X.select_dtypes(include=["number"]).columns
X = X[numeric_features]

In [11]:
# Split dataset (60% train, 20% validation, 20% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Train model with cross-validation
model = RandomForestClassifier(n_estimators=1000, max_depth=15, min_samples_split=15,min_samples_leaf = 5, class_weight="balanced", random_state=42)# Reduced depth to prevent overfitting
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring="f1_macro")
print(f"Cross-validation F1 Score: {np.mean(cv_scores):.2f}")

Cross-validation F1 Score: 0.49


In [13]:
# Fit the model
model.fit(X_train_scaled, y_train)

In [14]:
# Evaluate model
y_val_pred = model.predict(X_val_scaled)
val_score = f1_score(y_val, y_val_pred, average="macro")
print(f"Validation F1 Score: {val_score:.2f}")

y_test_pred = model.predict(X_test_scaled)
test_score = f1_score(y_test, y_test_pred, average="macro")
print(f"Test F1 Score: {test_score:.2f}")

Validation F1 Score: 0.51
Test F1 Score: 0.50
