### Adnan Altukleh, Abdulkarim Dawalibi
### Course DV2627

### Importing necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report,confusion_matrix
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import entropy
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier

### Data reading

In [3]:
train_oper=pd.read_csv("train_operational_readouts.csv")
train_tte=pd.read_csv("train_tte.csv")

### Labeling the train data

In [4]:
def set_class_label(row):
    if row['in_study_repair'] == 0:
        return 0
    elif 0 <= row['time'] <= 6:
        return 4
    elif 6 <= row['time'] <= 12:   
        return 3
    elif 12 <= row['time'] <= 24:
        return 2
    elif 24 <= row['time'] <= 48:
        return 1
    elif row['time'] > 48:
        return 1
    else:
        raise ValueError("Invalid time value: {}".format(row['time']))

In [5]:
# Get the last row for each vehicle_id in train_oper
last_rows = train_oper.groupby('vehicle_id').tail(1)

# Merge train_tte with last_rows on vehicle_id
train_oper_label = pd.merge(last_rows, train_tte, on='vehicle_id')

# Calculate the difference between time_step and length_of_study_time_step
train_oper_label['time'] = train_oper_label['length_of_study_time_step'] - train_oper_label['time_step']

In [6]:
train_oper_label['class_label'] = train_oper_label.apply(set_class_label, axis=1)

In [7]:
Label=train_oper_label.drop(columns=train_oper_label.columns[1:-1],axis=1)
training_data=pd.merge(Label,train_oper,on='vehicle_id')

### Selecting specific number of vehicles form each class

In [8]:
# selecting the vehicles from each class with complete data and equal number of vehicles from each class
class_1=training_data.loc[(training_data["class_label"]==1)]["vehicle_id"].unique()
class_2=training_data.loc[(training_data["class_label"]==2)& (training_data["vehicle_id"].isin(training_data.groupby("vehicle_id").filter(lambda x: x.notnull().all().all())["vehicle_id"].unique()))]["vehicle_id"].unique()[:len(class_1)]
class_3=training_data.loc[(training_data["class_label"]==3)& (training_data["vehicle_id"].isin(training_data.groupby("vehicle_id").filter(lambda x: x.notnull().all().all())["vehicle_id"].unique()))]["vehicle_id"].unique()[:len(class_1)]
class_4=training_data.loc[(training_data["class_label"]==4)& (training_data["vehicle_id"].isin(training_data.groupby("vehicle_id").filter(lambda x: x.notnull().all().all())["vehicle_id"].unique()))]["vehicle_id"].unique()[:len(class_1)]


In [9]:
# selecting all vehicles from class 0 with complete data
class_0=training_data.loc[(training_data["class_label"]==0)& (training_data["vehicle_id"].isin(training_data.groupby("vehicle_id").filter(lambda x: x.notnull().all().all())["vehicle_id"].unique()))]["vehicle_id"].unique()

In [10]:
df_1 = training_data.loc[(training_data["vehicle_id"].isin(class_1))]
df_2 = training_data.loc[(training_data["vehicle_id"].isin(class_2))]
df_3 = training_data.loc[(training_data["vehicle_id"].isin(class_3))]
df_4 = training_data.loc[(training_data["vehicle_id"].isin(class_4))]
df_0 = training_data.loc[(training_data["vehicle_id"].isin(class_0))]
training_data=pd.concat([df_0,df_1,df_2,df_3,df_4])

### Applying Power spectral density (PSD)

In [11]:
# function to estimate the sampling frequency
def estimate_fs(group):
    if len(group) < 2:
        return np.nan
    time_diffs = group['time_step'].diff().dropna()
    mean_diff = time_diffs.mean()
    if mean_diff == 0:
        return np.nan
    return 1 / mean_diff

def compute_psd(group):
    # Estimate the sampling frequency for the group
    fs = estimate_fs(group)
       
    psd_features = {}
    # Columns to skip (non-signal data)
    columns_to_skip = ['vehicle_id', 'class_label', 'time_step']
    
    # Iterate over columns that are not in columns_to_skip
    for column in [col for col in group.columns if col not in columns_to_skip]:
        # Calculate the PSD using Welch's method
        f, Pxx_den = welch(group[column], fs=fs)
        psd_features[column] = Pxx_den
        
    # Convert the PSD features dictionary to a DataFrame
    psd_df = pd.DataFrame(psd_features)
    
    # Add the 'vehicle_id' back into the DataFrame if it's a groupby operation
    if 'vehicle_id' in group.columns:
        psd_df['vehicle_id'] = group['vehicle_id'].iloc[0]
    
    
    return psd_df

In [12]:
# Group the training data by 'vehicle_id' and apply the compute_psd function
grouped = training_data.groupby('vehicle_id')
grouped_psd_dfs = grouped.apply(compute_psd)
psd_combined_df = grouped_psd_dfs.reset_index(drop=True)

In [13]:
label=train_oper_label.drop(columns=train_oper_label.columns[1:-1],axis=1)
psd_combined_df=pd.merge(label,psd_combined_df,on='vehicle_id')

### Applying feature extraction

In [14]:
def calculate_spectral_features(signal, fs=1.0):

    f, Pxx = welch(signal, fs=fs, nperseg=min(256, len(signal)))
    psd_norm = Pxx / np.sum(Pxx)
    spectral_entropy = entropy(psd_norm)
    spectral_kurtosis = kurtosis(Pxx, fisher=True, bias=False)
    return spectral_entropy, spectral_kurtosis

def feature_extraction(df, sensor_columns):

    # Initialize a list to hold the features for each vehicle
    features = []

    # Group the dataframe by vehicle_id
    grouped = df.groupby('vehicle_id')
    
    for vehicle_id, group in grouped:
        vehicle_features = [vehicle_id] 
        
        for sensor in sensor_columns:
            sensor_data = group[sensor].values
            
            # Calculate statistical features
            mean = np.mean(sensor_data)
            median = np.median(sensor_data)
            std = np.std(sensor_data)
            var = np.var(sensor_data)
            minimum = np.min(sensor_data)
            maximum = np.max(sensor_data)
            skewness = skew(sensor_data)
            kurt = kurtosis(sensor_data)
            
            # Calculate spectral features
            spectral_entropy, spectral_kurtosis = calculate_spectral_features(sensor_data)
            
            # Append the features for this sensor to the vehicle features
           
            vehicle_features.extend([mean, median, std, var, minimum, maximum, skewness, kurt, spectral_entropy, spectral_kurtosis])
        
        # Append the features for this vehicle to the list
        features.append(vehicle_features)
    
    # Define column names dynamically based on the number of sensors
    column_names = ['vehicle_id']
    for sensor in sensor_columns:
        column_names.extend([f"{sensor}_mean", f"{sensor}_median", f"{sensor}_std", f"{sensor}_var", f"{sensor}_min", f"{sensor}_max", f"{sensor}_skewness", f"{sensor}_kurtosis", f"{sensor}_spectral_entropy", f"{sensor}_spectral_kurtosis"])
    
    # Create a DataFrame from the features
    feature_df = pd.DataFrame(features, columns=column_names)
    
    return feature_df

In [15]:
feature_df = feature_extraction(psd_combined_df, psd_combined_df.columns[2:])

In [16]:
# Merge the feature_df with the label DataFrame
label=train_oper_label.drop(columns=train_oper_label.columns[1:-1],axis=1)
feature_df=pd.merge(label,feature_df,on='vehicle_id')
feature_df

Unnamed: 0,vehicle_id,class_label,171_0_mean,171_0_median,171_0_std,171_0_var,171_0_min,171_0_max,171_0_skewness,171_0_kurtosis,...,397_35_mean,397_35_median,397_35_std,397_35_var,397_35_min,397_35_max,397_35_skewness,397_35_kurtosis,397_35_spectral_entropy,397_35_spectral_kurtosis
0,2,0,9.550564e+12,9.449861e+10,3.210733e+13,1.030881e+27,9.152348e+08,1.368028e+14,3.642203,11.520745,...,9.728932e+04,1217.583597,2.524937e+05,6.375308e+10,6.592829,8.025761e+05,2.368818,3.627734,0.732543,2.393180
1,14,0,1.257518e+13,4.871166e+11,4.300053e+13,1.849046e+27,3.734490e+10,2.064344e+14,4.150941,15.784024,...,2.854375e+05,10449.091961,1.120402e+06,1.255300e+12,174.655193,5.408409e+06,4.333233,16.871694,0.784372,3.918171
2,25,0,5.541963e+13,3.561477e+11,1.928583e+14,3.719434e+28,8.872380e+10,9.363080e+14,4.139557,15.939487,...,7.636945e+05,12255.993323,2.629588e+06,6.914733e+12,2195.484598,1.281450e+07,4.175303,16.187254,0.744001,4.072708
3,27,4,1.698739e+13,2.381277e+10,1.063952e+14,1.131994e+28,1.019048e+09,7.442551e+14,6.651192,42.483808,...,2.042214e+05,11130.355712,7.602224e+05,5.779380e+11,2513.091333,4.904650e+06,5.257984,28.302722,0.686764,12.638653
4,28,0,5.152830e+12,6.470033e+09,3.021224e+13,9.127794e+26,3.278018e+07,2.169414e+14,6.755180,44.352183,...,1.637338e+04,132.919308,8.222028e+04,6.760174e+09,3.091989,5.789769e+05,6.330019,40.050296,0.665177,14.737901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15160,33637,0,1.610564e+12,1.515586e+10,5.441505e+12,2.960997e+25,4.009629e+08,2.121899e+13,3.321646,9.049819,...,1.258414e+05,6732.107003,2.803024e+05,7.856943e+10,1783.903430,9.543760e+05,2.213600,3.253998,0.798475,0.651522
15161,33639,0,7.298168e+11,2.255199e+10,1.951100e+12,3.806790e+24,7.142390e+09,6.246683e+12,2.472004,4.116759,...,1.750045e+04,1006.591017,4.385876e+04,1.923591e+09,102.287419,1.413186e+05,2.457330,4.073934,1.137420,-0.660083
15162,33640,0,2.874192e+12,1.934799e+10,1.061854e+13,1.127535e+26,5.978833e+08,4.527539e+13,3.728850,11.959977,...,5.249299e+06,92967.089801,1.903836e+07,3.624590e+14,3025.468746,8.121497e+07,3.719906,11.915439,0.871095,1.279314
15163,33642,0,2.056574e+12,1.374424e+10,6.973499e+12,4.862968e+25,2.477302e+08,2.718979e+13,3.323564,9.057884,...,3.887404e+04,1288.814621,8.917474e+04,7.952135e+09,107.994351,3.353173e+05,2.662378,5.843134,0.819212,0.107897


### Add the specifications dataset 

In [17]:
train_spes=pd.read_csv("train_specifications.csv")
train_spes = train_spes.replace(to_replace=r'Cat', value='', regex=True)
train_spes=train_spes.astype(int)
train_spes

Unnamed: 0,vehicle_id,Spec_0,Spec_1,Spec_2,Spec_3,Spec_4,Spec_5,Spec_6,Spec_7
0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,1
2,3,0,1,1,1,0,0,0,1
3,4,0,0,2,1,0,0,0,1
4,5,0,2,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
23545,33639,0,1,1,0,0,0,1,4
23546,33640,0,14,1,3,0,0,1,4
23547,33641,0,1,1,0,0,0,1,4
23548,33642,0,1,1,0,0,0,1,4


In [18]:
multi_0=feature_df.loc[feature_df["class_label"]==0].head(164)

In [19]:
feature_df_1=feature_df.loc[feature_df["class_label"]!=0]
feature_df_1 = pd.concat([feature_df_1, multi_0])

In [20]:
feature_df_1=feature_df_1.merge(train_spes,on='vehicle_id')

In [21]:
feature_df_1.fillna(0,inplace=True)

### Data splting

##### The classes are 0,1. Class 1 is a combination of classes 1-4, it indicates the vehicle/s are about to fail in the time window (48-0). And class 0 indicates that the vehicle/s time to fail is larger than 48 time_steps.


In [22]:
two_classes_data=feature_df_1
two_classes_data.loc[two_classes_data["class_label"]!=0,"class_label"]=1
X=two_classes_data.drop(columns=["vehicle_id","class_label"],axis=1)
y=two_classes_data["class_label"]
y.value_counts()

1    164
0    164
Name: class_label, dtype: int64

In [23]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

### Apply feature selection using Logistic Regression L1

In [24]:
# Initialize and train the logistic regression model with L1 penalty
model = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
model.fit(x_train, y_train)

In [25]:
predictions = model.predict(x_val)

report = classification_report(y_val, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.65      0.67      0.66        33
           1       0.66      0.64      0.65        33

    accuracy                           0.65        66
   macro avg       0.65      0.65      0.65        66
weighted avg       0.65      0.65      0.65        66



In [26]:
# Extract coefficients
coefficients = model.coef_[0]

feature_importance = pd.DataFrame({'Feature': x_train.columns, 'Coefficient': coefficients})

# Sort features by the absolute value of their coefficient
feature_importance['Absolute Coefficient'] = feature_importance['Coefficient'].abs()
feature_importance = feature_importance.sort_values(by='Absolute Coefficient', ascending=False)

# the feature importance
feat_import=feature_importance.loc[feature_importance["Absolute Coefficient"]<=0]

In [27]:
x_train=x_train.drop(feat_import["Feature"],axis=1)
x_val=x_val.drop(feat_import["Feature"],axis=1)

### Utilize the best parameters

In [29]:
# Setup parameter grids for each classifier
lr_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

gb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}

# Create models
lr = LogisticRegression(solver='liblinear')
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

# Create GridSearchCV objects
lr_grid = GridSearchCV(lr, lr_params, cv=5, scoring='accuracy')
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
gb_grid = GridSearchCV(gb, gb_params, cv=5, scoring='accuracy')

# Fit models
lr_grid.fit(x_train, y_train)
rf_grid.fit(x_train, y_train)
gb_grid.fit(x_train, y_train)

# Print the best parameters
print("Best parameters for LR:", lr_grid.best_params_)
print("Best parameters for RF:", rf_grid.best_params_)
print("Best parameters for GB:", gb_grid.best_params_)

### Modeling and evaluating 

### Logistic

In [30]:
# Initialize the logistic regression model
logreg = LogisticRegression(C= 0.01, penalty= "l2")

# Train the model
logreg.fit(x_train, y_train)

# Predict the labels of the test set
y_pred = logreg.predict(x_val)

# Calculate confusion matrix
conf_mat_log = confusion_matrix(y_val, y_pred)

report1 = classification_report(y_val, y_pred)


report = classification_report(y_val, y_pred, output_dict=True)

# Convert the report to a DataFrame
report_df_log = pd.DataFrame(report).transpose()

# Drop the support column to focus on precision, recall, and f1-score
report_df_log = report_df_log.drop('support', axis=1)

# Evaluate the model
print(report1)
print(conf_mat_log)

              precision    recall  f1-score   support

           0       0.69      0.67      0.68        33
           1       0.68      0.70      0.69        33

    accuracy                           0.68        66
   macro avg       0.68      0.68      0.68        66
weighted avg       0.68      0.68      0.68        66

[[22 11]
 [10 23]]


### Random Forest

In [31]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(max_depth= 30, n_estimators= 100)

# Train the model
rf_model.fit(x_train, y_train)

# Predict the labels of the test set
y_pred_rf = rf_model.predict(x_val)

conf_mat_rf = confusion_matrix(y_val, y_pred_rf)
report1 = classification_report(y_val, y_pred_rf)

report = classification_report(y_val, y_pred_rf, output_dict=True)

# Convert the report to a DataFrame
report_df_rf = pd.DataFrame(report).transpose()

# Drop the support column to focus on precision, recall, and f1-score
report_df_rf = report_df_rf.drop('support', axis=1)


# Evaluate the model
print("Random Forest Classification Report:")
print(report1)
print("Random Forest Confusion Matrix:")
print(conf_mat_rf)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.76      0.76        33
           1       0.76      0.76      0.76        33

    accuracy                           0.76        66
   macro avg       0.76      0.76      0.76        66
weighted avg       0.76      0.76      0.76        66

Random Forest Confusion Matrix:
[[25  8]
 [ 8 25]]


### Gradient Boosting

In [32]:
# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(learning_rate= 0.2, max_depth= 3, n_estimators= 50)

# Train the model
gb_model.fit(x_train, y_train)

# Predict the labels of the test set
y_pred_gb = gb_model.predict(x_val)

conf_mat_gb = confusion_matrix(y_val, y_pred_gb)

report1 = classification_report(y_val, y_pred_gb)


report = classification_report(y_val, y_pred_gb, output_dict=True)

# Convert the report to a DataFrame
report_df_gb = pd.DataFrame(report).transpose()

# Drop the support column to focus on precision, recall, and f1-score
report_df_gb = report_df_gb.drop('support', axis=1)

# Evaluate the model
print("Gradient Boosting Classification Report:")
print(report1)
print("Gradient Boosting Confusion Matrix:")
print(conf_mat_gb)

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.76      0.77        33
           1       0.76      0.79      0.78        33

    accuracy                           0.77        66
   macro avg       0.77      0.77      0.77        66
weighted avg       0.77      0.77      0.77        66

Gradient Boosting Confusion Matrix:
[[25  8]
 [ 7 26]]


### Cat Boost

In [33]:
# Initialize the CatBoost Classifier
cb_model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    verbose=100,
    eval_metric='Accuracy'
)

# Fit the model
cb_model.fit(x_train, y_train, eval_set=(x_val, y_val), use_best_model=True)

# Make predictions
y_pred_cb = cb_model.predict(x_val)

# Evaluate the model
conf_mat_cb = confusion_matrix(y_val, y_pred_cb)
report_cb = classification_report(y_val, y_pred_cb)

print("CatBoost Classification Report:")
print(report_cb)
print("CatBoost Confusion Matrix:")
print(conf_mat_cb)

0:	learn: 0.8015267	test: 0.6363636	best: 0.6363636 (0)	total: 270ms	remaining: 53.7s
100:	learn: 1.0000000	test: 0.7878788	best: 0.8333333 (41)	total: 9.99s	remaining: 9.8s
199:	learn: 1.0000000	test: 0.7878788	best: 0.8333333 (41)	total: 20.3s	remaining: 0us

bestTest = 0.8333333333
bestIteration = 41

Shrink model to first 42 iterations.
CatBoost Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.79      0.83        33
           1       0.81      0.88      0.84        33

    accuracy                           0.83        66
   macro avg       0.84      0.83      0.83        66
weighted avg       0.84      0.83      0.83        66

CatBoost Confusion Matrix:
[[26  7]
 [ 4 29]]


### Valdation data

In [34]:
validation_labels=pd.read_csv("validation_labels.csv")
validation_labels["class_label"].value_counts()

0    4910
4      76
3      30
1      16
2      14
Name: class_label, dtype: int64

In [35]:
valdation_data=pd.read_csv("validation_operational_readouts.csv")
valdation_data=pd.merge(valdation_data,validation_labels,on='vehicle_id')
valdation_data

Unnamed: 0,vehicle_id,time_step,171_0,666_0,427_0,837_0,167_0,167_1,167_2,167_3,...,397_27,397_28,397_29,397_30,397_31,397_32,397_33,397_34,397_35,class_label
0,10,3.0,46590.0,3696.0,2038959.0,1450.0,0.0,273826.0,339584.0,99834.0,...,8026.0,855.0,15.0,495.0,91665.0,169125.0,51900.0,9362.0,0.0,0
1,10,7.4,127110.0,16716.0,6501456.0,4660.0,0.0,635642.0,609742.0,288036.0,...,44312.0,7995.0,75.0,2265.0,414180.0,688891.0,258810.0,63167.0,1005.0,0
2,10,8.0,144015.0,19596.0,7327918.0,5230.0,0.0,654780.0,665756.0,300174.0,...,52203.0,9421.0,75.0,2415.0,487080.0,812071.0,300735.0,77477.0,1500.0,0
3,10,12.0,187560.0,24264.0,9286082.0,7420.0,2647.0,831628.0,794332.0,360066.0,...,61399.0,10727.0,75.0,2610.0,655620.0,1118116.0,403516.0,99587.0,1591.0,0
4,10,12.2,187575.0,24264.0,9286082.0,7420.0,,,,,...,61400.0,10727.0,75.0,2611.0,655620.0,1118116.0,403517.0,99588.0,1592.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196222,33625,67.8,1392945.0,2450.0,60748886.0,0.0,0.0,3772792.0,10954147.0,6446205.0,...,23783.0,1735.0,75.0,17827.0,219069.0,609726.0,41630.0,2284.0,60.0,0
196223,33625,73.8,1519110.0,2654.0,66374974.0,0.0,0.0,4062662.0,11906819.0,7121565.0,...,26033.0,1915.0,75.0,20227.0,247374.0,658866.0,44435.0,2480.0,60.0,0
196224,33625,77.4,1590900.0,2690.0,69656097.0,0.0,0.0,4256241.0,12550791.0,7621393.0,...,27594.0,2081.0,75.0,21307.0,263755.0,690606.0,46416.0,2585.0,60.0,0
196225,33625,83.4,1657335.0,2690.0,72538912.0,0.0,0.0,4488515.0,13016809.0,7995291.0,...,29380.0,2201.0,75.0,22462.0,285700.0,716767.0,48952.0,2810.0,60.0,0


### Selecting data needed

In [36]:
class_healthy = valdation_data.loc[valdation_data['class_label'] == 0]["vehicle_id"].unique()[:136]
df_0 = valdation_data.loc[(valdation_data["vehicle_id"].isin(class_healthy))]
classes_unhealthy = valdation_data.loc[valdation_data['class_label'] != 0]
valdation=pd.concat([classes_unhealthy,df_0])

### Applying PSD

In [37]:
groupedv = valdation.groupby('vehicle_id')
grouped_psd_vdfs = groupedv.apply(compute_psd)
valdation = grouped_psd_vdfs.reset_index(drop=True)

In [38]:
valdation=pd.merge(validation_labels,valdation,on='vehicle_id')

In [39]:
valdation.fillna(0,inplace=True)

### Applying Feature extraction

In [40]:
featurev_df = feature_extraction(valdation, valdation.columns[2:])

In [41]:
valdation_data=pd.merge(featurev_df,validation_labels,on='vehicle_id')

In [42]:
valdation_data.fillna(0,inplace=True)

### Add specifications data

In [43]:
valdation_spes=pd.read_csv("validation_specifications.csv")
valdation_spes = valdation_spes.replace(to_replace=r'Cat', value='', regex=True)
valdation_spes=valdation_spes.astype(int)
valdation_spes

Unnamed: 0,vehicle_id,Spec_0,Spec_1,Spec_2,Spec_3,Spec_4,Spec_5,Spec_6,Spec_7
0,10,0,0,0,0,0,1,0,1
1,16,0,1,1,1,0,0,0,1
2,18,0,1,1,1,0,0,0,1
3,23,0,1,1,1,0,0,0,0
4,45,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...
5041,33596,0,1,1,1,0,0,0,7
5042,33601,0,1,1,0,0,0,1,4
5043,33605,0,1,1,0,0,0,0,4
5044,33612,0,1,1,0,0,0,1,4


In [44]:
valdation_data.loc[valdation_data["class_label"]!=0,"class_label"]=1
valdation_data=valdation_data.merge(valdation_spes,on='vehicle_id')

### Applying feature selection

In [45]:
valdation_data=valdation_data.drop(feat_import["Feature"],axis=1)

### Data spliting

In [46]:
X_v = valdation_data.drop(['class_label',"vehicle_id"], axis=1)
y_v = valdation_data['class_label']

In [47]:
y_v.value_counts()

0    136
1    136
Name: class_label, dtype: int64

### Evaluating the models using the evaluation data

In [48]:
vy_pred_log = logreg.predict(X_v)

conf_mat_log = confusion_matrix(y_v, vy_pred_log)
lreport1 = classification_report(y_v, vy_pred_log)
# Evaluate the model
print("logistic model")
print(lreport1)
print(conf_mat_log)

logistic model
              precision    recall  f1-score   support

           0       0.68      0.29      0.40       136
           1       0.55      0.87      0.67       136

    accuracy                           0.58       272
   macro avg       0.62      0.58      0.54       272
weighted avg       0.62      0.58      0.54       272

[[ 39  97]
 [ 18 118]]


In [49]:
vy_pred_rf = rf_model.predict(X_v)

conf_mat_rf = confusion_matrix(y_v, vy_pred_rf)
rreport1 = classification_report(y_v, vy_pred_rf)
# Evaluate the model
print("Random Forest Classification Report:")
print(rreport1)
print("Random Forest Confusion Matrix:")
print(conf_mat_rf)

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.66      0.70       136
           1       0.70      0.78      0.74       136

    accuracy                           0.72       272
   macro avg       0.72      0.72      0.72       272
weighted avg       0.72      0.72      0.72       272

Random Forest Confusion Matrix:
[[ 90  46]
 [ 30 106]]


In [50]:
vy_pred_gb = gb_model.predict(X_v)

conf_mat_gb = confusion_matrix(y_v, vy_pred_gb)
greport1 = classification_report(y_v, vy_pred_gb)
# Evaluate the model
print("Gradient Boosting model")
print(greport1)
print(conf_mat_gb)

Gradient Boosting model
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       136
           1       0.69      0.73      0.71       136

    accuracy                           0.70       272
   macro avg       0.70      0.70      0.70       272
weighted avg       0.70      0.70      0.70       272

[[91 45]
 [37 99]]


In [51]:
vy_pred_cb = cb_model.predict(X_v)

conf_mat_cb = confusion_matrix(y_v, vy_pred_cb)
cbreport1 = classification_report(y_v, vy_pred_cb)
# Evaluate the model
print("Gradient Boosting model")
print(cbreport1)
print(conf_mat_cb)

Gradient Boosting model
              precision    recall  f1-score   support

           0       0.77      0.65      0.70       136
           1       0.70      0.81      0.75       136

    accuracy                           0.73       272
   macro avg       0.73      0.73      0.73       272
weighted avg       0.73      0.73      0.73       272

[[ 88  48]
 [ 26 110]]


In [52]:
# Create the voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('logreg', logreg),
        ('cb', cb_model)
    ],
    voting='hard' 
)

voting_clf.fit(x_train, y_train)

y_pred_voting = voting_clf.predict(X_v)

# Evaluate the voting classifier
conf_mat_voting = confusion_matrix(y_v, y_pred_voting)
report_voting = classification_report(y_v, y_pred_voting)

print("Voting Classifier Classification Report:")
print(report_voting)
print("Voting Classifier Confusion Matrix:")
print(conf_mat_voting)

0:	learn: 0.8015267	total: 103ms	remaining: 20.4s
100:	learn: 1.0000000	total: 6.85s	remaining: 6.71s
199:	learn: 1.0000000	total: 13.9s	remaining: 0us
Voting Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.66      0.70       136
           1       0.69      0.76      0.73       136

    accuracy                           0.71       272
   macro avg       0.72      0.71      0.71       272
weighted avg       0.72      0.71      0.71       272

Voting Classifier Confusion Matrix:
[[ 90  46]
 [ 32 104]]
