In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report

# Load and preprocess data
df = pd.read_csv("fleet_train_imputed.csv")

# Drop unnecessary columns
columns_to_drop = [
    'record_id', 'fleetid', 'truckid', 'Region', 'Measurement_timestamp',
    'GPS_Longitude', 'GPS_Latitude', 'GPS_Bearing', 'GPS_Altitude',
    'Maintenance_flag', 'Intake_Manifold_Pressure', 'Throttle_Pos_Manifold',
    'Intake_Air_Temp', 'Accel_Pedal_Pos_D', 'Ambient_air_temp',
    'Accel_Ssor_Total', 'Voltage_Control_Module'
]
df = df.drop([col for col in columns_to_drop if col in df.columns], axis=1)

# Define features to normalize
features_to_normalize = [
    'Engine_Load', 'Engine_RPM', 'Engine_Coolant_Temp', 'Vibration',
    'Mass_Air_Flow_Rate', 'Engine_Oil_Temp', 'Trip_Distance',
    'Trip_Time_journey', 'Turbo_Boost_And_Vcm_Gauge'
]

# Initialize and fit scalers
scalers = {}
for feature in features_to_normalize:
    scalers[feature] = MinMaxScaler()
    scalers[feature].fit(df[[feature]])

# Transform features
df_scaled = df.copy()
for feature, scaler in scalers.items():
    if feature in df.columns:
        df_scaled[feature] = scaler.transform(df[[feature]])

# Calculate condition score
def calculate_condition_score(row):
    engine_health_score = (
        row['Engine_Load'] + 
        row['Engine_RPM'] + 
        row['Engine_Coolant_Temp']
    ) / 3
    
    usage_severity = (
        row['Engine_Load'] * 
        (row['Trip_Distance'] + row['Trip_Time_journey'])
    )
    
    anomaly_flag = (
        (row['Vibration'] > 0.7) | 
        (row['Engine_Coolant_Temp'] > 0.8)
    ).astype(int)
    
    return 0.5 * engine_health_score + 0.3 * usage_severity + 0.2 * anomaly_flag

df_scaled['Condition_Score'] = df_scaled.apply(calculate_condition_score, axis=1)

# Calculate risk factors
df_scaled['Overstrain_Risk'] = 0.5 * df_scaled['Engine_Load'] + 0.5 * df_scaled['Engine_RPM']
df_scaled['Heat_Dissipation_Risk'] = 0.4 * df_scaled['Engine_Coolant_Temp'] + 0.6 * df_scaled['Engine_Oil_Temp']
df_scaled['Power_Failure_Risk'] = 0.5 * df_scaled['Mass_Air_Flow_Rate'] + 0.5 * df_scaled['Turbo_Boost_And_Vcm_Gauge']

# Assign failure types
def assign_failure_type(row):
    if row['Condition_Score'] < 0.3:
        return 'No Failure'
    risks = {
        'Overstrain Failure': row['Overstrain_Risk'],
        'Heat Dissipation Failure': row['Heat_Dissipation_Risk'],
        'Power Failure': row['Power_Failure_Risk']
    }
    return max(risks, key=risks.get)

df_scaled['Failure_Type'] = df_scaled.apply(assign_failure_type, axis=1)

# Prepare features for model training
feature_columns = [col for col in df_scaled.columns if col not in ['Failure_Type', 'Priority']]
X = df_scaled[feature_columns]
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_scaled['Failure_Type'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Define parameter grid for model
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 100],
    'max_depth': [3, 5, 7],
    'min_data_in_leaf': [20, 30, 50],
    'lambda_l1': [0.1, 0.5, 1],
    'lambda_l2': [0.1, 0.5, 1],
    'feature_fraction': [0.7, 0.8, 0.9],
    'bagging_fraction': [0.7, 0.8, 0.9],
    'bagging_freq': [5, 10]
}

# Initialize base model
base_model = lgb.LGBMClassifier(
    objective='multiclass',
    num_class=len(np.unique(y)),
    boosting_type='gbdt',
    verbose=-1
)

# Perform random search for hyperparameter tuning
random_search = RandomizedSearchCV(
    base_model,
    param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Train model
random_search.fit(X_train, y_train)
model = random_search.best_estimator_

# Make predictions and evaluate model
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Function to make predictions on new data
def predict_failure(input_data):
    if not isinstance(input_data, pd.DataFrame):
        input_data = pd.DataFrame([input_data])
        
    # Scale features
    input_scaled = input_data.copy()
    for feature, scaler in scalers.items():
        if feature in input_data.columns:
            input_scaled[feature] = scaler.transform(input_data[[feature]])
    
    # Make prediction
    pred = model.predict(input_scaled[feature_columns])
    return label_encoder.inverse_transform(pred)

# Function to get user input
def get_user_input():
    user_records = []
    while True:
        print("\nEnter vehicle parameters:")
        record = {
            'Vehicle_speed_sensor': float(input("Enter Vehicle Speed Sensor: ")),
            'Vibration': float(input("Enter Vibration: ")),
            'Engine_Load': float(input("Enter Engine Load: ")),
            'Engine_Coolant_Temp': float(input("Enter Engine Coolant Temp: ")),
            'Engine_RPM': float(input("Enter Engine RPM: ")),
            'Speed_OBD': float(input("Enter Speed OBD: ")),
            'Mass_Air_Flow_Rate': float(input("Enter Mass Air Flow Rate: ")),
            'Engine_Oil_Temp': float(input("Enter Engine Oil Temp: ")),
            'Speed_GPS': float(input("Enter Speed GPS: ")),
            'Turbo_Boost_And_Vcm_Gauge': float(input("Enter Turbo Boost & VCM Gauge: ")),
            'Trip_Distance': float(input("Enter Trip Distance: ")),
            'Litres_Per_100km_Inst': float(input("Enter Litres Per 100km Inst: ")),
            'CO2_in_g_per_km_Inst': float(input("Enter CO2 in g/km Inst: ")),
            'Trip_Time_journey': float(input("Enter Trip Time Journey: "))
        }
        user_records.append(record)
        
        if input("\nDo you want to enter another instance? (yes/no): ").lower() != 'yes':
            break
    
    return pd.DataFrame(user_records)

# Save model and scalers
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scalers.pkl', 'wb') as f:
    pickle.dump(scalers, f)
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
with open('feature_columns.pkl', 'wb') as f:
    pickle.dump(feature_columns, f)

# Example usage:
# To make predictions on new data:
"""
# Get user input
new_data = get_user_input()

# Make predictions
predictions = predict_failure(new_data)
print("\nPredicted Failure Types:", predictions)
"""


Model Evaluation:
Accuracy: 0.9900066622251832

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       889
           1       1.00      1.00      1.00       385
           2       0.97      0.96      0.97       221
           3       1.00      0.50      0.67         6

    accuracy                           0.99      1501
   macro avg       0.99      0.86      0.91      1501
weighted avg       0.99      0.99      0.99      1501



'\n# Get user input\nnew_data = get_user_input()\n\n# Make predictions\npredictions = predict_failure(new_data)\nprint("\nPredicted Failure Types:", predictions)\n'