In [2]:
pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Collecting lightgbm (from lazypredict)
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting xgboost (from lazypredict)
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m132.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m179.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost, lightgbm, lazypredict
Successfully installed lazypredict-0.2.13 lightgbm-4.5.0 xgboost-2.1.3
Note: you may need to restart the kernel to use updated packag

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import classification_report

# Load the pre-split datasets
train_df = pd.read_csv('/teamspace/studios/this_studio/Assignment-TechstaX/data/imputed_dataset.csv')
val_df = pd.read_csv('/teamspace/studios/this_studio/Assignment-TechstaX/data/val_data.csv')
test_df = pd.read_csv('/teamspace/studios/this_studio/Assignment-TechstaX/data/val_data.csv')
print("train shape: ", train_df.shape)
print("val shape: ", val_df.shape)
print("test shape: ", test_df.shape)


train shape:  (5796296, 34)
val shape:  (1159259, 34)
test shape:  (1159259, 34)


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import classification_report
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

def preprocess_data(train_df, val_df, test_df):
    """
    Preprocesses the data by handling categorical, boolean, and numeric features.
    Returns preprocessed training, validation, and test sets.
    """
    # Separate features and target
    feature_cols = [col for col in train_df.columns if col not in ['Severity', 'Description']]
    X_train = train_df[feature_cols]
    X_val = val_df[feature_cols]
    X_test = test_df[feature_cols]
    
    y_train = train_df['Severity']
    y_val = val_df['Severity']
    y_test = test_df['Severity']

    # Identify column types
    categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
    numeric_cols = [col for col in X_train.columns if X_train[col].dtype != 'object' and col not in [
        'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 
        'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal'
    ]]
    boolean_cols = [col for col in X_train.columns if X_train[col].dtype == 'bool']

    # Store preprocessed features
    processed_features = {}

    # Handle categorical features with frequency encoding
    for col in categorical_cols:
        freq = X_train[col].value_counts(normalize=True)
        processed_features[f"{col}_freq"] = {
            'train': X_train[col].map(freq),
            'val': X_val[col].map(freq),
            'test': X_test[col].map(freq)
        }

    # Handle boolean features
    for col in boolean_cols:
        processed_features[col] = {
            'train': X_train[col].astype(int),
            'val': X_val[col].astype(int),
            'test': X_test[col].astype(int)
        }

    # Handle numeric features with scaling
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train[numeric_cols])
    scaled_val = scaler.transform(X_val[numeric_cols])
    scaled_test = scaler.transform(X_test[numeric_cols])

    # Combine all processed features
    final_train = np.hstack([scaled_train] + [processed_features[col]['train'].values.reshape(-1, 1) 
                                            for col in processed_features])
    final_val = np.hstack([scaled_val] + [processed_features[col]['val'].values.reshape(-1, 1) 
                                        for col in processed_features])
    final_test = np.hstack([scaled_test] + [processed_features[col]['test'].values.reshape(-1, 1) 
                                          for col in processed_features])

    # Create column names for the final dataset
    final_columns = numeric_cols + list(processed_features.keys())
    
    # Convert to DataFrame
    final_train_df = pd.DataFrame(final_train, columns=final_columns)
    final_val_df = pd.DataFrame(final_val, columns=final_columns)
    final_test_df = pd.DataFrame(final_test, columns=final_columns)

    return final_train_df, final_val_df, final_test_df, y_train, y_val, y_test



def train_and_evaluate_model(X_train, X_val, X_test, y_train, y_val, y_test, model, model_name):
    """
    Trains a specific model and evaluates its performance on validation and test sets.
    """
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Predict on validation and test sets
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    
    # Print classification reports
    print(f"\n{model_name} Validation Set Classification Report:")
    print(classification_report(y_val, y_pred_val))
    
    print(f"\n{model_name} Test Set Classification Report:")
    print(classification_report(y_test, y_pred_test))
    
    return model


In [4]:
# Define the models
models = {
    'LightGBM': lgb.LGBMClassifier(),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Preprocess the data
final_train_df, final_val_df, final_test_df, y_train, y_val, y_test = preprocess_data(train_df, val_df, test_df)

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training and evaluating {model_name}...")
    trained_model = train_and_evaluate_model(final_train_df, final_val_df, final_test_df, y_train, y_val, y_test, model, model_name)


Training and evaluating LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.367026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2998
[LightGBM] [Info] Number of data points in the train set: 5796296, number of used features: 32
[LightGBM] [Info] Start training from score -4.855604
[LightGBM] [Info] Start training from score -0.248050
[LightGBM] [Info] Start training from score -1.688948
[LightGBM] [Info] Start training from score -3.605232

LightGBM Validation Set Classification Report:
              precision    recall  f1-score   support

           1       0.63      0.61      0.62     13428
           2       0.91      0.97      0.94    980300
           3       0.69      0.47      0.56    137193
           4       0.61      0.11      0.19     28338

    accuracy                           0.89   1159259
   macro avg       

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values