In [1]:
# Import the necessary packages
import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Suppress specific future warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:
# Load the cleaned dataset from disk. low_memory=False makes pandas read the
# file in one pass instead of in chunks, which avoids mixed-dtype inference.
data = pd.read_csv(
    'source/data.csv',
    low_memory=False,
)

In [4]:
# Work on an independent (deep) copy so the feature engineering and
# preprocessing steps below never modify the frame that was loaded from disk.
data_processed = data.copy(deep=True)

In [5]:
# Remove the columns that are not used further in this notebook.
# Note: re-running this cell on its own raises KeyError, because the columns
# are already gone from data_processed after the first run.
data_processed = data_processed.drop(
    columns=['AccID', 'birth_year', 'vehicleID', 'num_veh'],
)

In [6]:
# Cast the calendar/time columns to float64 in a single astype call.
# The standardization step further down selects columns by the float64 dtype,
# so this cast is what makes these four columns part of that selection.
data_processed = data_processed.astype(
    {
        'time': 'float64',
        'day': 'float64',
        'month': 'float64',
        'year': 'float64',
    }
)

In [7]:
# Model inputs (34 columns) and the column to predict.
# The order of this list fixes the column order of the design matrix X.
features = [
    'lum', 'atm_condition', 'collision_type', 'route_category',
    'traffic_regime', 'reserved_lane_code', 'longitudinal_profile',
    'upstream_terminal_number', 'plan', 'surface_condition', 'infra',
    'accident_situation', 'traffic_direction', 'vehicle_category',
    'fixed_obstacle', 'mobile_obstacle', 'initial_impact_point', 'manv',
    'motor', 'seat', 'user_category', 'gender', 'reason_travel',
    'safety_equipment1', 'maximum_speed', 'age', 'lat', 'long',
    'distance_upstream_terminal', 'total_number_lanes', 'day', 'time',
    'month', 'year',
]
target = 'gravity'

In [8]:
# Build the target vector and the design matrix.
# pd.get_dummies one-hot encodes only object/category/string columns by default;
# numeric columns pass through unchanged. drop_first=True drops one level per
# encoded column to avoid perfectly collinear dummies.
# NOTE(review): the recorded output below shows 34 columns, the same as
# len(features), so few if any columns appear to be expanded here - confirm the
# dtypes if integer-coded categoricals were meant to be one-hot encoded.
y = data_processed[target]
X = pd.get_dummies(data=data_processed[features], drop_first=True)

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
# NOTE(review): the split is not stratified on y - consider stratify=y if the
# target classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Standardize the float64 columns. The scaler learns its mean and variance from
# the training split only and is then applied unchanged to both splits, so no
# test-set statistics leak into the transformation.
numerical_columns = X.select_dtypes(include=['float64']).columns
scaler = StandardScaler()
scaler.fit(X_train[numerical_columns])

X_train[numerical_columns] = scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [11]:
# Report the (rows, columns) shape of each split as a quick sanity check
print(
    f"Shape of X_train: {X_train.shape}",
    f"Shape of X_test: {X_test.shape}",
    sep="\n",
)

Shape of X_train: (358136, 34)
Shape of X_test: (89534, 34)


## Apply the ML model: LightGBM multiclass classifier

In [14]:
!pip install streamlit

Defaulting to user installation because normal site-packages is not writeable


In [15]:
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score

# LightGBM's multiclass objective expects integer labels in [0, num_class), and
# argmax over the predicted probabilities yields exactly such 0-based indices.
# Map the raw target values onto 0..K-1 for training and map the predictions
# back afterwards, so the metrics compare like with like. When the target is
# already coded 0..K-1 both mappings are the identity.
# Assumes the target has no missing values - TODO confirm.
class_labels = np.unique(y)  # sorted unique target values
y_train_encoded = np.searchsorted(class_labels, y_train)
y_test_encoded = np.searchsorted(class_labels, y_test)

# Create the LightGBM datasets; the test set reuses the training set's binning
train_data = lgb.Dataset(X_train, label=y_train_encoded)
test_data = lgb.Dataset(X_test, label=y_test_encoded, reference=train_data)

# Define the parameters for LightGBM
params = {
    'objective': 'multiclass',
    'num_class': len(class_labels),  # Number of classes in the target variable
    'metric': 'multi_logloss',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'max_depth': -1,  # no depth limit; tree size is bounded by num_leaves
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'random_state': 42
}

# Train the LightGBM model.
# Early stopping and log silencing are passed as callbacks: the
# early_stopping_rounds / verbose_eval keyword arguments of lgb.train were
# removed in LightGBM 4.0 (log_evaluation requires lightgbm >= 3.3).
# NOTE(review): the test split doubles as the early-stopping validation set, so
# the test metrics below are optimistically biased; consider carving a separate
# validation split out of the training data.
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10, verbose=False),
        lgb.log_evaluation(period=0),  # period=0 disables per-iteration logging
    ],
)

# Predict on the test set: one row of class probabilities per sample
y_pred = model.predict(X_test)
# Convert probabilities to the original class labels
y_pred_class = class_labels[y_pred.argmax(axis=1)]

# Calculate accuracy and classification report
accuracy = accuracy_score(y_test, y_pred_class)
report = classification_report(y_test, y_pred_class)



ModuleNotFoundError: No module named 'lightgbm'

In [None]:
# Overall accuracy of the predictions on the held-out test split
print(accuracy)

In [None]:
# Text summary of per-class precision, recall, F1-score and support on the test split
print(report)