In [1]:
!pip install imbalanced-learn



In [39]:
#/users/lokeshthangavel/documents/loki coding/sure ride/namma_yatri_imbalanced_dataset_stronger_correlations.csv

In [79]:
!pip install lightgbm



In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/users/lokeshthangavel/documents/loki coding/sure ride/namma_yatri_imbalanced_dataset_stronger_correlations.csv")

# Function to handle outliers using IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.clip(df[column], lower_bound, upper_bound)
    return df

# Handle outliers in numerical columns
outlier_cols = [
    "distance_to_pickup_km",
    "estimated_trip_distance_km",
    "estimated_trip_time_min",
    "hours_already_worked",
    "fare_amount",
    "driver_earnings",
]
for col in outlier_cols:
    df = remove_outliers_iqr(df, col)

# Drop non-numeric columns
columns_to_drop = ["driver_id", "timestamp", "preferred_areas"]
df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

# Encode categorical variables
categorical_cols = ["service_type", "vehicle_quality", "weather_condition"]
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_cleaned[col] = le.fit_transform(df_cleaned[col])
    label_encoders[col] = le

# Convert 'traffic_congestion_level' to numerical if needed
if df_cleaned["traffic_congestion_level"].dtype == "object":
    df_cleaned["traffic_congestion_level"] = LabelEncoder().fit_transform(df_cleaned["traffic_congestion_level"])

# Sample dataset for faster training
df_sampled = df_cleaned.sample(n=20000, random_state=42)

# Define features and target
X = df_sampled.drop(columns=["acceptance_probability", "accepted_ride"], errors='ignore')
y = df_sampled["acceptance_probability"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model with reduced depth
xgb_model = XGBRegressor(n_estimators=30, max_depth=4, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Feature importance
feature_importance = pd.Series(xgb_model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importance)


service_type                  0.289498
weather_condition             0.167481
distance_to_pickup_km         0.161526
estimated_trip_distance_km    0.104265
traffic_congestion_level      0.062580
driver_experience_months      0.053029
hour_of_day                   0.041066
day_of_week                   0.037989
fare_amount                   0.026741
hours_already_worked          0.022367
is_peak_hour                  0.021116
historical_acceptance_rate    0.006584
estimated_trip_time_min       0.005759
driver_earnings               0.000000
day                           0.000000
vehicle_quality               0.000000
month                         0.000000
dtype: float32


In [17]:
import pandas as pd

file_path = "/users/lokeshthangavel/documents/loki coding/sure ride/namma_yatri_imbalanced_dataset_stronger_correlations.csv"
df = pd.read_csv(file_path)

# Display column names
print(df.columns)

Index(['driver_id', 'timestamp', 'month', 'day', 'hour_of_day', 'day_of_week',
       'is_peak_hour', 'service_type', 'driver_experience_months',
       'vehicle_quality', 'preferred_areas', 'historical_acceptance_rate',
       'distance_to_pickup_km', 'estimated_trip_distance_km',
       'estimated_trip_time_min', 'fare_amount', 'driver_earnings',
       'weather_condition', 'traffic_congestion_level', 'hours_already_worked',
       'acceptance_probability', 'accepted_ride'],
      dtype='object')


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.calibration import CalibratedClassifierCV

# Load dataset
file_path = "/users/lokeshthangavel/documents/loki coding/sure ride/namma_yatri_imbalanced_dataset_stronger_correlations.csv"
df = pd.read_csv(file_path)

# Select relevant features
important_features = [
    "service_type", "weather_condition", "distance_to_pickup_km",
    "estimated_trip_distance_km", "traffic_congestion_level"
]

target = "accepted_ride"

# Drop missing values
df = df.dropna()

# Identify categorical columns
categorical_cols = ["service_type", "weather_condition", "traffic_congestion_level"]

# Apply Label Encoding to all categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for future decoding

# Standardize numerical features
scaler = StandardScaler()
df[["distance_to_pickup_km", "estimated_trip_distance_km"]] = \
    scaler.fit_transform(df[["distance_to_pickup_km", "estimated_trip_distance_km"]])

# Split data into train and test sets
X = df[important_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Gradient Boosting Classifier
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
calibrated_gbm = CalibratedClassifierCV(gbm, method='sigmoid')
calibrated_gbm.fit(X_train, y_train)

# Predict acceptance probability
y_proba = calibrated_gbm.predict_proba(X_test)[:, 1]  # Probability of acceptance

# Evaluate model performance
y_pred = calibrated_gbm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save probabilities to dataframe
df_test = X_test.copy()
df_test["predicted_acceptance_probability"] = y_proba
df_test["actual_accepted"] = y_test.values

# Display sample predictions
print(df_test.head(10))

Accuracy: 0.9219
              precision    recall  f1-score   support

           0       0.77      0.50      0.60      1195
           1       0.93      0.98      0.96      8805

    accuracy                           0.92     10000
   macro avg       0.85      0.74      0.78     10000
weighted avg       0.91      0.92      0.91     10000

       service_type  weather_condition  distance_to_pickup_km  \
33553             0                  3              -0.948499   
9427              0                  2               1.108486   
199               0                  1               0.139220   
12447             0                  2               1.536984   
39489             0                  0               1.259470   
42724             0                  2              -0.822355   
10822             0                  0              -0.561601   
49498             0                  0              -0.521733   
4144              0                  2              -0.464637   
36958 