<a href="https://colab.research.google.com/github/beyondinfinity9988/RideML/blob/main/rideML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import zipfile
import os

# Source archive and extraction target.
# The target is spelled out as an absolute path so it always matches the
# '/content/extracted_files/...' locations that the loading cells read from,
# instead of depending on the kernel's current working directory.
zip_file = "/ridedataset.zip"
extract_dir = "/content/extracted_files"

# Extract every member of the ZIP file into extract_dir
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Verify extraction by listing what landed in the target folder
print(os.listdir(extract_dir))


['ratings.csv', 'rides.csv', 'users.csv', 'drivers.csv', 'vehicles.csv']


In [None]:
import pandas as pd

# Read the drivers table from the extracted archive into a DataFrame
drivers_path = '/content/extracted_files/drivers.csv'
df = pd.read_csv(drivers_path)

# Quick sanity check: print the leading five rows as plain text
preview = df.head(5)
print(preview)


   driver_id                name  vehicle_id  rating  total_rides  available
0          1    Virginia Johnson         284    3.44           30       True
1          2       Daniel Maddox          18    2.59           61      False
2          3    Patricia Salazar         298    4.93           75       True
3          4  Cameron Hinton DDS          49    2.59           12       True
4          5      Jennifer Ayala         138    4.69           71       True


# **Investigate patterns in ride-sharing data to analyze peak hours, average trip distances, and trip durations across different city zones**

## **Load Data**

In [6]:
import pandas as pd

# Load the drivers table; `df` is the frame the following cells work from
df = pd.read_csv(filepath_or_buffer='/content/extracted_files/drivers.csv')

# Bare expression so the notebook renders the frame
df

Unnamed: 0,driver_id,name,vehicle_id,rating,total_rides,available
0,1,Virginia Johnson,284,3.44,30,True
1,2,Daniel Maddox,18,2.59,61,False
2,3,Patricia Salazar,298,4.93,75,True
3,4,Cameron Hinton DDS,49,2.59,12,True
4,5,Jennifer Ayala,138,4.69,71,True
...,...,...,...,...,...,...
295,296,Jason Pace,115,4.79,73,False
296,297,Dr. Kelly Martin,40,1.54,119,True
297,298,Brooke Snyder,22,4.75,100,False
298,299,Bruce Robinson,141,3.04,179,False


## Data preparation
  **Separating the data into features (X) and target (y)**

In [7]:
# Target vector: the `rating` column of the drivers table
y = df.loc[:, 'rating']
y

Unnamed: 0,rating
0,3.44
1,2.59
2,4.93
3,2.59
4,4.69
...,...
295,4.79
296,1.54
297,4.75
298,3.04


In [8]:
# Feature frame: every column of df except the target
X = df.drop(columns=['rating'])
X

Unnamed: 0,driver_id,name,vehicle_id,total_rides,available
0,1,Virginia Johnson,284,30,True
1,2,Daniel Maddox,18,61,False
2,3,Patricia Salazar,298,75,True
3,4,Cameron Hinton DDS,49,12,True
4,5,Jennifer Ayala,138,71,True
...,...,...,...,...,...
295,296,Jason Pace,115,73,False
296,297,Dr. Kelly Martin,40,119,True
297,298,Brooke Snyder,22,100,False
298,299,Bruce Robinson,141,179,False


## Data Splitting

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=100,
    test_size=0.2,
)


In [11]:
# Display the training-feature frame produced by train_test_split
X_train

Unnamed: 0,driver_id,name,vehicle_id,total_rides,available
118,119,Jessica Lewis,272,184,False
289,290,Angela Li,23,173,True
59,60,Jacob Durham,138,101,False
69,70,David Chavez,206,183,True
161,162,Steven Lopez,188,123,False
...,...,...,...,...,...
66,67,Michael Smith,201,178,True
53,54,Zachary Ball,159,188,False
79,80,Dalton Chan,274,27,False
280,281,Cynthia Jones,210,52,False


## Model Building

### Linear Regression Model Training

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns; all remaining columns pass through unchanged
categorical_cols = ['name', 'available']
one_hot = OneHotEncoder(handle_unknown='ignore')  # categories unseen at fit time are ignored, not an error
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[('cat', one_hot, categorical_cols)],
)

# NOTE(review): `name` looks close to unique per driver, so encoding it yields
# roughly one column per training row (the shape check further down reports
# 243 columns for 240 rows) -- a likely driver of the overfitting; confirm
# whether it should be a feature at all.

# Learn the encoding from the training split only, then reuse it on the test split
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)


In [13]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Work on explicit copies: X_train / X_test come out of train_test_split as
# selections of X, and assigning columns onto them directly can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Replace each categorical column with integer codes, in place on the copies.
# NOTE(review): the models in this notebook are fitted on X_train_encoded /
# X_test_encoded, so these label-encoded frames do not appear to feed any
# model -- confirm whether this cell is still needed.
for col in ['name', 'available']:
    le = LabelEncoder()
    # Fit on train + test together so transform() never meets a label it has
    # not seen during fit.
    combined_data = pd.concat([X_train[col], X_test[col]])
    le.fit(combined_data)
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])


In [14]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the one-hot encoded training matrix
lr = LinearRegression().fit(X_train_encoded, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test_encoded)


In [16]:
# Column labels of the train and test feature frames
for label, frame in (("Training features:", X_train), ("Test features:", X_test)):
    print(label, frame.columns)


Training features: Index(['driver_id', 'name', 'vehicle_id', 'total_rides', 'available'], dtype='object')
Test features: Index(['driver_id', 'name', 'vehicle_id', 'total_rides', 'available'], dtype='object')


In [17]:
# (rows, columns) of the train and test feature frames; the column counts
# should agree. These are the DataFrames, not the encoded matrices the model
# is fitted on.
train_dims, test_dims = X_train.shape, X_test.shape
print("X_train shape:", train_dims)
print("X_test shape:", test_dims)


X_train shape: (240, 5)
X_test shape: (60, 5)


In [18]:
# Score both splits with the fitted linear model (train first, then test)
y_lr_train_pred, y_lr_test_pred = (
    lr.predict(features) for features in (X_train_encoded, X_test_encoded)
)

In [19]:
# Display the linear model's predictions for the training split
y_lr_train_pred

array([1.31287112, 4.89543828, 3.32412942, 4.09456683, 4.98373289,
       1.55359786, 4.43424324, 4.17573541, 4.413401  , 1.15342976,
       2.24554664, 2.54289948, 4.09638269, 4.08147064, 2.93397206,
       3.98481391, 3.04413653, 2.60410261, 1.53359543, 1.64591739,
       2.7833499 , 4.63658104, 4.07589055, 4.22380914, 2.85441113,
       3.11312787, 2.96475137, 1.58574796, 4.75339902, 2.58664447,
       4.01294825, 3.87483999, 1.38455498, 2.30471879, 4.72554552,
       3.84584097, 1.47656757, 1.89361626, 2.37338326, 3.58622686,
       1.61380993, 3.96426614, 3.77173535, 1.35620997, 4.97598918,
       2.32355838, 1.67595809, 3.5152895 , 1.58482968, 3.38472272,
       3.08346342, 1.25383803, 3.5062376 , 4.16412674, 3.18423441,
       2.95354334, 4.25393374, 2.43321698, 3.27289609, 1.30573499,
       3.82592705, 4.27527719, 2.63414382, 1.29478919, 1.7935262 ,
       1.0846828 , 3.23401963, 1.02386551, 1.97593607, 3.60501758,
       4.45640058, 2.09606779, 1.05474584, 2.08379843, 1.16543

In [21]:
# Display the linear model's predictions for the test split
y_lr_test_pred

array([ 3.88375941,  5.11038822,  1.49291484,  1.99364378,  1.5282055 ,
        4.11012395,  3.88016093,  3.6520826 ,  3.51683995,  3.31205186,
        3.10304101,  0.0304448 ,  4.73250027,  3.01534393,  5.84273593,
        1.00757019,  6.45673317,  2.6990099 ,  3.26393575,  2.10759777,
        1.55938218,  5.52108207,  4.79376913,  5.48987422,  2.93943305,
        1.37362775,  0.32229961,  5.01918636,  1.79728993,  4.46787229,
        3.0699451 ,  3.14601738,  2.9358801 ,  5.03600156,  4.95168693,
        1.96734117,  6.07543349,  3.66298706,  4.28801648,  4.50180805,
        2.55379214,  2.34861544,  0.70827518,  2.69175935,  0.35158119,
        4.81024069,  5.77712815,  4.39033546,  3.30628768,  2.09294132,
        1.1633268 ,  3.19852309, -0.94506483, -0.30785057,  4.88239062,
        5.78166496,  1.81591247,  4.34639509,  3.05018708,  4.76259661])

In [22]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error and R^2 of the linear model on the training split ...
lr_train_mse, lr_train_r2 = (
    metric(y_train, y_lr_train_pred) for metric in (mean_squared_error, r2_score)
)

# ... and on the held-out test split
lr_test_mse, lr_test_r2 = (
    metric(y_test, y_lr_test_pred) for metric in (mean_squared_error, r2_score)
)


In [23]:
# Report the four linear-regression metrics, one per line
for caption, value in (
    ('LR MSE (Train): ', lr_train_mse),
    ('LR R2 (Train): ', lr_train_r2),
    ('LR MSE (Test): ', lr_test_mse),
    ('LR R2 (Test): ', lr_test_r2),
):
    print(caption, value)


LR MSE (Train):  1.82174432928949e-05
LR R2 (Train):  0.9999870012348427
LR MSE (Test):  4.86459152808696
LR R2 (Test):  -2.354093244442026


In [24]:
# One-row summary table of the linear-regression metrics.
# The row is passed as a nested list with explicit column labels rather than
# transposing a single mixed-type column: the transpose makes every column
# object dtype, whereas this keeps the metric columns numeric (float).
lr_results = pd.DataFrame(
    [['Linear regression', lr_train_mse, lr_train_r2, lr_test_mse, lr_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)


In [25]:
# Display the one-row linear-regression summary table
lr_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Linear regression,1.8e-05,0.999987,4.864592,-2.354093


In [27]:
# Shapes of the encoded matrices that the models are fitted on and scored with
for caption, matrix in (
    ("Training data shape:", X_train_encoded),
    ("Test data shape:", X_test_encoded),
):
    print(caption, matrix.shape)


Training data shape: (240, 243)
Test data shape: (60, 243)


## The linear regression model is overfitted, so I am comparing it with a random forest

In [28]:
from sklearn.ensemble import RandomForestRegressor

# Shallow random forest (trees capped at depth 2) with a fixed seed
rf = RandomForestRegressor(
    random_state=100,
    max_depth=2,
)
# Kept as the final expression so the notebook still shows the fitted estimator
rf.fit(X=X_train_encoded, y=y_train)

In [29]:
# Score both splits with the fitted random forest (train first, then test)
y_rf_train_pred, y_rf_test_pred = (
    rf.predict(features) for features in (X_train_encoded, X_test_encoded)
)

In [30]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error and R^2 of the random forest on the training split ...
rf_train_mse, rf_train_r2 = (
    metric(y_train, y_rf_train_pred) for metric in (mean_squared_error, r2_score)
)

# ... and on the held-out test split
rf_test_mse, rf_test_r2 = (
    metric(y_test, y_rf_test_pred) for metric in (mean_squared_error, r2_score)
)


# One-row summary table of the random-forest metrics.
# The row is passed as a nested list with explicit column labels rather than
# transposing a single mixed-type column: the transpose makes every column
# object dtype, whereas this keeps the metric columns numeric (float).
rf_results = pd.DataFrame(
    [['Random forest', rf_train_mse, rf_train_r2, rf_test_mse, rf_test_r2]],
    columns=['Method', 'Training MSE', 'Training R2', 'Test MSE', 'Test R2'],
)
# Bare expression so the notebook renders the table
rf_results

Unnamed: 0,Method,Training MSE,Training R2,Test MSE,Test R2
0,Random forest,1.301769,0.071144,1.484862,-0.023799
