<a href="https://colab.research.google.com/github/arnoldkiirya1/Machine_Learning/blob/main/Ride_Ticket_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the raw training data. The steps below treat every row as one ticket,
# so a ride appears once per ticket sold.
train_data = pd.read_csv('Train.csv')

# Target variable: tickets per ride. count() tallies the non-null seat_number
# entries inside each ride_id group.
ride_seat_counts = (
    train_data.groupby('ride_id')['seat_number']
    .count()
    .rename('number_of_ticket')
    .reset_index()
)

# Collapse to one row per ride; the first ticket row of each ride is kept and
# supplies the ride-level fields (date, time, route, vehicle).
unique_train_data = train_data.drop_duplicates(subset='ride_id', keep='first')

# Attach the per-ride ticket count to the de-duplicated rides
new_train_data = unique_train_data.merge(ride_seat_counts, on='ride_id', how='left')

# Keep only ride-level columns plus the target. The ticket-level fields
# (seat_number, payment_method, payment_receipt) are left out directly instead
# of being selected first and dropped afterwards.
selected_columns = [
    'ride_id', 'travel_date', 'travel_time', 'travel_from', 'travel_to',
    'car_type', 'max_capacity', 'number_of_ticket'
]
# .copy() yields an independent frame, so later column assignments on
# final_train_data do not trigger pandas' SettingWithCopyWarning.
final_train_data = new_train_data[selected_columns].copy()

# Invariant of the aggregation above: exactly one row per ride
assert final_train_data['ride_id'].is_unique, "expected exactly one row per ride_id"

# Preview the per-ride training table
final_train_data.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,number_of_ticket
0,1442,17-10-17,7:15,Migori,Nairobi,Bus,49,1
1,5437,19-11-17,7:12,Migori,Nairobi,Bus,49,1
2,5710,26-11-17,7:05,Keroka,Nairobi,Bus,49,1
3,5777,27-11-17,7:10,Homa Bay,Nairobi,Bus,49,5
4,5778,27-11-17,7:12,Migori,Nairobi,Bus,49,31


In [None]:
# Convert a date string to the DD/MM/YYYY layout used for the training data
def convert_date_format(date_str, input_format='%d-%m-%y'):
    """Reformat a date string as ``DD/MM/YYYY``.

    Args:
        date_str: Date text to convert.
        input_format: ``strptime``-style pattern describing ``date_str``.
            Defaults to ``'%d-%m-%y'`` (for example ``'17-10-17'``).

    Returns:
        The date rendered with ``'%d/%m/%Y'``. If parsing or formatting raises
        ``ValueError`` (the text does not match ``input_format``), the input
        is returned unchanged.
    """
    try:
        parsed = pd.to_datetime(date_str, format=input_format)
        return parsed.strftime('%d/%m/%Y')
    except ValueError:
        # Deliberate best-effort: values that do not match are passed through
        return date_str

# Rewrite every training travel_date, value by value, with convert_date_format
converted_dates = final_train_data['travel_date'].apply(convert_date_format)
final_train_data['travel_date'] = converted_dates

In [None]:
# Show the first five rows to confirm the reformatted travel_date values
final_train_data.head(n=5)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,number_of_ticket
0,1442,17/10/2017,7:15,Migori,Nairobi,Bus,49,1
1,5437,19/11/2017,7:12,Migori,Nairobi,Bus,49,1
2,5710,26/11/2017,7:05,Keroka,Nairobi,Bus,49,1
3,5777,27/11/2017,7:10,Homa Bay,Nairobi,Bus,49,5
4,5778,27/11/2017,7:12,Migori,Nairobi,Bus,49,31


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# The per-ride table built earlier is the training set. This is an alias of
# the same DataFrame object, not a copy.
train_data = final_train_data

# Read the rides whose ticket counts have to be predicted
test_data = pd.read_csv('Test.csv')

# Preview the first five raw test rows
test_data.head(n=5)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,4446,2018-04-27,09:00,Kisii,Nairobi,shuttle,11
1,13962,2018-04-23,07:10,Homa Bay,Nairobi,Bus,49
2,5569,2018-04-24,07:20,Kisii,Nairobi,shuttle,11
3,1675,2018-05-01,11:01,Kisii,Nairobi,shuttle,11
4,5711,2018-04-22,10:51,Kisii,Nairobi,shuttle,11


In [None]:
# Convert a date string to the DD/MM/YYYY layout used for the testing data
def convert_date_format(date_str, input_format='%Y-%m-%d'):
    """Reformat a date string as ``DD/MM/YYYY``.

    Args:
        date_str: Date text to convert.
        input_format: ``strptime``-style pattern describing ``date_str``.
            Defaults to ``'%Y-%m-%d'`` (for example ``'2018-04-27'``).

    Returns:
        The date rendered with ``'%d/%m/%Y'``. If parsing or formatting raises
        ``ValueError`` (the text does not match ``input_format``), the input
        is returned unchanged.
    """
    try:
        parsed = pd.to_datetime(date_str, format=input_format)
        return parsed.strftime('%d/%m/%Y')
    except ValueError:
        # Deliberate best-effort: values that do not match are passed through
        return date_str

# Rewrite every test travel_date, value by value, with convert_date_format
converted_dates = test_data['travel_date'].apply(convert_date_format)
test_data['travel_date'] = converted_dates


In [None]:
# Show the first five rows to confirm the reformatted travel_date values
test_data.head(n=5)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,4446,27/04/2018,09:00,Kisii,Nairobi,shuttle,11
1,13962,23/04/2018,07:10,Homa Bay,Nairobi,Bus,49
2,5569,24/04/2018,07:20,Kisii,Nairobi,shuttle,11
3,1675,01/05/2018,11:01,Kisii,Nairobi,shuttle,11
4,5711,22/04/2018,10:51,Kisii,Nairobi,shuttle,11


In [None]:
# Build an independent, freshly indexed copy of the training table so that
# the encoding below does not modify train_data itself.
training_data = pd.concat([train_data], axis=0, ignore_index=True)

categorical_columns = ['ride_id', 'travel_from', 'travel_to', 'car_type']

# Fit one LabelEncoder per column and keep each of them. Values are cast to
# str first so every column is encoded from a single, uniformly typed set of
# labels. LabelEncoder assigns codes following the sorted order of its classes.
# NOTE(review): ride_id is an identifier; encoding its string form orders the
# codes lexicographically. Confirm it is really wanted as a model feature.
label_encoders = {}
for col in categorical_columns:
    encoder = LabelEncoder()
    training_data[col] = encoder.fit_transform(training_data[col].astype(str))
    label_encoders[col] = encoder

# Backward-compatible name: the encoder fitted on 'car_type'
label_encoder = label_encoders['car_type']

# classes_[0] == 'Bus' means 'Bus' received code 0. Flip the codes so that
# 'Bus' becomes 1 and the other vehicle class becomes 0. The check reads the
# car_type encoder explicitly rather than whichever column was fitted last.
if label_encoders['car_type'].classes_[0] == 'Bus':
    training_data['car_type'] = 1 - training_data['car_type']

training_data.head(10)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity,number_of_ticket
0,3138,17/10/2017,7:15,9,0,1,49,1
1,3146,19/11/2017,7:12,9,0,1,49,1
2,3147,26/11/2017,7:05,4,0,1,49,1
3,3148,27/11/2017,7:10,1,0,1,49,5
4,3149,27/11/2017,7:12,9,0,1,49,31
5,3150,27/11/2017,7:09,1,0,1,49,26
6,3151,20/04/2018,5:10,7,0,0,11,1
7,3152,20/04/2018,9:50,7,0,0,11,1
8,3153,20/04/2018,7:06,7,0,1,49,2
9,3154,20/04/2018,6:00,7,0,0,11,1


In [None]:
# Build an independent, freshly indexed copy of the test table so that the
# encoding below does not modify test_data (its ride_id values are needed
# unchanged for the submission).
testing_data = pd.concat([test_data], axis=0, ignore_index=True)

# Cast the categorical columns to str so their labels compare equal to the
# string-cast training labels.
for col in ['ride_id', 'travel_from', 'travel_to', 'car_type']:
    testing_data[col] = testing_data[col].astype(str)

# ride_id is still encoded on the test rides alone.
# NOTE(review): these codes are not comparable with the training ride_id
# codes; confirm whether ride_id should be a model feature at all.
label_encoder = LabelEncoder()
testing_data['ride_id'] = label_encoder.fit_transform(testing_data['ride_id'])

# Shared categories must receive the SAME integer codes as in training.
# Re-fitting an encoder on the test labels would number the towns differently,
# so the mapping is derived from the raw training labels instead
# (LabelEncoder numbers its sorted classes 0..n-1).
for col in ['travel_from', 'travel_to', 'car_type']:
    label_encoder = LabelEncoder().fit(train_data[col].astype(str))
    code_of = {str(label): code for code, label in enumerate(label_encoder.classes_)}
    if col == 'car_type' and label_encoder.classes_[0] == 'Bus':
        # Mirror the flip applied to the training car_type column
        # ('Bus' -> 1). Previously this step modified training_data by
        # mistake and left the test column untouched.
        code_of = {label: 1 - code for label, code in code_of.items()}
    # Labels never seen in training have no code; mark them with -1
    testing_data[col] = testing_data[col].map(code_of).fillna(-1).astype(int)

testing_data.head(10)

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,travel_to,car_type,max_capacity
0,660,27/04/2018,09:00,5,0,1,11
1,81,23/04/2018,07:10,1,0,0,49
2,791,24/04/2018,07:20,5,0,1,11
3,490,01/05/2018,11:01,5,0,1,11
4,842,22/04/2018,10:51,5,0,1,11
5,537,30/04/2018,19:04,5,0,0,49
6,366,02/05/2018,07:13,2,0,0,49
7,500,01/05/2018,19:04,5,0,0,49
8,414,03/05/2018,07:08,2,0,0,49
9,198,26/04/2018,23:10,14,0,0,49


In [None]:
def _add_calendar_features(frame):
    """Return a copy of ``frame`` with date/time columns turned into features.

    - ``travel_date`` (``DD/MM/YYYY`` text) is replaced by integer seconds
      since 1970-01-01.
    - ``travel_time`` is reduced to its hour and one-hot encoded as ``hour_*``.
    - The day of week of ``travel_date`` (Monday = 0) is one-hot encoded as
      ``day_*``.
    """
    frame = frame.copy()
    travel_dates = pd.to_datetime(frame['travel_date'], format='%d/%m/%Y')

    # Whole seconds since the epoch, independent of the datetime resolution
    frame['travel_date'] = (travel_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

    # Hour of departure, then one indicator column per observed hour
    frame['travel_time'] = pd.to_datetime(frame['travel_time']).dt.hour
    frame = pd.get_dummies(frame, columns=['travel_time'], prefix='hour')

    # Day of week read from the parsed dates, then one indicator per observed day
    frame['day_of_week'] = travel_dates.dt.dayofweek
    frame = pd.get_dummies(frame, columns=['day_of_week'], prefix='day')
    return frame


# Apply the identical transformation to both frames
training_data = _add_calendar_features(training_data)
testing_data = _add_calendar_features(testing_data)

# One-hot encoding each frame separately only creates columns for the hours
# and days present in that frame. Align the test columns to the training
# feature columns: indicators missing from the test set are added as 0,
# test-only indicators are removed, and the order matches training.
feature_columns = [col for col in training_data.columns if col != 'number_of_ticket']
testing_data = testing_data.reindex(columns=feature_columns, fill_value=0)


In [None]:

# Remove the raw travel_date column from the training features; travel_time
# no longer exists here because get_dummies replaced it with hour_* columns.
column_to_drop = ['travel_date']
training_data = training_data.drop(columns=column_to_drop)


In [None]:
# Show the first ten rows of the fully encoded training table
training_data.head(n=10)

Unnamed: 0,ride_id,travel_from,travel_to,car_type,max_capacity,number_of_ticket,hour_5,hour_6,hour_7,hour_8,...,hour_11,hour_19,hour_23,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,3138,9,0,0,49,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,3146,9,0,0,49,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,3147,4,0,0,49,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3148,1,0,0,49,5,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,3149,9,0,0,49,31,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
5,3150,1,0,0,49,26,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
6,3151,7,0,1,11,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
7,3152,7,0,1,11,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8,3153,7,0,0,49,2,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
9,3154,7,0,1,11,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# Remove the raw travel_date column from the test features, then preview the
# first ten encoded rows.
column_to_drop = ['travel_date']
testing_data = testing_data.drop(columns=column_to_drop)
testing_data.head(n=10)

Unnamed: 0,ride_id,travel_from,travel_to,car_type,max_capacity,hour_5,hour_6,hour_7,hour_8,hour_9,...,hour_11,hour_19,hour_23,day_0,day_1,day_2,day_3,day_4,day_5,day_6
0,660,5,0,1,11,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,81,1,0,0,49,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,791,5,0,1,11,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,490,5,0,1,11,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,842,5,0,1,11,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,537,5,0,0,49,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
6,366,2,0,0,49,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,500,5,0,0,49,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
8,414,2,0,0,49,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
9,198,14,0,0,49,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [None]:
# Split features and target variable
X = training_data.drop(columns=['number_of_ticket'])
y = training_data['number_of_ticket']

# Hold out 20% of the rides for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: statistics are learned on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Give the test features exactly the training feature columns, in the same
# order, before scaling. Indicator columns absent from the test set become 0.
X_test = testing_data.reindex(columns=X.columns, fill_value=0)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid explored by the cross-validated search
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'n_estimators': [50, 100, 200]
}

grid_search = GridSearchCV(estimator=XGBRegressor(random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           cv=5,
                           n_jobs=-1)

# Tune on the training split only, using the same scaled features the final
# model sees. Searching on the full (X, y) would let the validation rows
# influence the chosen hyperparameters and bias the MAE reported below.
grid_search.fit(X_train_scaled, y_train)

best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# GridSearchCV (refit=True by default) has already refit the best
# configuration on (X_train_scaled, y_train), so reuse that estimator instead
# of training an identical model a second time.
model = grid_search.best_estimator_

# Predictions on the held-out validation split
y_pred = model.predict(X_val_scaled)

# Evaluate the model
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict the test rides; ticket counts cannot be negative and must be whole
test_predictions = model.predict(X_test_scaled)
test_predictions = np.maximum(test_predictions, 0)
test_predictions = np.round(test_predictions).astype(int)

# Create submission DataFrame using the original (un-encoded) ride ids.
# NOTE: the frame is only built here; this cell does not write it to disk.
submission = pd.DataFrame({
    'ride_id': test_data['ride_id'],
    'number_of_ticket': test_predictions
})

Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 50}
Mean Absolute Error: 4.228912772560119
