<a href="https://www.kaggle.com/code/singhayush16/uber-fare-prediction?scriptVersionId=143788349" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
import joblib

In [None]:
# Load the raw cab-ride records from the Kaggle input directory.
cab_rides_csv='/kaggle/input/uber-lyft-cab-prices/cab_rides.csv'
car_data=pd.read_csv(cab_rides_csv)

In [None]:
car_data.shape

In [None]:
car_data.info()

In [None]:
# Report the frame's memory footprint in MiB, rounded to two decimals.
# memory_usage() is called with its defaults (shallow accounting).
car_data_bytes=car_data.memory_usage().sum()
car_data_mb=round(car_data_bytes/1024**2,2)
print(f"Memory usage by Car Ride Dataset: {car_data_mb} MB")

In [None]:
# Parse the raw timestamps into pandas datetimes.
# NOTE(review): `time_stamp` is assumed to hold Unix epoch values in
# milliseconds (13-digit integers) - confirm against the CSV. Without an
# explicit unit pandas reads integers as nanoseconds, which places every ride
# on 1970-01-01 and makes the derived Date/Time/Hour columns look constant.
car_data['Date-Time']=pd.to_datetime(car_data['time_stamp'],unit='ms')

In [None]:
car_data.info()

In [None]:
# Time-of-day component of each parsed timestamp.
car_data['Time']=car_data['Date-Time'].dt.time

In [None]:
# Calendar-date component of each parsed timestamp.
car_data['Date']=car_data['Date-Time'].dt.date

In [None]:
# Hour-of-day (integer) of each parsed timestamp.
car_data['Hour']=car_data['Date-Time'].dt.hour

In [None]:
car_data.Date.unique()

In [None]:
car_data.sample()

**Since all of the Uber data appears to be from the same time, we can't use the time information as a feature.**

In [None]:
car_data.columns

**We also keep only Uber's data, so we drop the *cab_type* column as well.**

In [None]:
# Keep only the Uber rides, then discard the raw timestamp, the columns derived
# from it, and the cab_type column (constant after filtering).
uber_rows=car_data[car_data['cab_type']=='Uber']
unused_columns=['time_stamp','cab_type','Date-Time', 'Time','Date', 'Hour']
uber=uber_rows.drop(columns=unused_columns)

In [None]:
uber.head()

In [None]:
uber.shape

In [None]:
uber.isnull().sum()

In [None]:
# Number of rows that have a price, computed from the frame instead of being
# hand-copied from the shape / isnull outputs above.
len(uber) - uber['price'].isnull().sum()

**Since we have plenty of data and the price is what we need to predict, we drop the rows with missing values in the price column for convenience.**

In [None]:
# Drop incomplete rows in place. No `subset` is given, so a row is removed if
# ANY column is missing, not only `price`.
uber.dropna(inplace=True)

In [None]:
uber.isnull().sum()

In [None]:
uber.shape

In [None]:
uber.info()

In [None]:
# Report the Uber subset's memory footprint in MiB, rounded to two decimals.
# memory_usage() is called with its defaults (shallow accounting).
uber_bytes=uber.memory_usage().sum()
uber_mb=round(uber_bytes/1024**2,2)
print(f"Memory usage by Uber Ride Dataset: {uber_mb} MB")

In [None]:
# Names of the columns stored with the generic `object` dtype; these are the
# candidates for categorical encoding.
categories=[column for column in uber.columns if uber[column].dtype=='object']

In [None]:
categories  # list of all columns whose dtype is object (treated as categorical)

In [None]:
uber.nunique()

**Since all ids are unique, we don't need to encode the id column; we also have the option of making it the index column of the uber dataset.**

In [None]:
# Categorical feature columns to encode: destination, source, product_id and name.
# ('dource' was a typo for the 'source' column.)
cat_encode=['destination','source','product_id','name']

In [None]:
# Check that pick-up and drop-off points draw from the same set of locations.
# `ndarray.sort()` sorts in place and returns None, so comparing its return
# values is always `None == None` (True); compare the unique values themselves.
set(uber['destination'].unique())==set(uber['source'].unique())

The above cell shows that the dataset has a limited number of locations.

In [None]:
uber.sample()

In [None]:
# Features are every column except the target and the id / surge_multiplier
# columns; the target is the fare price.
target_column='price'
X=uber.drop(columns=[target_column,'id','surge_multiplier'])
y=uber[target_column]

In [None]:
# Hold out a validation set using train_test_split's default proportions, with
# a fixed seed so the split is reproducible.
split_parts=train_test_split(X,y,random_state=42)
X_train,X_valid,y_train,y_valid=split_parts

In [None]:
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

In [None]:
X.sample()

In [None]:
X.info()

In [None]:
# One-hot encode the categorical columns, addressed by integer position in the
# input (presumably destination, source, product_id and name, per the step
# names - verify against X.info()). All other columns pass through unchanged.
sparse_encoder_specs=[
    ('encode_destination',1),
    ('encode_source',2),
    ('encode_product_id',3),
    ('encode_name',4),
]
trf=ColumnTransformer(
    transformers=[
        (step_name,OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=True),[position])
        for step_name,position in sparse_encoder_specs
    ],
    remainder='passthrough',
)

In [None]:
# Chain the column encoding and the linear regressor so they are fitted and
# applied together as one estimator.
pipeline_steps=[('trf',trf),('model',LinearRegression())]
pipe=Pipeline(steps=pipeline_steps)

In [None]:
# Mean of the 3-fold cross-validation scores on the training split.
cv_scores=cross_val_score(pipe,X_train,y_train,cv=3)
print(np.mean(cv_scores))

In [None]:
# Fit the encoders and the linear model on the training split.
pipe.fit(X_train,y_train)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
pipe.score(X_valid,y_valid)

In [None]:
# Predicted fares for the held-out validation features.
y_pred=pipe.predict(X_valid)

In [None]:
# Side-by-side view of actual fares, predicted fares and their difference.
residuals=y_valid-y_pred
pd.DataFrame({'Actual':y_valid,'Predicted':y_pred,'Diff.':residuals})

In [None]:
mean_absolute_percentage_error(y_valid,y_pred)

Our model's accuracy is about **92%**.

In [None]:
# Persist the pipeline (preprocessing + model) to disk with joblib.
pipeline_path='uber_fare_prediction_model.pkl'
joblib.dump(pipe,pipeline_path)

# Using ANN

In [None]:
import tensorflow 
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers

In [None]:
uber.sample()

In [None]:
# Same feature/target selection as for the linear model: drop the target and
# the id / surge_multiplier columns from the features.
ann_target='price'
X_ann=uber.drop(columns=[ann_target,'id','surge_multiplier'])
y_ann=uber[ann_target]

In [None]:
X_ann['distance'].hist()

In [None]:
# Train/test split for the neural network, seeded for reproducibility.
ann_split=train_test_split(X_ann,y_ann,random_state=42)
X_train_ann,X_test_ann,y_train_ann,y_test_ann=ann_split

In [None]:
X_ann.shape

In [None]:
X_ann.sample()

In [None]:
X_train_ann.shape

In [None]:
# Small feed-forward regressor: 33 inputs -> 15 -> 5 -> 1 linear output.
model=Sequential()

# Only the first layer declares the input width. NOTE(review): 33 is
# hard-coded and must equal the number of columns produced by the
# ColumnTransformer applied to the training features - confirm after encoding.
model.add(Dense(15,activation='relu',input_shape=[33]))
# Later layers take their input size from the previous layer, so the stray
# `input_shape=[5]` (which did not match the 15 incoming units) is removed.
model.add(Dense(5,activation='relu'))
model.add(Dense(1,activation='linear'))

# Optimise mean absolute error with Adam at a learning rate of 0.001.
model.compile(loss='mean_absolute_error',optimizer=optimizers.Adam(learning_rate=0.001))

In [None]:
# Same column encoding as the linear-model transformer, but producing dense
# arrays (sparse_output=False). Columns are addressed by integer position;
# everything not listed passes through unchanged.
dense_encoder_specs=[
    ('encode_destination',1),
    ('encode_source',2),
    ('encode_product_id',3),
    ('encode_name',4),
]
trf=ColumnTransformer(
    transformers=[
        (step_name,OneHotEncoder(drop='first',handle_unknown='ignore',sparse_output=False),[position])
        for step_name,position in dense_encoder_specs
    ],
    remainder='passthrough',
)

In [None]:
# Fit the encoder on the training features only, then reuse the fitted
# encoder on the test features.
# NOTE: both names are overwritten with the encoded arrays, so re-running this
# cell without re-running the split would feed encoded data back into `trf`.
X_train_ann=trf.fit_transform(X_train_ann)
X_test_ann=trf.transform(X_test_ann)

In [None]:
X_train_ann

In [None]:
# Train for 10 epochs, holding out 20% of the training data for validation;
# the returned History object is kept for the loss plot below.
history=model.fit(X_train_ann,y_train_ann,epochs=10,validation_split=0.2)

In [None]:
# Predict fares for the encoded test features and score them with MAPE.
# scikit-learn returns MAPE as a fraction (e.g. 0.08 means 8%), not a percentage.
y_pred_ann=model.predict(X_test_ann)
mean_absolute_percentage_error(y_test_ann,y_pred_ann)

In [None]:
y_pred_ann[:5]

In [None]:
y_test_ann.head()

In [None]:
# Report accuracy as 100% minus the mean absolute percentage error.
# scikit-learn's MAPE is a fraction (0.077 means 7.7%), so it is scaled to
# percent here; subtracting the raw fraction from 100 overstated the accuracy.
# The value is computed from the predictions instead of pasted from an earlier run.
mape_ann=mean_absolute_percentage_error(y_test_ann,y_pred_ann)
print(f"Accuracy:{100*(1-mape_ann):.3f}%")

In [None]:
history.history

In [None]:
# Plot the training and validation loss curves recorded during fitting.
loss_curves=(('loss','Training loss'),('val_loss','Validation loss'))
for history_key,curve_label in loss_curves:
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Save the Keras model to disk. NOTE(review): the .h5 suffix presumably selects
# the legacy HDF5 format - confirm this is the intended format for deployment.
model.save('my_model.h5')

In [None]:
uber.sample()