In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Description

| Attribute | Description |
|----------|----------|
| key | A unique identifier for each trip |
| fare_amount | The cost of each trip in USD |
| pickup_datetime | Date and time when the meter was engaged |
| passenger_count | The number of passengers in the vehicle (driver-entered value) |
| pickup_longitude | The longitude where the meter was engaged |
| pickup_latitude | The latitude where the meter was engaged |
| dropoff_longitude | The longitude where the meter was disengaged |
| dropoff_latitude | The latitude where the meter was disengaged |

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Reading Data


In [None]:
df = pd.read_csv('/kaggle/input/uber-fares-dataset/uber.csv')


# Exploring Data

In [None]:
df.head()# Will give you first 5 records

In [None]:
#gain insights of your data set and see general information about each feature and the total number of non-null values in each variable
df.info()

A first observation: we are out of luck, because some columns (__dropoff_longitude__, __dropoff_latitude__) contain missing values. Let's see which variables contain missing values.
More on these values later.

The __data type__ of the __pickup_datetime__ column needs to be converted to datetime.

Remove the __Unnamed: 0__ and __key__ columns.

In [None]:
# Discard the 'Unnamed: 0' and 'key' columns in place; no later cell reads them.
df.drop(columns=['Unnamed: 0', 'key'], inplace=True)

In [None]:
df.head()

# Data Cleaning

### Check Duplicates

In [None]:
df.duplicated().sum()

## Check Missing Values

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
# Convert pickup_datetime to pandas datetime values so the `.dt` accessor
# can be used on it in the feature-extraction cell that follows.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [None]:
df.info()

In [None]:
# Split the pickup timestamp into one column per calendar/time component.
pickup_parts = df['pickup_datetime'].dt
for part in ('day', 'weekday', 'month', 'year', 'hour'):
    df[part] = getattr(pickup_parts, part)

# The raw timestamp is dropped once its components have been extracted.
df.drop(columns='pickup_datetime', inplace=True)

In [None]:
 #!pip install geopy

In [None]:
# calculate the distance between pickup and dropoff using geopy library
from geopy.distance import great_circle

In [None]:
# Filter Longitude and Latitude
def _nan_outside(val, lower, upper):
    """Replace anything outside the closed interval ``[lower, upper]`` with NaN.

    Parameters
    ----------
    val : scalar, pandas Series/DataFrame, or array-like
        Value(s) to validate.
    lower, upper : number
        Inclusive bounds of the valid range.

    Returns
    -------
    For a scalar: ``val`` itself when in range, otherwise ``np.nan``.
    For a pandas object: an object of the same type with out-of-range
    entries masked to NaN. For any other array-like: a float ndarray.
    NaN inputs stay NaN in every case.
    """
    if np.ndim(val) == 0:
        # Scalar path: one value in, one value out.
        return np.nan if (val < lower or val > upper) else val
    if isinstance(val, (pd.Series, pd.DataFrame)):
        # Vectorised path: keeps the index and avoids a Python call per element.
        return val.mask((val < lower) | (val > upper))
    values = np.asarray(val, dtype=float)
    return np.where((values < lower) | (values > upper), np.nan, values)


def filter_latitude(val):
    """Return ``val`` with latitudes outside [-90, 90] degrees replaced by NaN.

    Accepts a single value (as used with ``Series.apply``) or a whole
    Series/array, which is processed in one vectorised pass.
    """
    return _nan_outside(val, -90, 90)


def filter_longitude(val):
    """Return ``val`` with longitudes outside [-180, 180] degrees replaced by NaN.

    Accepts a single value (as used with ``Series.apply``) or a whole
    Series/array, which is processed in one vectorised pass.
    """
    return _nan_outside(val, -180, 180)

In [None]:
# Filter Longitude and Latitude

# Pair each coordinate column with the validator for its axis and replace
# every out-of-range value with NaN.
for coord_col, validator in (
    ('pickup_longitude', filter_longitude),
    ('pickup_latitude', filter_latitude),
    ('dropoff_longitude', filter_longitude),
    ('dropoff_latitude', filter_latitude),
):
    df[coord_col] = df[coord_col].apply(validator)

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
from geopy.distance import great_circle

def distance_km(x):
    """Great-circle distance, in kilometres, between a trip's pickup and dropoff.

    Parameters
    ----------
    x : mapping or DataFrame row
        Must provide 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude'.

    Returns
    -------
    The ``.km`` value of geopy's ``great_circle`` for the two points.
    """
    lat_from, lon_from = x['pickup_latitude'], x['pickup_longitude']
    lat_to, lon_to = x['dropoff_latitude'], x['dropoff_longitude']
    # geopy expects (latitude, longitude) ordering for each point.
    return great_circle((lat_from, lon_from), (lat_to, lon_to)).km

In [None]:
pickup = (40.6441666667, -73.7822222222)
dropoff = (40.6413111, -73.7881761)
great_circle(pickup, dropoff).km

In [None]:
# Compute the trip distance row by row (axis=1 hands each row to distance_km).
df['distance_km'] = df.apply(distance_km, axis=1)

In [None]:
df.head()

In [None]:
df.drop(['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], axis=1, inplace=True)

In [None]:
df.head()

In [None]:
df.describe()

- __fare_amount__: values <= 0 are invalid
- __distance_km__: values equal to 0 are invalid

In [None]:
# Remove rows with a non-positive fare or a zero-length trip.
invalid_rows = df.index[(df['fare_amount'] <= 0) | (df['distance_km'] == 0)]
df.drop(invalid_rows, inplace=True)

In [None]:
df.shape  

# Data Visualisation  

In [None]:
px.scatter(df, x='distance_km', y='fare_amount', width=700, height=500)

In [None]:
px.box(df, y='distance_km', width=700, height=500)

In [None]:
px.histogram(df, x='distance_km', width=700, height=500)

In [None]:
df.distance_km.describe()

In [None]:
df[df['distance_km'] > 100]

In [None]:
df.distance_km.quantile([0.8, 0.9, 0.95, 0.99, 0.995, 0.999])

In [None]:
# Remove trips longer than 100 km (treated as outliers).
df.drop(df.index[df['distance_km'] > 100], inplace=True)

In [None]:
px.box(df, y='distance_km', width=700, height=500)

In [None]:
df.describe()

In [None]:
px.scatter(df, x='distance_km', y='fare_amount', width=700, height=500)

In [None]:
px.box(df, y='fare_amount', width=700, height=500)

In [None]:
# Remove fares above 200 (treated as outliers).
df.drop(df.index[df['fare_amount'] > 200], inplace=True)

In [None]:
px.scatter(df, x='distance_km', y='fare_amount', width=700, height=500)

In [None]:
df[df['distance_km'] < 0.5]

In [None]:
# Remove trips shorter than 1 km.
# NOTE(review): the preceding cell inspects trips under 0.5 km, but the cut-off
# applied here is 1 km — confirm which threshold is intended.
df.drop(df.index[df['distance_km'] < 1], inplace=True)

In [None]:
px.scatter(df, x='distance_km', y='fare_amount', width=700, height=500)

In [None]:
df.fare_amount.describe()

In [None]:
df[df['fare_amount']< 1]

In [None]:
px.box(df, y='fare_amount', width=700, height=500)

In [None]:
df.head()

In [None]:
# One count plot per discrete feature, each shown as its own figure.
# NOTE(review): 'month' is also a derived feature but is not plotted here —
# confirm that the omission is intentional.
for col in ('passenger_count', 'day', 'weekday', 'year', 'hour'):
    sns.countplot(x=col, data=df)
    plt.xticks(rotation=45)
    plt.show()

# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Data Splitting
X = df.drop('fare_amount', axis=1)
y = df['fare_amount']

In [None]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
X_train.columns

In [None]:
df.columns

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing for numerical data
# Columns that are routed through the scaler.
NUMERIC_FEATURES = ['passenger_count', 'day', 'weekday', 'month', 'year', 'hour', 'distance_km']

numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Bundle preprocessing for numerical
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, NUMERIC_FEATURES)]
)

# Fit on the training split only, then reuse the fitted transformer on the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Model Building

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline linear model on the scaled features; .score() returns R^2 for regressors.
lr = LinearRegression().fit(X_train_preprocessed, y_train)
train_score = lr.score(X_train_preprocessed, y_train)
test_score = lr.score(X_test_preprocessed, y_test)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with scikit-learn's default settings.
r = Ridge().fit(X_train_preprocessed, y_train)

train_score = r.score(X_train_preprocessed, y_train)
test_score = r.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
from sklearn.linear_model import Lasso

# Lasso regression with scikit-learn's default settings.
lasso = Lasso().fit(X_train_preprocessed, y_train)

train_score = lasso.score(X_train_preprocessed, y_train)
test_score = lasso.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 feature expansion, fitted on the training split and reused for the test split.
poly_reg = PolynomialFeatures(degree=2)
X_poly_train, X_poly_test = (
    poly_reg.fit_transform(X_train_preprocessed),
    poly_reg.transform(X_test_preprocessed),
)

# Linear regression on the expanded features.
lin_reg_2 = LinearRegression().fit(X_poly_train, y_train)

train_score = lin_reg_2.score(X_poly_train, y_train)
test_score = lin_reg_2.score(X_poly_test, y_test)
print('Train Score: ', train_score)
print('Test Score: ', test_score)

In [None]:
# DecisionTreeRegressor

from sklearn.tree import DecisionTreeRegressor

# Seed the tree: scikit-learn permutes the features at every split, so ties
# between equally good splits can resolve differently from run to run unless
# random_state is fixed. 42 matches the seed used elsewhere in this notebook.
dt = DecisionTreeRegressor(random_state=42)

dt.fit(X_train_preprocessed, y_train)

# R^2 on the training and test splits.
print('Training Score: ', dt.score(X_train_preprocessed, y_train))
print('Testing Score: ', dt.score(X_test_preprocessed, y_test))

In [None]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# Small seeded forest: 10 trees, each limited to depth 10.
rf = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=42).fit(
    X_train_preprocessed, y_train
)

train_score = rf.score(X_train_preprocessed, y_train)
test_score = rf.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameters for the random forest.
parameters = {
    'n_estimators':range(8,12),
    'max_depth': range(5,25),
}
# Seed both the forest and the parameter sampler; without this the sampled
# candidates and the resulting best_params_ change on every run.
clf = RandomForestRegressor(random_state=42)
RCV = RandomizedSearchCV(estimator=clf, 
                   param_distributions=parameters, 
                   n_iter=3,
                   cv=3,
                   random_state=42)

In [None]:
RCV.fit(X_train_preprocessed,y_train)


In [None]:
RCV.best_params_


In [None]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# NOTE(review): these hyper-parameters are hard-coded; presumably they were
# copied from a previous RCV.best_params_ output — confirm against the search.
rf = RandomForestRegressor(n_estimators=11, max_depth=6, random_state=42).fit(
    X_train_preprocessed, y_train
)

train_score = rf.score(X_train_preprocessed, y_train)
test_score = rf.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive grid: every n_estimators x max_depth combination is cross-validated.
parameters = {
    'n_estimators':range(8,12),
    'max_depth': range(5,25),
}
# Seed the forest so each grid point gets the same score on every run and
# best_params_ / best_score_ are reproducible.
clf = RandomForestRegressor(random_state=42)
GCV = GridSearchCV(estimator=clf, 
                   param_grid=parameters, 
                   cv=3)

In [None]:
GCV.fit(X_train_preprocessed , y_train)


In [None]:
GCV.best_params_


In [None]:
GCV.best_score_


In [None]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

# NOTE(review): these hyper-parameters are hard-coded; presumably they were
# copied from a previous GCV.best_params_ output — confirm against the search.
rf = RandomForestRegressor(n_estimators=9, max_depth=9, random_state=42).fit(
    X_train_preprocessed, y_train
)

train_score = rf.score(X_train_preprocessed, y_train)
test_score = rf.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
# Print each feature's importance, keyed by its position in the preprocessed matrix.
importance = rf.feature_importances_
for position, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, score))

In [None]:
pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_.round(3)}).sort_values('Importance', ascending=False)


In [None]:
# Bar chart of the random forest's feature importances, one bar per input column.
plt.figure(figsize=(12, 8))
sns.barplot(x=X_train.columns, y=rf.feature_importances_)

In [None]:
# Voting Regressor

from sklearn.ensemble import  VotingRegressor


lin_reg = LinearRegression()
# Seed the tree so the ensemble's fit (and therefore its scores) is reproducible;
# VotingRegressor does not seed its member estimators itself.
dt_reg = DecisionTreeRegressor(max_depth=10, random_state=42)

# Voting Regressor: combines the linear model and the depth-limited tree.
voting_reg = VotingRegressor(estimators=[('lr', lin_reg), ('dt', dt_reg)])

# Fitting the data
voting_reg.fit(X_train_preprocessed, y_train)

# Checking the score
print('Training Score: ', voting_reg.score(X_train_preprocessed, y_train))
print('Testing Score: ', voting_reg.score(X_test_preprocessed, y_test))

In [None]:
# BaggingRegressor

from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Only the tree is needed as the base estimator. Deliberately do NOT
# re-instantiate `lasso` (or the other linear models) here: doing so would
# overwrite the fitted Lasso model from the earlier cell with an unfitted one.
dt_reg = DecisionTreeRegressor(max_depth=10)

# 100 bootstrap-resampled trees, each given a random subset of the features
# (max_features=0.8); random_state makes the resampling reproducible.
bag_reg = BaggingRegressor(dt_reg, n_estimators=100, bootstrap=True, random_state=42,
                           max_features=0.8)


# Fitting the data
bag_reg.fit(X_train_preprocessed, y_train)

# Checking the score
print('Training Score: ', bag_reg.score(X_train_preprocessed, y_train))
print('Testing Score: ', bag_reg.score(X_test_preprocessed, y_test))

In [None]:
# AdaBoost Regressor

from sklearn.ensemble import AdaBoostRegressor

# Seeded AdaBoost ensemble with up to 200 boosting rounds, fitted on the training split.
ada_reg = AdaBoostRegressor(random_state=42, n_estimators=200).fit(
    X_train_preprocessed, y_train
)

# Checking the score
train_score = ada_reg.score(X_train_preprocessed, y_train)
test_score = ada_reg.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
# GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient boosting: 200 stages of depth-3 trees, fitted on the training split.
grad_reg = GradientBoostingRegressor(random_state=42, n_estimators=200, max_depth=3).fit(
    X_train_preprocessed, y_train
)

# Checking the score
train_score = grad_reg.score(X_train_preprocessed, y_train)
test_score = grad_reg.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameters for gradient boosting.
parameters = {
    'n_estimators':[100,200,300,400,500],
    'max_depth': range(2,8),
}
# Seed both the estimator and the parameter sampler; without this the sampled
# candidates and the resulting best_params_ / best_score_ change on every run.
clf = GradientBoostingRegressor(random_state=42)
RCV = RandomizedSearchCV(estimator=clf, 
                   param_distributions=parameters, 
                   n_iter=3,
                   cv=3,
                   random_state=42)

In [None]:
RCV.fit(X_train_preprocessed,y_train)


In [None]:
RCV.best_params_


In [None]:
RCV.best_score_


In [None]:
# GradientBoostingRegressor

from sklearn.ensemble import GradientBoostingRegressor

# This is the model that is pickled at the end of the notebook, so it is seeded
# (same seed as the earlier gradient-boosting cell) to make the saved artefact
# and the reported scores reproducible.
grad_reg = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42)

# Fitting the data
grad_reg.fit(X_train_preprocessed, y_train)

# Checking the score
print('Training Score: ', grad_reg.score(X_train_preprocessed, y_train))
print('Testing Score: ', grad_reg.score(X_test_preprocessed, y_test))

In [None]:
#pip install xgboost


In [None]:
# XGBoost

from xgboost import XGBRegressor

# Seeded XGBoost regressor: 300 boosting rounds of depth-3 trees, fitted on the training split.
xgb = XGBRegressor(random_state=42, n_estimators=300, max_depth=3).fit(
    X_train_preprocessed, y_train
)

# Checking the score
train_score = xgb.score(X_train_preprocessed, y_train)
test_score = xgb.score(X_test_preprocessed, y_test)
print('Training Score: ', train_score)
print('Testing Score: ', test_score)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameters for XGBoost.
parameters = {
    'n_estimators':[100,200,300,400,500],
    'max_depth': range(2,8),
}
# Seed the estimator (as in the XGBoost cell above) and the parameter sampler
# so the sampled candidates and best_params_ are the same on every run.
clf = XGBRegressor(random_state=42)
RCV = RandomizedSearchCV(estimator=clf, 
                   param_distributions=parameters, 
                   n_iter=3,
                   cv=3,
                   random_state=42)

In [None]:
RCV.fit(X_train_preprocessed,y_train)


In [None]:
RCV.best_params_


In [None]:
RCV.best_score_


In [None]:
# Summary table: train/test score of every fitted model, as a percentage
# rounded to 2 decimals. The values come from .score(), i.e. R^2 for these
# regressors, even though the columns are labelled "Accuracy".
#
# Each entry is (row label, fitted model, training features, test features);
# keeping them together avoids three parallel lists drifting out of sync.
# The polynomial model is scored on the degree-2 expanded features it was trained on.
_summary_models = [
    ('Linear Regression', lr, X_train_preprocessed, X_test_preprocessed),
    ('Polynomial Regression', lin_reg_2, X_poly_train, X_poly_test),
    ('DecisionTreeRegressor', dt, X_train_preprocessed, X_test_preprocessed),
    ('Random Forest Regressor', rf, X_train_preprocessed, X_test_preprocessed),
    ('Voting Regressor', voting_reg, X_train_preprocessed, X_test_preprocessed),
    ('BaggingRegressor', bag_reg, X_train_preprocessed, X_test_preprocessed),
    ('AdaBoost Regressor', ada_reg, X_train_preprocessed, X_test_preprocessed),
    ('GradientBoostingRegressor', grad_reg, X_train_preprocessed, X_test_preprocessed),
    ('XGBoost', xgb, X_train_preprocessed, X_test_preprocessed),
]

Models_Summarization = pd.DataFrame(
    {
        'Training Accuracy': [
            round(model.score(train_features, y_train) * 100, 2)
            for _, model, train_features, _ in _summary_models
        ],
        # Column label typo ('Teasting') corrected.
        'Testing Accuracy': [
            round(model.score(test_features, y_test) * 100, 2)
            for _, model, _, test_features in _summary_models
        ],
    },
    index=[label for label, *_ in _summary_models],
)

In [None]:
Models_Summarization

# Saving 

In [None]:
import pickle

In [None]:
# Preprocessor Saving
import pickle

# Context managers guarantee each file is flushed and closed as soon as it has
# been written, instead of leaving the handle open until garbage collection.
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)

# Model Saving
with open('model.pkl', 'wb') as model_file:
    pickle.dump(grad_reg, model_file)

## <center>Thanks For Following </center>
