In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# We ignore the warnings
warnings.filterwarnings("ignore")

Importing the data


In [2]:
df = pd.read_csv("./datasets/uber.csv")
data = df.copy()
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


1. Pre-processing the dataset


In [3]:
# print the dataset information
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [4]:
# Check the null values
data.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [5]:
# removing the null values
uber = data.drop(["Unnamed: 0", "key"], axis=1)
uber = uber.dropna(axis=0)

In [6]:
# Check the null values
uber.isnull().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [7]:
# Statistics of data
uber.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543
std,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


2. Identify the outliers


In [8]:
# Checking the outliers
uber.min()

fare_amount                            -52.0
pickup_datetime      2009-01-01 01:15:22 UTC
pickup_longitude                 -1340.64841
pickup_latitude                   -74.015515
dropoff_longitude                 -3356.6663
dropoff_latitude                 -881.985513
passenger_count                            0
dtype: object

In [9]:
# drop rows where fare amount is negative or zero
uber = uber.drop(uber[uber["fare_amount"] <= 0].index)
uber.min()

fare_amount                             0.01
pickup_datetime      2009-01-01 01:15:22 UTC
pickup_longitude                 -1340.64841
pickup_latitude                   -74.015515
dropoff_longitude                 -3356.6663
dropoff_latitude                 -881.985513
passenger_count                            0
dtype: object

In [10]:
# drop rows where number of passenger is zero
uber = uber.drop(uber[uber["passenger_count"] == 0].index)
uber.min()

fare_amount                             0.01
pickup_datetime      2009-01-01 01:15:22 UTC
pickup_longitude                 -1340.64841
pickup_latitude                   -74.015515
dropoff_longitude                 -3356.6663
dropoff_latitude                 -881.985513
passenger_count                            1
dtype: object

In [11]:
uber.dropna(inplace=True)

3. Correlation


In [12]:
# pickup_datetime is not in required data format
uber["pickup_datetime"] = pd.to_datetime(uber["pickup_datetime"])
corr = uber.corr()
corr.style.background_gradient(cmap="BuGn")

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
fare_amount,1.0,0.122884,0.01042,-0.008441,0.009064,-0.011098,0.009616
pickup_datetime,0.122884,1.0,0.009376,-0.009684,0.007918,-0.010703,0.005429
pickup_longitude,0.01042,0.009376,1.0,-0.815878,0.833047,-0.846383,-0.000247
pickup_latitude,-0.008441,-0.009684,-0.815878,1.0,-0.774615,0.701959,-0.001695
dropoff_longitude,0.009064,0.007918,0.833047,-0.774615,1.0,-0.916715,7.3e-05
dropoff_latitude,-0.011098,-0.010703,-0.846383,0.701959,-0.916715,1.0,-0.000686
passenger_count,0.009616,0.005429,-0.000247,-0.001695,7.3e-05,-0.000686,1.0


4. Linear Regression


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Take x as predictor variable
x = uber.drop("fare_amount", axis=1)
# And y as target variable
y = uber["fare_amount"]

In [15]:
# Necessary to apply model
x["pickup_datetime"] = pd.to_numeric(pd.to_datetime(x["pickup_datetime"]))
x = x.loc[:, x.columns.str.contains("^Unnamed")]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
print(type(X_train))
print(type(y_train))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [17]:
from sklearn.linear_model import LinearRegression

In [18]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)

ValueError: at least one array or dtype is required