In [None]:
pip install pandas numpy seaborn matplotlib datetime

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import calendar
import datetime

In [None]:
# Read the ride records from 'uber.csv' (relative to the working directory)
# and show the first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='uber.csv')
df.head(n=5)

In [None]:
# Summarize the freshly loaded frame (DataFrame.info) before any cleaning is applied.
df.info()

In [None]:
# Descriptive statistics of the raw frame, shown as the cell output.
df.describe()

In [None]:
# List the column labels of the raw frame.
df.columns

In [None]:
# Remove the 'Unnamed: 0' and 'key' columns from the frame.
df = df.drop(columns=['Unnamed: 0', 'key'])

In [None]:
# Preview the frame after the 'Unnamed: 0' and 'key' columns were dropped.
df.head()

In [None]:
# Fill missing drop-off coordinates with a per-column central value.
# The filled Series is assigned back to the column: calling fillna(inplace=True)
# on a selected column is chained assignment, which pandas warns about and
# which leaves df unchanged when copy-on-write is enabled.
# NOTE(review): longitude uses the mean while latitude uses the median — kept
# as in the original; confirm whether the asymmetry is intended.
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].mean())
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].median())

In [None]:
# Preview the frame after filling the missing drop-off coordinates.
df.head()

In [None]:
# Count the missing values in each column.
df.isna().sum(axis=0)

In [None]:
# Column dtypes before pickup_datetime is converted with pd.to_datetime.
df.dtypes

In [None]:
# Convert pickup_datetime to datetimes; values that cannot be parsed are
# coerced (errors='coerce') instead of raising.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')

In [None]:
# Derive one column per calendar component of pickup_datetime, added in the
# order hour, day, month, year, dayofweek.
pickup_parts = df['pickup_datetime'].dt
df = df.assign(**{
    part: getattr(pickup_parts, part)
    for part in ('hour', 'day', 'month', 'year', 'dayofweek')
})

In [None]:
# Preview the frame with the added hour/day/month/year/dayofweek columns.
df.head()

In [None]:
# The timestamp has been split into separate columns, so drop the original column.
df = df.drop(columns='pickup_datetime')

In [None]:
# Preview the frame after the pickup_datetime column was dropped.
df.head()

In [None]:
# Column dtypes after pickup_datetime was replaced by its extracted parts.
df.dtypes

In [None]:
# One box plot per column on a 7x2 grid, drawn before the outlier treatment below.
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))

In [None]:
def remove_outlier(df1, col):
    """Clip one column of ``df1`` to its 1.5*IQR whisker range.

    Values below ``Q1 - 1.5*IQR`` are raised to that bound and values above
    ``Q3 + 1.5*IQR`` are lowered to that bound; no rows are removed.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the column. It is modified in place.
    col : hashable
        Label of the column to clip.

    Returns
    -------
    pandas.DataFrame
        The same ``df1`` object, with ``col`` clipped.
    """
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5 * IQR
    upper_whisker = Q3 + 1.5 * IQR
    # Write into the frame that was passed in (not a module-level `df`).
    df1[col] = df1[col].clip(lower=lower_whisker, upper=upper_whisker)
    return df1

def treat_outliers_all(df1, col_list):
    """Apply ``remove_outlier`` to every column named in ``col_list``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame whose columns are treated.
    col_list : iterable
        Column labels to process, one ``remove_outlier`` call per label.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last ``remove_outlier`` call, or ``df1``
        unchanged when ``col_list`` is empty.
    """
    for c in col_list:
        # Operate on the frame passed in, not on a module-level `df`.
        df1 = remove_outlier(df1, c)
    return df1

# Run the outlier treatment over every column of the frame, then redraw the
# per-column box plots on the same 7x2 grid.
all_columns = list(df.columns)
df = treat_outliers_all(df, all_columns)
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))

In [None]:
pip install haversine

In [None]:
import haversine as hs

In [None]:
# Distance between pickup and drop-off for every row, computed in one
# vectorized call (haversine's default unit, kilometres).
# Points are (latitude, longitude) pairs taken positionally with to_numpy(),
# so the computation does not depend on df having a 0..n-1 index.
pickup_points = df[['pickup_latitude', 'pickup_longitude']].to_numpy()
dropoff_points = df[['dropoff_latitude', 'dropoff_longitude']].to_numpy()
travel_dist = hs.haversine_vector(pickup_points, dropoff_points)
df['dist_travel_km'] = travel_dist
# Show a preview instead of printing every distance.
df.head()

In [None]:
# Keep only trips whose distance lies in [1, 130] km. Both bounds must hold,
# so the conditions are combined with `&`; with `|` every non-null distance
# satisfies at least one side and nothing is filtered out.
df = df.loc[(df.dist_travel_km >= 1) & (df.dist_travel_km <= 130)]
print("Remaining observations: ", df.shape)

In [None]:
# Select rows where any latitude lies outside [-90, 90] or any longitude lies
# outside [-180, 180], for either the pickup or the drop-off point.
bad_latitude = df[['pickup_latitude', 'dropoff_latitude']].abs().gt(90).any(axis=1)
bad_longitude = df[['pickup_longitude', 'dropoff_longitude']].abs().gt(180).any(axis=1)
incorrect_coordinates = df.loc[bad_latitude | bad_longitude]

In [None]:
# Remove the rows flagged in `incorrect_coordinates` by their index labels.
# Passing the DataFrame itself as the labels makes drop() look for rows named
# after its columns, which errors="ignore" then silently skips.
# errors="ignore" is kept so re-running the cell after the rows are gone is harmless.
df = df.drop(index=incorrect_coordinates.index, errors="ignore")
df.head()

In [None]:
# Heatmap of the boolean missing-value mask of the frame.
null_mask = df.isna()
sns.heatmap(data=null_mask)

In [None]:
# Pairwise correlation between the columns (pandas' default Pearson method),
# displayed as the cell output.
corr = df.corr(method='pearson')
corr

In [None]:
# Annotated heatmap of the correlation matrix, drawn on a 10x6 inch axes.
fig, axis = plt.subplots(figsize=(10, 6))
sns.heatmap(data=df.corr(), annot=True, ax=axis)

In [None]:
# Split the frame into the model inputs (x) and the value to predict (y).
feature_columns = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
    'passenger_count',
    'hour', 'day', 'month', 'year', 'dayofweek',
    'dist_travel_km',
]
x = df[feature_columns]
y = df['fare_amount']

In [None]:
pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training split.
regression = LinearRegression()
regression.fit(X_train, Y_train)
# A cell only displays its last expression, so show the coefficients and the
# intercept together as one tuple instead of two separate bare expressions.
regression.coef_, regression.intercept_

In [None]:
# Predict on the held-out features, print the predictions, and display the
# held-out target values as the cell output.
prediction = regression.predict(X=X_test)
print(prediction)
Y_test

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Print the R^2 of the linear model on the held-out split, then compute its
# mean squared error and display it as the cell output.
print(r2_score(y_true=Y_test, y_pred=prediction))
MSE = mean_squared_error(y_true=Y_test, y_pred=prediction)
MSE

In [None]:
# Root mean squared error of the linear model: the square root of MSE.
RMSE = np.sqrt(MSE)
RMSE

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split and predict the held-out rows.
# A fixed random_state makes the fitted forest and its predictions reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
y_pred

In [None]:
# Evaluate the random forest on the held-out split.
# r2_score is the coefficient of determination, not a classification accuracy,
# so it is labelled as such; the RMSE line is labelled like the others.
R2_Random = r2_score(Y_test, y_pred)
print("R2 score: ", R2_Random)
MSE_Random = mean_squared_error(Y_test, y_pred)
print("Mean Squared Error: ", MSE_Random)
RMSE_Random = np.sqrt(MSE_Random)
print("Root Mean Squared Error: ", RMSE_Random)