In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from statistics import mean

In [0]:
# Read the three CSV inputs from the working directory. The generator yields
# the frames in the same order as the names on the left-hand side.
df_train, df_rider, df_sample = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Riders.csv', 'SampleSubmission.csv')
)

UNDERSTANDING THE DATA

In [0]:
# Show the dtype pandas assigned to each column of the training frame.
df_train.dtypes

In [0]:
# Show the dtype pandas assigned to each column of the riders frame.
df_rider.dtypes

In [0]:
# Statistical summary of the training frame as produced by DataFrame.describe().
df_train.describe()

In [0]:
# Statistical summary of the riders frame as produced by DataFrame.describe().
df_rider.describe()

DEALING WITH MISSING DATA

In [0]:
# Print the per-column non-null counts and dtypes of the training frame;
# a count below the row total marks a column with missing values.
df_train.info()

In [0]:
# Print the per-column non-null counts and dtypes of the riders frame;
# a count below the row total marks a column with missing values.
df_rider.info()

In [0]:
# Replace missing values in every numeric column with that column's mean.
# numeric_only=True is required: the frame also carries text columns, and
# pandas >= 2.0 raises on DataFrame.mean() instead of silently skipping them.
# Non-numeric columns are not filled, so any NaN they hold survives this step.
df_train = df_train.fillna(df_train.mean(numeric_only=True))

In [0]:
"""One of the Rider Id rows contains a NaN value"""
# Drop every row that still contains a missing value in any column.
# Rebinding the name (rather than inplace=True) follows the pandas
# recommendation and keeps the step usable inside a method chain.
df_train = df_train.dropna()

In [0]:
# Print the non-null counts again, after the fill and drop cells, to check
# that no column is short of the row total any more.
df_train.info()
# The bare string below is the last expression of the cell, so the notebook
# echoes it as the cell's output; it is a note, not a computed check.
"""There are no longer any null values"""

JOIN THE RIDERS AND TRAIN DATASETS

In [0]:
# Attach the rider attributes to the orders by matching on 'Rider Id'.
# The join is an inner join (the pandas default, spelled out here), so a row
# whose 'Rider Id' is missing from either frame does not appear in the result.
df = df_train.merge(df_rider, how='inner', on='Rider Id')

In [0]:
# Pairwise correlation of the numeric (and boolean) columns of the joined frame.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# text columns silently inside DataFrame.corr() and raises on them instead;
# select_dtypes keeps the cell working on older pandas versions as well.
corr = df.select_dtypes(include=['number', 'bool']).corr()
# Diverging palette centred on 0 with the colour scale pinned to [-1, 1],
# so positive and negative correlations are directly comparable.
ax = sns.heatmap(corr, cmap=sns.diverging_palette(20, 220, n=200), vmin=-1, vmax=1, center=0)

ANALYSE THE JOINED DATASETS

In [0]:
# Show the dtype of each column of the joined orders/riders frame.
df.dtypes

In [0]:
"""Group data by Rider Id to show how the averages correlate"""
# Remove the listed columns (the ones the author identified as object-typed),
# then collapse the remaining columns to one row per 'Rider Id' holding the
# mean of each column for that rider.
df_grp = (
    df.drop(
        columns=[
            'Order No',
            'User Id',
            'Vehicle Type',
            'Personal or Business',
            'Placement - Time',
            'Confirmation - Time',
            'Arrival at Pickup - Time',
            'Pickup - Time',
            'Arrival at Destination - Time',
        ]
    )
    .groupby('Rider Id')
    .mean()
)
# Preview the first rows of the per-rider averages.
df_grp.head()

In [0]:
"""Heatmap of grouped dataset"""
# Correlation between the per-rider averages, drawn with a diverging palette
# centred on 0 and the colour scale pinned to [-1, 1].
corr = df_grp.corr()
ax = sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
)

SCATTER PLOTS

In [0]:
"""Function used to determine slope of best fit line"""
def best_fit_slope(X,y):
  """Return the slope of the least-squares line fitted to y as a function of X.

  Parameters
  ----------
  X, y : equal-length one-dimensional sequences of numbers (list, tuple,
      NumPy array or pandas Series). Values are paired by position.

  Returns
  -------
  float
      The slope sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2).
      NaN is returned when every X value is identical, because a vertical
      line has no defined slope.

  Raises
  ------
  ValueError
      If the inputs are empty or differ in length.
  """
  x_vals = np.asarray(X, dtype=float).ravel()
  y_vals = np.asarray(y, dtype=float).ravel()
  if x_vals.size == 0:
    raise ValueError("best_fit_slope requires at least one data point")
  if x_vals.size != y_vals.size:
    raise ValueError("X and y must contain the same number of values")
  # Compare the extremes rather than the computed variance: rounding in the
  # mean can leave a tiny non-zero spread for constant input.
  if x_vals.max() == x_vals.min():
    return float('nan')
  # Centred form of the slope; the raw-moment form mean(X)**2 - mean(X**2)
  # loses all precision when X is large relative to its spread.
  x_dev = x_vals - x_vals.mean()
  y_dev = y_vals - y_vals.mean()
  return float(np.dot(x_dev, y_dev) / np.dot(x_dev, x_dev))

In [0]:
"""Function used to determine y-intercept of best fit line"""
def best_fit_yintercept(X, y, m):
  """Return the y-intercept of the line with slope m through (mean(X), mean(y)).

  X and y are the data the line was fitted to; m is the slope of that line.
  The result is mean(y) - m * mean(X).
  """
  y_centre = mean(y)
  x_centre = mean(X)
  return y_centre - (m * x_centre)

In [0]:
# Scatter of delivery time against the rider's average rating, with the
# least-squares line drawn in red on top.
X = df['Average_Rating']
y = df['Time from Pickup to Arrival']
m = best_fit_slope(X,y)
b = best_fit_yintercept(X, y, m)
y_line = m*X + b
plt.plot(X, y_line, color = 'red')
plt.scatter(X, y)
# Label the figure from the Series names so it can be read on its own.
plt.xlabel(X.name)
plt.ylabel(y.name)
plt.title(f'{y.name} vs {X.name}');  # trailing ';' hides the text repr

In [0]:
# Scatter of delivery time against trip distance, with the least-squares
# line drawn in red on top.
X = df['Distance (KM)']
y = df['Time from Pickup to Arrival']
m = best_fit_slope(X,y)
b = best_fit_yintercept(X, y, m)
y_line = m*X + b
plt.plot(X, y_line, color = 'red')
plt.scatter(X, y)
# Label the figure from the Series names so it can be read on its own.
plt.xlabel(X.name)
plt.ylabel(y.name)
plt.title(f'{y.name} vs {X.name}');  # trailing ';' hides the text repr

In [0]:
# Scatter of the number of ratings against the average rating, with the
# least-squares line drawn in red on top.
X = df['Average_Rating']
y = df['No_of_Ratings']
m = best_fit_slope(X,y)
b = best_fit_yintercept(X, y, m)
y_line = m*X + b
plt.plot(X, y_line, color = 'red')
plt.scatter(X, y)
# Label the figure from the Series names so it can be read on its own.
plt.xlabel(X.name)
plt.ylabel(y.name)
plt.title(f'{y.name} vs {X.name}');  # trailing ';' hides the text repr