In [None]:
# IPython line magic: draw Matplotlib figures inline, in the output area of the
# cell that creates them.
%matplotlib inline

In [None]:
# Dependencies and setup
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
import scipy.stats as sts
# Import mpl to change the plot configurations using rcParams.
import matplotlib as mpl

In [None]:
# Build OS-independent paths to the two input CSV files in the Resources folder.
city_data_to_load, ride_data_to_load = (
    os.path.join("Resources", csv_name)
    for csv_name in ("city_data.csv", "ride_data.csv")
)

In [None]:
# Load both CSV files into pandas DataFrames (city data first, then ride data).
city_data_df, ride_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (city_data_to_load, ride_data_to_load)
)

In [None]:
city_data_df.head()

In [None]:
ride_data_df.head()

In [None]:
# Get the columns and rows that are not null
city_data_df.count()


In [None]:
# To make sure there are no nulll values use isnull()
city_data_df.isnull().sum()

In [None]:
# Get the data types of each column
city_data_df.dtypes

In [None]:
# Get the unique data points for each type of city
city_data_df['type'].unique()

In [None]:
sum(city_data_df['type']=='Urban')

In [None]:
sum((city_data_df.type=='Suburban') | (city_data_df.type=='Rural'))

In [None]:
sum(city_data_df.type=='Suburban')

In [None]:
sum(city_data_df.type=='Rural')

In [None]:
# Inspect ride data
ride_data_df.count()

In [None]:
# Get the columns and rows that are not null
ride_data_df.isnull().sum()

In [None]:
ride_data_df.dtypes

In [None]:
# Merge the DataFrames: keep every ride (left join) and attach the columns of
# the matching city. "city" is the single shared key column.
# validate="many_to_one" makes pandas raise a MergeError if a city appears more
# than once in city_data_df, which would otherwise silently duplicate ride rows
# and inflate every ride count and fare total computed from the merged frame.
pyber_data_df = pd.merge(
    ride_data_df,
    city_data_df,
    how="left",
    on="city",
    validate="many_to_one",
)

In [None]:
pyber_data_df.head()

In [None]:
# Create the Urban city DataFrame
urban_cities_df = pyber_data_df[pyber_data_df["type"]=='Urban']

In [None]:
urban_cities_df.head()

In [None]:
suburban_cities_df = pyber_data_df[pyber_data_df['type']=='Suburban']

In [None]:
suburban_cities_df.head()

In [None]:
rural_cities_df = pyber_data_df[pyber_data_df['type']=='Rural']

In [None]:
rural_cities_df.head()

In [None]:
# Get the number of rides for urban cities
urban_ride_count = urban_cities_df.groupby(['city']).count()['ride_id']
urban_ride_count.head()

In [None]:
suburban_ride_count = suburban_cities_df.groupby(['city']).count()['ride_id']
suburban_ride_count.head()

In [None]:
rural_ride_count = rural_cities_df.groupby(['city']).count()['ride_id']
rural_ride_count.head()

In [None]:
# Get average fare for each city in the urban cities
urban_avg_fare = urban_cities_df.groupby(['city']).mean()['fare']
urban_avg_fare.head()

In [None]:
suburban_avg_fare = suburban_cities_df.groupby(['city']).mean()['fare']
suburban_avg_fare.head()

In [None]:
rural_avg_fare = rural_cities_df.groupby(['city']).mean()['fare']
rural_avg_fare.head()

In [None]:
urban_driver_count = urban_cities_df.groupby(['city']).mean()['driver_count']
urban_driver_count.head()

In [None]:
suburban_driver_count= suburban_cities_df.groupby(['city']).mean()['driver_count']
suburban_driver_count.head()

In [None]:
rural_driver_count = rural_cities_df.groupby(['city']).mean()['driver_count']
rural_driver_count.head()

In [None]:
# Bubble chart for the urban cities: x = rides per city, y = average fare,
# marker size = 10 x the city's driver count.
plt.scatter(
    urban_ride_count,
    urban_avg_fare,
    s=urban_driver_count * 10,
    c='coral',
    edgecolor='black',
    linewidths=1,
    alpha=.8,
    label='Urban',
)
plt.title('PyBer Ride-Sharing Data (2019)')
plt.xlabel('Total Number of rides (Per City)')
plt.ylabel('Average Fare ($)')
plt.grid(True)
# Add the legend
plt.legend()

In [None]:
# Bubble chart for the suburban cities: x = rides per city, y = average fare.
# Marker sizes must come from the SUBURBAN driver counts so that they have one
# entry per plotted city; the urban Series covers a different set of cities.
plt.scatter(
    suburban_ride_count,
    suburban_avg_fare,
    s=10 * suburban_driver_count,
    c='skyblue',
    edgecolor='black',
    linewidths=1,
    alpha=.8,
    label='Suburban',
)
plt.title('PyBer Ride Sharing Data (2019)')
plt.ylabel('Average Fare ($)')
plt.xlabel('Total Number of Rides (Per City)')
plt.grid(True)
# Add the legend
plt.legend()

In [None]:
# Bubble chart for the rural cities: x = rides per city, y = average fare,
# marker size = 10 x the city's driver count.
plt.scatter(
    rural_ride_count,
    rural_avg_fare,
    s=rural_driver_count * 10,
    c='gold',
    edgecolor='black',
    linewidths=1,
    alpha=.8,
    label='Rural',
)
plt.title('PyBer Ride Sharing Data (2019)')
plt.xlabel('Total number of Rides (Per City)')
plt.ylabel("Average Fare ($)")
plt.grid(True)
# Add the legend
plt.legend()

In [None]:
# Make sure the output folder exists; this and later cells save figures into it.
os.makedirs("analysis", exist_ok=True)

# Add the scatter charts for each type of city
plt.subplots(figsize=(10, 6))
plt.scatter(urban_ride_count, urban_avg_fare, s=10 * urban_driver_count, c='coral', edgecolor='black', linewidths=1, alpha=.8, label='Urban')
plt.scatter(suburban_ride_count, suburban_avg_fare, s=10 * suburban_driver_count, c='skyblue', edgecolor='black', linewidths=1, alpha=.8, label='Suburban')
plt.scatter(rural_ride_count, rural_avg_fare, s=10 * rural_driver_count, c='gold', edgecolor='black', linewidths=1, alpha=.8, label='Rural')

# Graph properties
plt.title('PyBer Ride Sharing Data (2019)', fontsize=20)
plt.ylabel('Average Fare ($)', fontsize=12)
plt.xlabel('Total Number of Rides (Per City)', fontsize=12)
plt.grid(True)

# Add the legend. No `mode` is passed: the only recognised value is "expand",
# so the default (fixed-width) layout is used.
lgnd = plt.legend(fontsize=12, scatterpoints=1, loc='best', title='City Types')
# Give every legend marker the same size so the legend does not inherit the
# data-dependent bubble sizes. `legend_handles` is the current attribute name;
# older Matplotlib releases only provide `legendHandles`.
legend_markers = lgnd.legend_handles if hasattr(lgnd, "legend_handles") else lgnd.legendHandles
for legend_marker in legend_markers:
    legend_marker.set_sizes([75])
lgnd.get_title().set_fontsize(12)
# Incorporate a text label about circle size.
plt.text(42, 35, "Note:\nCircle size correlates\nwith driver count per city.", fontsize=12)

# Save the figure BEFORE showing it: with the inline backend plt.show()
# finishes the current figure, so a later savefig() would write an empty image.
plt.savefig("analysis/Fig1.png")
# Show the plot
plt.show()

In [None]:
# Get summary statistics
urban_ride_count.describe()

In [None]:
suburban_ride_count.describe()

In [None]:
rural_ride_count.describe()

In [None]:
# Calculate the mean of the ride count for each city type.
round(urban_ride_count.mean(),2), round(suburban_ride_count.mean(),2), round(rural_ride_count.mean(),2)

In [None]:
# Calculate the median of the ride count for each city type
round(urban_ride_count.median(),2), round(suburban_ride_count.median(),2), round(rural_ride_count.median(),2)

In [None]:
urban_ride_count.mode(), suburban_ride_count.mode(), rural_ride_count.mode()

In [None]:
#Calculate the measures of central tendency for the ride count for the urban cities
mean_urban_ride_count = np.mean(urban_ride_count)
print(f"The mean for the ride counts for the urban trips is {mean_urban_ride_count:.2f}.")
median_urban_ride_count = np.median(urban_ride_count)
print(f"The median for the ride counts for the urban trips is {median_urban_ride_count:.2f}.")
mode_urban_ride_count= sts.mode(urban_ride_count)
print(f"The mode for the ride counts for the urban trips is {mode_urban_ride_count}.")

In [None]:
#Calculate the measures of central tendency for the ride count for the suburban cities
mean_suburban_ride_count = np.mean(suburban_ride_count)
print(f"The mean for the ride counts for the suburban trips is {mean_suburban_ride_count:.2f}.")
median_suburban_ride_count = np.median(suburban_ride_count)
print(f"The median for the ride counts for the suburban trips is {median_suburban_ride_count:.2f}.")
mode_suburban_ride_count= sts.mode(suburban_ride_count)
print(f"The mode for the ride counts for the suburban trips is {mode_suburban_ride_count}.")

In [None]:
#Calculate the measures of central tendency for the ride count for the suburban cities
mean_rural_ride_count = np.mean(rural_ride_count)
print(f"The mean for the ride counts for the rural trips is {mean_rural_ride_count:.2f}.")
median_rural_ride_count = np.median(rural_ride_count)
print(f"The median for the ride counts for the rural trips is {median_rural_ride_count:.2f}.")
mode_rural_ride_count= sts.mode(rural_ride_count)
print(f"The mode for the ride counts for the rural trips is {mode_rural_ride_count}.")

In [None]:
# Get the fares for the urban cities
urban_fares = urban_cities_df['fare']
urban_fares.head()

In [None]:
# Calculate the measures of central tendency for the average fare for the urban cities.
mean_urban_fares = np.mean(urban_fares)
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")

median_urban_fares = np.median(urban_fares)
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")

mode_urban_fares = sts.mode(urban_fares)
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

In [None]:
# Get the fares for the urban cities
suburban_fares = suburban_cities_df['fare']
suburban_fares.head()

In [None]:
# Calculate the measures of central tendency for the average fare for the urban cities.
mean_suburban_fares = np.mean(suburban_fares)
print(f"The mean fare price for suburban trips is ${mean_suburban_fares:.2f}.")

median_suburban_fares = np.median(suburban_fares)
print(f"The median fare price for suburban trips is ${median_suburban_fares:.2f}.")

mode_suburban_fares = sts.mode(suburban_fares)
print(f"The mode fare price for suburban trips is {mode_suburban_fares}.")

In [None]:
# Get the fares for the urban cities
rural_fares = rural_cities_df['fare']
rural_fares.head()

In [None]:
# Calculate the measures of central tendency for the average fare for the urban cities.
mean_rural_fares = np.mean(rural_fares)
print(f"The mean fare price for rural trips is ${mean_rural_fares:.2f}.")

median_rural_fares = np.median(rural_fares)
print(f"The median fare price for rural trips is ${median_rural_fares:.2f}.")

mode_rural_fares = sts.mode(rural_fares)
print(f"The mode fare price for rural trips is {mode_rural_fares}.")

In [None]:
# Get the driver count data from the urban cities.
urban_drivers = urban_cities_df['driver_count']
urban_drivers.head()

In [None]:
mean_urban_drivers = np.mean(urban_drivers)
print(f"The mean amount of urban drivers is {mean_urban_drivers:.2f}.")
median_urban_drivers = np.median(urban_drivers)
print(f"The median amount of urban drivers is {median_urban_drivers:.2f}.")
mode_urban_drivers = sts.mode(urban_drivers)
print(f"The mode of the urban drivers is {mode_urban_drivers}.")

In [None]:
suburban_drivers = suburban_cities_df['driver_count']
suburban_drivers.head()

In [None]:
mean_suburban_drivers = np.mean(suburban_drivers)
print(f"The mean amount of suburban drivers is {mean_suburban_drivers:.2f}.")
median_suburban_drivers = np.median(suburban_drivers)
print(f"The median amount of suburban drivers is {median_suburban_drivers:.2f}.")
mode_suburban_drivers = sts.mode(suburban_drivers)
print(f"The mode of the suburban drivers is {mode_suburban_drivers}.")

In [None]:
rural_drivers = rural_cities_df['driver_count']
rural_drivers.head()

In [None]:
mean_rural_drivers = np.mean(rural_drivers)
print(f"The mean amount of rural drivers is {mean_rural_drivers:.2f}.")
median_rural_drivers = np.median(rural_drivers)
print(f"The median amount of rural drivers is {median_rural_drivers:.2f}.")
mode_rural_drivers = sts.mode(rural_drivers)
print(f"The mode of the rural drivers is {mode_rural_drivers}.")

In [None]:
# Box-and-whisker plot comparing the rides-per-city counts of the three city types.
x_labels = ['Urban', 'Suburban', 'Rural']
ride_count_data = [urban_ride_count, suburban_ride_count, rural_ride_count]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(ride_count_data, labels=x_labels)

# Title, axis labels, a y tick every 3 rides, and the grid.
ax.set_title('Ride Count Data (2019)', fontsize=20)
ax.set_xlabel('City Types', fontsize=14)
ax.set_ylabel('Number of Rides', fontsize=14)
ax.set_yticks(np.arange(0, 45, step=3.0))
ax.grid()

# Save the figure, then display it.
plt.savefig('analysis/Fig2.png')
plt.show()

In [None]:
# Get the urban city with the highest ride count (the outlier in the chart above).
# idxmax() returns the index label (city name) of the first maximum, so the
# lookup keeps working if the data changes instead of relying on a value read
# off the chart.
urban_city_outlier = urban_ride_count.idxmax()
print(f"{urban_city_outlier} has the highest rider count.")

In [None]:
# Create a box-and-whisker plot for the urban fare data.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_fares, labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 51, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_fares.describe()

In [None]:
# Create a box-and-whisker plot for the suburban fare data.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_fares, labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 51, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_fares.describe()

In [None]:
# Create a box-and-whisker plot for the urban fare data.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_fares, labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Ride Fare Data (2019)')
ax.set_ylabel('Fare($USD)')
ax.set_yticks(np.arange(0, 60, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
rural_fares.describe()

In [None]:
# Box-and-whisker plot comparing the individual ride fares of the three city types.
x_labels = ['Urban', 'Suburban', 'Rural']
ride_fares_data = [urban_fares, suburban_fares, rural_fares]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(ride_fares_data, labels=x_labels)

# Title, axis labels, a y tick every 5 units, and the grid.
ax.set_title('Ride Fare Data (2019)', fontsize=20)
ax.set_xlabel('City Types', fontsize=14)
ax.set_ylabel('Fares ($USD)', fontsize=14)
ax.set_yticks(np.arange(0, 61, step=5.0))
ax.grid()

# Save the figure, then display it.
plt.savefig('analysis/Fig3.png')
plt.show()

In [None]:
# Create the box-and-whisker plot for the urban driver count data.
x_labels = ["Urban"]
fig, ax = plt.subplots()
ax.boxplot(urban_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
urban_drivers.describe()

In [None]:
# Create the box-and-whisker plot for the urban driver count data.
x_labels = ["Suburban"]
fig, ax = plt.subplots()
ax.boxplot(suburban_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
suburban_drivers.describe()

In [None]:
# Create the box-and-whisker plot for the urban driver count data.
x_labels = ["Rural"]
fig, ax = plt.subplots()
ax.boxplot(rural_drivers,labels=x_labels)
# Add the title, y-axis label and grid.
ax.set_title('Driver Count Data (2019)')
ax.set_ylabel('Number of Drivers)')
ax.set_yticks(np.arange(0, 90, step=5.0))
ax.grid()
plt.show()
print("Summary Statistics")
rural_drivers.describe()

In [None]:
# Box-and-whisker plot comparing the driver count data of the three city types.
x_labels = ['Urban', 'Suburban', 'Rural']
drivers_data = [urban_drivers, suburban_drivers, rural_drivers]

fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(drivers_data, labels=x_labels)

# Title, axis labels, a y tick every 5 drivers, and the grid.
ax.set_title('Driver Data (2019)', fontsize=20)
ax.set_xlabel('City Types', fontsize=14)
ax.set_ylabel('Number of Drivers', fontsize=14)
ax.set_yticks(np.arange(0, 81, step=5.0))
ax.grid()

# Save the figure, then display it.
plt.savefig('analysis/Fig4.png')
plt.show()

In [None]:
# Get the sum of the fares for each city type
sum_fares_by_type = pyber_data_df.groupby(['type']).sum()['fare']
sum_fares_by_type

In [None]:
# Get the sum of all the fares
total_fares = pyber_data_df["fare"].sum()
total_fares

In [None]:
type_percents = 100 * sum_fares_by_type/total_fares
type_percents

In [None]:
# Calculate the percentage of fare for each city type in one line instead
type_percents = 100 * pyber_data_df.groupby(['type']).sum()['fare'] / pyber_data_df['fare'].sum()
type_percents

In [None]:
# Build the pie chart of each city type's share of the total fares.
# Change the default font size from 10 to 14 BEFORE any text is created:
# Matplotlib resolves the default size when a text object is built, so setting
# it after plt.pie()/plt.title() would leave this figure at the old size on a
# first run and only take effect when the cell is executed again.
mpl.rcParams['font.size'] = 14
plt.subplots(figsize=(10, 6))
# Labels and colors follow the alphabetical order of the grouped index
# (Rural, Suburban, Urban); the Urban wedge is pulled out for emphasis.
plt.pie(type_percents,
        labels=["Rural", "Suburban", "Urban"],
        colors=["gold", "lightskyblue", "lightcoral"],
        explode=[0, 0, 0.1],
        autopct='%1.1f%%',
        shadow=True, startangle=150)
plt.title("% of Total Fares by City Type")
# Save Figure
plt.savefig("analysis/Fig5.png")
# Show Figure
plt.show()

In [None]:
# Share of all rides that took place in each city type, in percent
# (rides are counted through their non-null ride_id entries).
ride_percents = (
    100 * pyber_data_df.groupby("type")["ride_id"].count() / pyber_data_df["ride_id"].count()
)
ride_percents

In [None]:
# Build the pie chart of each city type's share of the total rides.
# Change the default font size from 10 to 14 BEFORE any text is created:
# Matplotlib resolves the default size when a text object is built, so setting
# it after plt.pie()/plt.title() would not affect this figure.
mpl.rcParams['font.size'] = 14
plt.subplots(figsize=(10, 6))
# Labels and colors follow the alphabetical order of the grouped index
# (Rural, Suburban, Urban); the Urban wedge is pulled out for emphasis.
plt.pie(ride_percents,
        labels=["Rural", "Suburban", "Urban"],
        colors=["gold", "lightskyblue", "lightcoral"],
        explode=[0, 0, 0.1],
        autopct='%1.1f%%',
        shadow=True, startangle=150)
plt.title("% of Total Rides by City Type")
# Save Figure
plt.savefig("analysis/Fig6.png")
# Show Figure
plt.show()

In [None]:
# Calculate the percentage of the drivers for each city type.
# The city-level table is used on purpose: in the merged pyber_data_df every
# ride row repeats its city's driver_count, so summing there would count each
# city's drivers once per ride and skew the shares toward busy cities.
driver_percents = (
    100 * city_data_df.groupby("type")["driver_count"].sum() / city_data_df["driver_count"].sum()
)
driver_percents

In [None]:
# Build the pie chart of each city type's share of the total drivers.
# Change the default font size from 10 to 14 BEFORE any text is created:
# Matplotlib resolves the default size when a text object is built, so setting
# it after plt.pie()/plt.title() would not affect this figure.
mpl.rcParams['font.size'] = 14
plt.subplots(figsize=(10, 6))
# Labels and colors follow the alphabetical order of the grouped index
# (Rural, Suburban, Urban); the Urban wedge is pulled out for emphasis.
plt.pie(driver_percents,
        labels=["Rural", "Suburban", "Urban"],
        colors=["gold", "lightskyblue", "lightcoral"],
        explode=[0, 0, 0.1],
        autopct='%1.1f%%',
        shadow=True, startangle=165)
# The chart shows driver shares, so the title names drivers.
plt.title("% of Total Drivers by City Type")
# Save Figure
plt.savefig("analysis/Fig7.png")
# Show Figure
plt.show()

# Challenge: 

In [None]:
pyber_data_df.head()

In [None]:
# Get the total rides
total_rides = pyber_data_df.groupby(['type'])['ride_id'].count()

In [None]:
total_rides

In [None]:
# Get the total drivers
total_drivers = city_data_df.groupby(['type'])['driver_count'].sum()

In [None]:
total_drivers

In [None]:
# Get the total fares
total_fares = pyber_data_df.groupby(['type'])['fare'].sum()

In [None]:
total_fares

In [None]:
# Get the average fare per ride

avg_fare_per_ride = (total_fares/total_rides)
avg_fare_per_ride

In [None]:
# Get the Average fare per driver
avg_fare_per_driver = total_fares/total_drivers
avg_fare_per_driver

In [None]:
# Assemble the per-city-type summary table: one column per metric, one row per
# city type (the Series are aligned on their shared index).
ch_summary_df = pd.DataFrame(
    data={
        "Total Rides": total_rides,
        "Total Drivers": total_drivers,
        "Total Fares": total_fares,
        "Average Fare per Ride": avg_fare_per_ride,
        "Average Fare per Driver": avg_fare_per_driver,
    }
)

In [None]:
ch_summary_df.index.name=None
ch_summary_df['Total Rides'] = ch_summary_df['Total Rides'].map("{:,}".format) 

In [None]:
ch_summary_df['Total Drivers'] = ch_summary_df['Total Drivers'].map("{:,}".format) 

In [None]:
ch_summary_df['Total Fares'] = ch_summary_df['Total Fares'].map('${:,.2f}'.format) 

In [192]:
ch_summary_df['Average Fare per Ride'] = ch_summary_df['Average Fare per Ride'].map('${:,.2f}'.format) 

In [194]:
ch_summary_df['Average Fare per Driver'] = ch_summary_df['Average Fare per Driver'].map('${:,.2f}'.format) 

In [195]:
ch_summary_df

Unnamed: 0,Total Rides,Total Drivers,Total Fares,Average Fare per Ride,Average Fare per Driver
Rural,125,78,"$4,327.93",$34.62,$55.49
Suburban,625,490,"$19,356.33",$30.97,$39.50
Urban,1625,2405,"$39,854.38",$24.53,$16.57
