In [None]:
# Add Matplotlib inline magic command
%matplotlib inline

# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import pathlib
import numpy as np
import scipy.stats as sts

# Apply Matplotlib's built-in 'default' style sheet to the figures created below
plt.style.use('default')

## Import Data Files and Create DataFrames

In [None]:
# Locations of the two CSV inputs (relative paths inside the "resources" folder)
city_data_to_load = pathlib.Path("resources") / "city_data.csv"
ride_data_to_load = pathlib.Path("resources") / "ride_data.csv"

In [None]:
# Load the city and ride CSV files into pandas DataFrames
city_df = pd.read_csv(filepath_or_buffer=city_data_to_load)
ride_df = pd.read_csv(filepath_or_buffer=ride_data_to_load)

# Preview the first ten rows of each DataFrame (city data first, then ride data)
print(city_df.head(n=10), ride_df.head(n=10), sep="\n")

<br>
<hr/>

## Inspect and Clean the Data (if needed)
### Inspect City DataFrame

In [None]:
# Count the non-null entries in each column of the city data
city_df.count()
# Alternative check: city_df.isnull().sum() gives the number of missing values per column

In [None]:
# Show the data type of each column in the city data
city_df.dtypes

In [None]:
# List the distinct values found in the "type" column (the city types)
city_df["type"].unique()

In [None]:
# Count the non-null "city" entries within each city type
(
    city_df
    .groupby(by="type")["city"]
    .count()
)

### Inspect Ride DataFrame

In [None]:
# Count the non-null entries in each column of the ride data
ride_df.count()
# Alternative check: ride_df.isnull().sum() gives the number of missing values per column

In [None]:
# Show the data type of each column in the ride data
ride_df.dtypes

## Merge the DataFrames

In [None]:
# Combine the data into a single dataset: left-join the city data onto the ride
# data so that every ride row is kept and gains the columns of its city.
# The frames are joined on the single shared key column "city", named once.
pyber_df = pd.merge(ride_df, city_df, how="left", on="city")
pyber_df.head()

<br>
<hr/>

## Bubble Chart: Average Fare vs Number of Rides for Each City by City Type
### Create DataFrames

In [None]:
# Split the merged data into one DataFrame per city type
urban_cities_df = pyber_df.loc[pyber_df["type"].eq("Urban")]
suburban_cities_df = pyber_df.loc[pyber_df["type"].eq("Suburban")]
rural_cities_df = pyber_df.loc[pyber_df["type"].eq("Rural")]

# Preview the urban subset
urban_cities_df.head()

### Ride Counts for Each City

In [None]:
# Number of rides (non-null ride_id values) per city, one Series per city type
urban_ride_count = (
    urban_cities_df
    .groupby(by="city")["ride_id"]
    .count()
)
suburban_ride_count = (
    suburban_cities_df
    .groupby(by="city")["ride_id"]
    .count()
)
rural_ride_count = (
    rural_cities_df
    .groupby(by="city")["ride_id"]
    .count()
)

# Preview the urban result
urban_ride_count.head()

### Average Fare for Each City

In [None]:
# Mean fare per city, one Series per city type
urban_avg_fare = (
    urban_cities_df
    .groupby(by="city")["fare"]
    .mean()
)
suburban_avg_fare = (
    suburban_cities_df
    .groupby(by="city")["fare"]
    .mean()
)
rural_avg_fare = (
    rural_cities_df
    .groupby(by="city")["fare"]
    .mean()
)

# Preview the urban result
urban_avg_fare.head()

### Average Number of Drivers for Each City

In [None]:
# Mean of the driver_count column per city, one Series per city type
urban_driver_count = (
    urban_cities_df
    .groupby(by="city")["driver_count"]
    .mean()
)
suburban_driver_count = (
    suburban_cities_df
    .groupby(by="city")["driver_count"]
    .mean()
)
rural_driver_count = (
    rural_cities_df
    .groupby(by="city")["driver_count"]
    .mean()
)

# Preview the urban result
urban_driver_count.head()

### Bubble Chart

In [None]:
# Build the bubble chart: one bubble per city, with
#   x = number of rides in the city,
#   y = average fare in the city,
#   bubble area = 10 x the city's driver count,
# coloured by city type.
fig, ax = plt.subplots(figsize=(10, 6))

# Plot urban cities
ax.scatter(x=urban_ride_count,
           y=urban_avg_fare,
           s=10*urban_driver_count,
           c="coral",
           edgecolor="black",
           linewidths=1,
           alpha=0.8,
           label="Urban")

# Plot suburban cities
ax.scatter(x=suburban_ride_count,
           y=suburban_avg_fare,
           s=10*suburban_driver_count,
           c="skyblue",
           edgecolor="black",
           linewidths=1,
           alpha=0.8,
           label="Suburban")

# Plot rural cities
ax.scatter(x=rural_ride_count,
           y=rural_avg_fare,
           s=10*rural_driver_count,
           c="gold",
           edgecolor="black",
           linewidths=1,
           alpha=0.8,
           label="Rural")

# Add graph properties
ax.set_title(label="PyBer Ride-Sharing Data (2019)", fontsize=20)
ax.set_xlabel(xlabel="Total Number of Rides (Per City)", fontsize=12)
ax.set_ylabel(ylabel="Average Fare ($)", fontsize=12)
ax.grid(True)

# Create a legend
lgnd = ax.legend(fontsize=12,
                 scatterpoints=1,
                 loc="best",
                 title="City Types")

# Give every legend marker the same fixed size instead of a data-driven bubble size.
# Legend.legendHandles was renamed to Legend.legend_handles in Matplotlib 3.7 and
# later removed, so prefer the new attribute and fall back to the old one.
legend_markers = getattr(lgnd, "legend_handles", None)
if legend_markers is None:
    legend_markers = lgnd.legendHandles
for legend_marker in legend_markers:
    # Public setter rather than assigning the private `_sizes` attribute
    legend_marker.set_sizes([75])
lgnd.get_title().set_fontsize(12)

# Incorporate a text label about circle size (position is given in data coordinates)
ax.text(41.5, 35, "Note:\nCircle size correlates\nwith driver count per\ncity.", fontsize=12)

# Save the figure; create the output folder first so a fresh checkout does not
# fail with FileNotFoundError
pathlib.Path("analysis").mkdir(parents=True, exist_ok=True)
fig.savefig("analysis/pyber_bubble_chart.png", bbox_inches="tight")
fig.savefig("analysis/pyber_bubble_chart.svg", bbox_inches="tight")
plt.show()

<br>
<hr/>

## Calculate Summary Statistics

### City Type DataFrames: describe()

In [None]:
# Summary statistics for the urban rides DataFrame
urban_cities_df.describe()

In [None]:
# Summary statistics for the suburban rides DataFrame
suburban_cities_df.describe()

In [None]:
# Summary statistics for the rural rides DataFrame
rural_cities_df.describe()

<hr/>

### Ride Count Series: describe()

In [None]:
# Summary statistics for the per-city ride counts of urban cities
urban_ride_count.describe()

In [None]:
# Summary statistics for the per-city ride counts of suburban cities
suburban_ride_count.describe()

In [None]:
# Summary statistics for the per-city ride counts of rural cities
rural_ride_count.describe()

<hr/>

### Ride Count Series: mean(), median(), mode()

In [None]:
# Mean ride count per city for (urban, suburban, rural), rounded to two decimals
tuple(
    round(ride_counts.mean(), 2)
    for ride_counts in (urban_ride_count, suburban_ride_count, rural_ride_count)
)

In [None]:
# Median ride count per city for (urban, suburban, rural), rounded to two decimals
tuple(
    round(ride_counts.median(), 2)
    for ride_counts in (urban_ride_count, suburban_ride_count, rural_ride_count)
)

In [None]:
# Mode(s) of the ride count per city for (urban, suburban, rural); each entry is a Series
tuple(
    ride_counts.mode()
    for ride_counts in (urban_ride_count, suburban_ride_count, rural_ride_count)
)

<hr/>

### Ride Count Series: np.mean(), np.median(), sts.mode()
#### Urban Cities

In [None]:
# Central tendency of the per-city ride counts for urban cities (NumPy / SciPy)
mean_urban_ride_count = np.mean(urban_ride_count)
median_urban_ride_count = np.median(urban_ride_count)
mode_urban_ride_count = sts.mode(urban_ride_count)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean for the ride counts for urban trips is {mean_urban_ride_count:.2f}.")
print(f"The median for the ride counts for urban trips is {median_urban_ride_count}.")
print(f"The mode for the ride counts for urban trips is {mode_urban_ride_count}.")

#### Suburban Cities

In [None]:
# Central tendency of the per-city ride counts for suburban cities (NumPy / SciPy)
mean_suburban_ride_count = np.mean(suburban_ride_count)
median_suburban_ride_count = np.median(suburban_ride_count)
mode_suburban_ride_count = sts.mode(suburban_ride_count)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean for the ride counts for suburban trips is {mean_suburban_ride_count:.2f}.")
print(f"The median for the ride counts for suburban trips is {median_suburban_ride_count}.")
print(f"The mode for the ride counts for suburban trips is {mode_suburban_ride_count}.")

#### Rural Cities

In [None]:
# Central tendency of the per-city ride counts for rural cities (NumPy / SciPy)
mean_rural_ride_count = np.mean(rural_ride_count)
median_rural_ride_count = np.median(rural_ride_count)
mode_rural_ride_count = sts.mode(rural_ride_count)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean for the ride counts for rural trips is {mean_rural_ride_count:.2f}.")
print(f"The median for the ride counts for rural trips is {median_rural_ride_count}.")
print(f"The mode for the ride counts for rural trips is {mode_rural_ride_count}.")

<hr/>

### Fares: np.mean(), np.median(), sts.mode()
#### Urban Cities

In [None]:
# Fare column of the urban rides (one value per row of urban_cities_df)
urban_fares = urban_cities_df["fare"]

# Central tendency of those fares (NumPy / SciPy)
mean_urban_fares = np.mean(urban_fares)
median_urban_fares = np.median(urban_fares)
mode_urban_fares = sts.mode(urban_fares)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean fare price for urban trips is ${mean_urban_fares:.2f}.")
print(f"The median fare price for urban trips is ${median_urban_fares:.2f}.")
print(f"The mode fare price for urban trips is {mode_urban_fares}.")

#### Suburban Cities

In [None]:
# Fare column of the suburban rides (one value per row of suburban_cities_df)
suburban_fares = suburban_cities_df["fare"]

# Central tendency of those fares (NumPy / SciPy)
mean_suburban_fares = np.mean(suburban_fares)
median_suburban_fares = np.median(suburban_fares)
mode_suburban_fares = sts.mode(suburban_fares)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean fare price for suburban trips is ${mean_suburban_fares:.2f}.")
print(f"The median fare price for suburban trips is ${median_suburban_fares:.2f}.")
print(f"The mode fare price for suburban trips is {mode_suburban_fares}.")

#### Rural Cities

In [None]:
# Fare column of the rural rides (one value per row of rural_cities_df)
rural_fares = rural_cities_df["fare"]

# Central tendency of those fares (NumPy / SciPy)
mean_rural_fares = np.mean(rural_fares)
median_rural_fares = np.median(rural_fares)
mode_rural_fares = sts.mode(rural_fares)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean fare price for rural trips is ${mean_rural_fares:.2f}.")
print(f"The median fare price for rural trips is ${median_rural_fares:.2f}.")
print(f"The mode fare price for rural trips is {mode_rural_fares}.")

<hr/>

### Driver Count: np.mean(), np.median(), sts.mode()
#### Urban Cities

In [None]:
# driver_count column of the urban rides (one value per row of urban_cities_df)
urban_drivers = urban_cities_df["driver_count"]

# Central tendency of those driver counts (NumPy / SciPy)
mean_urban_drivers = np.mean(urban_drivers)
median_urban_drivers = np.median(urban_drivers)
mode_urban_drivers = sts.mode(urban_drivers)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean driver count for urban trips is {mean_urban_drivers:.2f}.")
print(f"The median driver count for urban trips is {median_urban_drivers:.1f}.")
print(f"The mode driver count for urban trips is {mode_urban_drivers}.")

#### Suburban Cities

In [None]:
# driver_count column of the suburban rides (one value per row of suburban_cities_df)
suburban_drivers = suburban_cities_df["driver_count"]

# Central tendency of those driver counts (NumPy / SciPy)
mean_suburban_drivers = np.mean(suburban_drivers)
median_suburban_drivers = np.median(suburban_drivers)
mode_suburban_drivers = sts.mode(suburban_drivers)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean driver count for suburban trips is {mean_suburban_drivers:.2f}.")
print(f"The median driver count for suburban trips is {median_suburban_drivers:.1f}.")
print(f"The mode driver count for suburban trips is {mode_suburban_drivers}.")

#### Rural Cities

In [None]:
# driver_count column of the rural rides (one value per row of rural_cities_df)
rural_drivers = rural_cities_df["driver_count"]

# Central tendency of those driver counts (NumPy / SciPy)
mean_rural_drivers = np.mean(rural_drivers)
median_rural_drivers = np.median(rural_drivers)
mode_rural_drivers = sts.mode(rural_drivers)

# Report the results; the mode line shows the result object returned by sts.mode
print(f"The mean driver count for rural trips is {mean_rural_drivers:.2f}.")
print(f"The median driver count for rural trips is {median_rural_drivers:.1f}.")
print(f"The mode driver count for rural trips is {mode_rural_drivers}.")

<hr/>

### Box-and-Whisker Plots
#### Ride Count

In [None]:
# Add all ride count box-and-whisker plots to the same graph
# (one box per city type, built from the per-city ride counts)
x_labels = ["Urban", "Suburban", "Rural"]
ride_count_data = [urban_ride_count, suburban_ride_count, rural_ride_count]

# Labels
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Ride Count Data (2019)', fontsize=20)
ax.set_ylabel('Number of Rides', fontsize=14)
ax.set_xlabel("City Types", fontsize=14)

# Make the 3 box-and-whisker plots on one axes. The x tick labels are applied
# afterwards rather than through boxplot's `labels` keyword, which is deprecated
# since Matplotlib 3.9 (renamed to `tick_labels`); this form works on old and
# new Matplotlib versions alike.
ax.boxplot(ride_count_data)
ax.set_xticklabels(x_labels)

ax.set_yticks(np.arange(0, 45, step=3.0))
ax.grid()

# Save the figure; create the output folder first so a fresh checkout does not
# fail with FileNotFoundError
pathlib.Path("analysis").mkdir(parents=True, exist_ok=True)
fig.savefig("analysis/pyber_ride_count_box-and-whisker_plot.png", bbox_inches="tight")
fig.savefig("analysis/pyber_ride_count_box-and-whisker_plot.svg", bbox_inches="tight")
plt.show()

In [None]:
# Get the urban city with the most rides (the outlier seen in the box-and-whisker plot).
# idxmax() returns the index label (the city name) of the largest ride count, so the
# lookup keeps working if the data changes; matching a hard-coded count would raise
# IndexError as soon as no city has exactly that many rides.
urban_city_outlier = urban_ride_count.idxmax()
print(f"{urban_city_outlier} has the highest rider count.")

#### Ride Fare

In [None]:
# Add all ride fare box-and-whisker plots to the same graph
# (one box per city type, built from the individual ride fares)
x_labels = ["Urban", "Suburban", "Rural"]
ride_fare_data = [urban_fares, suburban_fares, rural_fares]

# Labels
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Ride Fare Data (2019)', fontsize=20)
ax.set_ylabel('Fare ($USD)', fontsize=14)
ax.set_xlabel("City Types", fontsize=14)

# Make the 3 box-and-whisker plots on one axes. The x tick labels are applied
# afterwards rather than through boxplot's `labels` keyword, which is deprecated
# since Matplotlib 3.9 (renamed to `tick_labels`); this form works on old and
# new Matplotlib versions alike.
ax.boxplot(ride_fare_data)
ax.set_xticklabels(x_labels)

ax.set_yticks(np.arange(0, 61, step=5.0))
ax.grid()

# Save the figure; create the output folder first so a fresh checkout does not
# fail with FileNotFoundError
pathlib.Path("analysis").mkdir(parents=True, exist_ok=True)
fig.savefig("analysis/pyber_ride_fare_box-and-whisker_plot.png", bbox_inches="tight")
fig.savefig("analysis/pyber_ride_fare_box-and-whisker_plot.svg", bbox_inches="tight")
plt.show()

#### Driver Count

In [None]:
# Add all driver count box-and-whisker plots to the same graph
# (one box per city type, built from the per-city driver counts)
x_labels = ["Urban", "Suburban", "Rural"]
driver_count_data = [urban_driver_count, suburban_driver_count, rural_driver_count]

# Labels
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Driver Count Data (2019)', fontsize=20)
ax.set_ylabel('Number of Drivers', fontsize=14)
ax.set_xlabel("City Types", fontsize=14)

# Make the 3 box-and-whisker plots on one axes. The x tick labels are applied
# afterwards rather than through boxplot's `labels` keyword, which is deprecated
# since Matplotlib 3.9 (renamed to `tick_labels`); this form works on old and
# new Matplotlib versions alike.
ax.boxplot(driver_count_data)
ax.set_xticklabels(x_labels)

ax.set_yticks(np.arange(0, 76, step=5.0))
ax.grid()

# Save the figure; create the output folder first so a fresh checkout does not
# fail with FileNotFoundError
pathlib.Path("analysis").mkdir(parents=True, exist_ok=True)
fig.savefig("analysis/pyber_driver_count_box-and-whisker_plot.png", bbox_inches="tight")
fig.savefig("analysis/pyber_driver_count_box-and-whisker_plot.svg", bbox_inches="tight")
plt.show()