# 5.2.3 Load and Read the CSV files

In [1]:
# Add Matplotlib inline magic command
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Relative paths to the ride data and city data CSV files
ride_data_to_load = "Resources/ride_data.csv"
city_data_to_load = "Resources/city_data.csv"

In [3]:
# Load the city data CSV into a pandas DataFrame and preview its first five rows.
city_data_df = pd.read_csv(city_data_to_load)
city_data_df.head()

# NOTE: the path variables are optional -- each file can also be loaded and read in one step:
# city_data_df = pd.read_csv("Resources/city_data.csv")
# ride_data_df = pd.read_csv("Resources/ride_data.csv")

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban
3,Rodneyfort,34,Urban
4,West Robert,39,Urban


In [4]:
# Load the ride data CSV into a pandas DataFrame and preview its first five rows.
ride_data_df = pd.read_csv(ride_data_to_load)
ride_data_df.head()

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344


# 5.2.4 Explore the Data in Pandas

### Inspect the City Data DataFrame
1. Get all the rows that contain null values.
2. Make sure the driver_count column has an integer data type.
3. Find out how many data points there are for each type of city.


In [14]:
# Preview the first three rows of the city data before inspecting it.
city_data_df.head(3)

Unnamed: 0,city,driver_count,type
0,Richardfort,38,Urban
1,Williamsstad,59,Urban
2,Port Angela,67,Urban


In [5]:
# Count the non-null values in each column of the city data.

city_data_df.count()

city            120
driver_count    120
type            120
dtype: int64

In [6]:
# Count the null values in each column of the city data.
city_data_df.isna().sum()

city            0
driver_count    0
type            0
dtype: int64

In [7]:
# Get the data type of each column.
# driver_count must be a numerical type because we plan to perform
# mathematical calculations on that column.

city_data_df.dtypes

city            object
driver_count     int64
type            object
dtype: object

In [8]:
# Get the unique values of the city type column.
city_data_df["type"].unique()

array(['Urban', 'Suburban', 'Rural'], dtype=object)

In [9]:
# Get the number of data points for each type of city.
# Summing the boolean mask with the vectorized Series.sum() counts the matching
# rows; the counts are printed in the order Urban, Rural, Suburban.
for city_type in ("Urban", "Rural", "Suburban"):
    print((city_data_df["type"] == city_type).sum())

66
18
36


### Inspect the Ride Data DataFrame
1. Get all the rows that contain null values.
2. Make sure the fare and ride_id columns are numerical data types.

In [10]:
# Preview the first three rows of the ride data before inspecting it.
ride_data_df.head(3)

Unnamed: 0,city,date,fare,ride_id
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003


In [28]:
# Count the non-null values in each column of the ride data.
ride_data_df.count()

city       2375
date       2375
fare       2375
ride_id    2375
dtype: int64

In [12]:
# Count the null values in each column of the ride data.
ride_data_df.isna().sum()

city       0
date       0
fare       0
ride_id    0
dtype: int64

In [13]:
# Get the data type of each column; fare and ride_id should be numerical.
ride_data_df.dtypes

city        object
date        object
fare       float64
ride_id      int64
dtype: object

### Merge DataFrames

First review the columns of each:
    
    city_data_df:
            -city
            -driver_count
            -type
            
    ride_data_df:
            -city
            -date
            -fare
            -ride_id

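Then merge on a column that holds the same data in both DataFrames. When the column has the same name in both, pass that name once:

new_df = pd.merge(left_df, right_df, on="shared_column")

When the column names differ, use left_on and right_on instead:

new_df = pd.merge(left_df, right_df, left_on="column_left_df", right_on="column_right_df")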

Watch this YouTube video if you need a review:
https://www.youtube.com/watch?v=h4hOPGo4UVU


In [38]:
# Combine the ride and city data into a single dataset.
# Both DataFrames share the "city" column, so it is passed to `on` once; the
# left join keeps every row of ride_data_df and adds that city's columns.
pyber_data_df = pd.merge(ride_data_df, city_data_df, how="left", on="city")

# Display the first five rows of the merged DataFrame
pyber_data_df.head()

Unnamed: 0,city,date,fare,ride_id,driver_count,type
0,Lake Jonathanshire,2019-01-14 10:14:22,13.83,5739410935873,5,Urban
1,South Michelleport,2019-03-04 18:24:09,30.24,2343912425577,72,Urban
2,Port Samanthamouth,2019-02-24 04:29:00,33.44,2005065760003,57,Urban
3,Rodneyfort,2019-02-10 23:22:03,23.44,5149245426178,34,Urban
4,South Jack,2019-03-06 04:28:35,34.58,3908451377344,46,Urban
