In [202]:
from io import StringIO
import os

import boto3
import pandas as pd

pd.set_option("display.max_columns", 50)

In [203]:
# The AWS credentials are read from the environment so that no secret is stored
# in the notebook. Each value is None when its variable is not set.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY")
aws_secret_key = os.environ.get("AWS_SECRET_KEY")

In [204]:
def read_csv_from_s3(bucket: str, path: str, filename: str) -> pd.DataFrame:
  """Download a CSV file from an S3 bucket and parse it into a DataFrame.

  The object key is built by concatenating ``path`` and ``filename`` as they
  are, so ``path`` must already end with ``/`` when it names a folder.

  Parameters
  ----------
  bucket: str
    The bucket where the files are.

  path: str
    The folders leading to the file (the key prefix).

  filename: str
    Name of the file.

  Returns
  -------
  pd.DataFrame
    A DataFrame with the parsed contents of the downloaded file.
  """
  s3 = boto3.client("s3", aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_key)

  key = f"{path}{filename}"

  # Separate, descriptive names for the response and its decoded text; the
  # builtin `object` is no longer shadowed.
  response = s3.get_object(Bucket=bucket, Key=key)
  csv_text = response["Body"].read().decode("utf-8")

  return pd.read_csv(StringIO(csv_text))

In [205]:
# S3 client used by the object-listing calls further down in the notebook.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_key,
)
bucket = "cubix-chicago-taxi-akt"

# Key prefixes of the transformed datasets inside the bucket.
payment_type_path = "transformed_data/payment_type/"
community_areas_path = "transformed_data/community_areas/"
company_path = "transformed_data/company/"
date_path = "transformed_data/date/"
taxi_trips_path = "transformed_data/taxi_trips/"
weather_path = "transformed_data/weather/"

# Lookup tables that are stored as a single file each.
payment_type = read_csv_from_s3(
    bucket=bucket,
    path=payment_type_path,
    filename="payment_type_master.csv",
)
community_areas = read_csv_from_s3(
    bucket=bucket,
    path=community_areas_path,
    filename="community_areas_master.csv",
)
company = read_csv_from_s3(
    bucket=bucket,
    path=company_path,
    filename="company_master.csv",
)
date = read_csv_from_s3(
    bucket=bucket,
    path=date_path,
    filename="date_dimension.csv",
)


In [206]:
payment_type.head()

Unnamed: 0,payment_type_id,payment_type
0,1,Mobile
1,2,Cash
2,3,Credit Card
3,4,Prcard
4,5,No Charge


In [207]:
community_areas.head()

Unnamed: 0,area code,community_name
0,1,Rogers Park
1,2,West Ridge
2,3,Uptown
3,4,Lincoln Square
4,5,North Center


In [208]:
company.head()

Unnamed: 0,company_id,company
0,1,5 Star Taxi
1,2,Sun Taxi
2,3,Taxicab Insurance Agency Llc
3,4,Choice Taxi Association Inc
4,5,Flash Cab


In [209]:
date.head()

Unnamed: 0,date,year,month,day,dayofweek,isweekend
0,2023-01-01,2023,1,1,7,True
1,2023-01-02,2023,1,2,1,False
2,2023-01-03,2023,1,3,2,False
3,2023-01-04,2023,1,4,3,False
4,2023-01-05,2023,1,5,4,False


In [210]:
# Two independent, initially empty lists that collect the per-file DataFrames
# (taxi trips and weather) before they are concatenated.
trips_list, weather_list = [], []

In [211]:
# Start from an empty list so that re-running this cell does not append the
# same files a second time.
trips_list = []

# A single list call returns at most 1000 keys, so page through the whole
# prefix. A page without any matching object has no "Contents" entry at all.
for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=taxi_trips_path):
  for file in page.get("Contents", []):
    taxi_trips_key = file["Key"]
    filename = taxi_trips_key.split("/")[-1]

    # Skip the folder placeholder (empty file name) and everything that is not
    # a csv. endswith() also copes with keys that contain no dot or several dots.
    if filename.strip() == "" or not filename.endswith(".csv"):
      continue

    trip = read_csv_from_s3(bucket, taxi_trips_path, filename)
    trips_list.append(trip)
    print(f"{filename} has been added.")
  

taxi_2024-06-18.csv has been added.
taxi_2024-06-19.csv has been added.


In [212]:
trips = pd.concat(trips_list, ignore_index=True)

In [213]:
trips.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area_id,datetime_for_weather,payment_type_id,company_id
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,8,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,8,2024-06-18 23:00:00,3,5
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,7,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,6,2024-06-18 23:00:00,1,5
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,76,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,5,2024-06-18 23:00:00,3,12
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,32,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,8,2024-06-18 23:00:00,1,3
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,76,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,22,2024-06-18 23:00:00,3,3


In [214]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36093 entries, 0 to 36092
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   trip_id                     36093 non-null  object 
 1   taxi_id                     36093 non-null  object 
 2   trip_start_timestamp        36093 non-null  object 
 3   trip_end_timestamp          36093 non-null  object 
 4   trip_seconds                36093 non-null  int64  
 5   trip_miles                  36093 non-null  float64
 6   pickup_community_area_id    36093 non-null  int64  
 7   fare                        36093 non-null  float64
 8   tips                        36093 non-null  float64
 9   tolls                       36093 non-null  float64
 10  extras                      36093 non-null  float64
 11  trip_total                  36093 non-null  float64
 12  pickup_centroid_latitude    36093 non-null  float64
 13  pickup_centroid_longitude   360

In [215]:
trips.shape

(36093, 20)

In [216]:
# Start from an empty list so that re-running this cell does not append the
# same files a second time.
weather_list = []

# A single list call returns at most 1000 keys, so page through the whole
# prefix. A page without any matching object has no "Contents" entry at all.
for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=weather_path):
    for file in page.get("Contents", []):
        weather_key = file["Key"]
        filename = weather_key.split("/")[-1]

        # Skip the folder placeholder (empty file name) and everything that is
        # not a csv. endswith() also copes with keys that contain no dot or
        # several dots.
        if filename.strip() == "" or not filename.endswith(".csv"):
            continue

        weather_daily = read_csv_from_s3(bucket, weather_path, filename)
        weather_list.append(weather_daily)

        print(f"{filename} has been added.")

weather_2024-06-18.csv has been added.
weather_2024-06-19.csv has been added.


In [217]:
weather = pd.concat(weather_list, ignore_index=True)

In [218]:
weather.head()

Unnamed: 0,datetime,temperature,wind_speed,rain,precipitation
0,2024-06-18 00:00:00,33.0,24.2,0.0,0.0
1,2024-06-18 01:00:00,29.0,22.0,0.0,0.0
2,2024-06-18 02:00:00,28.1,23.8,0.0,0.0
3,2024-06-18 03:00:00,27.2,22.3,0.0,0.0
4,2024-06-18 04:00:00,27.1,19.6,0.0,0.0


In [219]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   datetime       48 non-null     object 
 1   temperature    48 non-null     float64
 2   wind_speed     48 non-null     float64
 3   rain           48 non-null     float64
 4   precipitation  48 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [220]:
weather.shape

(48, 5)

### Join them together

In [251]:
# Attach the weather readings to the trips. This is an inner join, so a trip
# whose datetime_for_weather has no matching weather row is dropped. The
# weather key column is removed afterwards because it repeats
# datetime_for_weather.
trips_full = (
    trips
    .merge(weather, how="inner", left_on="datetime_for_weather", right_on="datetime")
    .drop(columns=["datetime"])
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area_id,datetime_for_weather,payment_type_id,company_id,temperature,wind_speed,rain,precipitation
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,8,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,8,2024-06-18 23:00:00,3,5,31.8,30.6,0.0,0.0
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,7,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,6,2024-06-18 23:00:00,1,5,31.8,30.6,0.0,0.0
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,76,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,5,2024-06-18 23:00:00,3,12,31.8,30.6,0.0,0.0
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,32,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,8,2024-06-18 23:00:00,1,3,31.8,30.6,0.0,0.0
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,76,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,22,2024-06-18 23:00:00,3,3,31.8,30.6,0.0,0.0


In [252]:
# Replace company_id with the company name. Inner join: trips whose company_id
# is not present in the company table are dropped.
trips_full = (
    trips_full
    .merge(company, how="inner", on="company_id")
    .drop(columns=["company_id"])
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area_id,datetime_for_weather,payment_type_id,temperature,wind_speed,rain,precipitation,company
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,8,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,8,2024-06-18 23:00:00,3,31.8,30.6,0.0,0.0,Flash Cab
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,7,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,6,2024-06-18 23:00:00,1,31.8,30.6,0.0,0.0,Flash Cab
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,76,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,5,2024-06-18 23:00:00,3,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,32,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,8,2024-06-18 23:00:00,1,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,76,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,22,2024-06-18 23:00:00,3,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc


In [253]:
# Replace payment_type_id with the payment type name. Inner join: trips whose
# payment_type_id is not present in the payment_type table are dropped.
trips_full = (
    trips_full
    .merge(payment_type, how="inner", on="payment_type_id")
    .drop(columns=["payment_type_id"])
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_community_area_id,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area_id,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,8,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,8,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,7,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,6,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,76,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,5,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,32,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,8,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,76,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,22,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card


In [254]:
# Translate the pickup area id into the area name. Inner join: trips whose
# pickup_community_area_id is not listed in community_areas are dropped.
trips_full = (
    trips_full
    .merge(community_areas, how="inner", left_on="pickup_community_area_id", right_on="area code")
    .drop(columns=["pickup_community_area_id", "area code"])
    # rename() returns a new frame, so the whole step stays one expression
    # without in-place mutation.
    .rename(columns={"community_name": "pickup_community_area_name"})
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area_id,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,8,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card,Near North Side
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,6,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile,Lincoln Park
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,5,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card,O'Hare
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,8,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,Loop
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,22,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,O'Hare


In [255]:
# Translate the dropoff area id into the area name. Inner join: trips whose
# dropoff_community_area_id is not listed in community_areas are dropped.
trips_full = (
    trips_full
    .merge(community_areas, how="inner", left_on="dropoff_community_area_id", right_on="area code")
    .drop(columns=["dropoff_community_area_id", "area code"])
    # rename() returns a new frame, so the whole step stays one expression
    # without in-place mutation.
    .rename(columns={"community_name": "dropoff_community_area_name"})
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card,Near North Side,Near North Side
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile,Lincoln Park,Lake View
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card,O'Hare,North Center
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,Loop,Near North Side
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,O'Hare,Logan Square


In [256]:
# trips_full.info()
# date.info()
# trip_start_timestamp and date both have the object (string) dtype, so they must be converted to datetimes before they can be joined

In [257]:
trips_full.head()

# trip_start_timestamp values look like: 2024-06-18T23:45:00.000
# date values look like:                 2024-06-18

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,342,1.12,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card,Near North Side,Near North Side
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,777,3.03,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile,Lincoln Park,Lake View
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18T23:45:00.000,2024-06-19T00:15:00.000,1570,14.43,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card,O'Hare,North Center
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18T23:45:00.000,2024-06-18T23:45:00.000,599,1.94,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,Loop,Near North Side
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18T23:45:00.000,2024-06-19T00:00:00.000,991,14.28,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,O'Hare,Logan Square


In [258]:
# Parse the date strings into datetime64 values so the column can be used as a
# join key against a datetime column.
date = date.assign(date=pd.to_datetime(date["date"]))

In [259]:
date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       1827 non-null   datetime64[ns]
 1   year       1827 non-null   int64         
 2   month      1827 non-null   int64         
 3   day        1827 non-null   int64         
 4   dayofweek  1827 non-null   int64         
 5   isweekend  1827 non-null   bool          
dtypes: bool(1), datetime64[ns](1), int64(4)
memory usage: 73.3 KB


In [264]:
# Parse the start timestamps, then derive the calendar day of each trip.
# normalize() sets the time part to midnight and keeps the datetime64 dtype, so
# no detour through Python date objects and a second to_datetime() is needed.
trips_full["trip_start_timestamp"] = pd.to_datetime(trips_full["trip_start_timestamp"])
trips_full["trip_start_date"] = trips_full["trip_start_timestamp"].dt.normalize()


In [265]:
trips_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36083 entries, 0 to 36082
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   trip_id                      36083 non-null  object        
 1   taxi_id                      36083 non-null  object        
 2   trip_start_timestamp         36083 non-null  datetime64[ns]
 3   trip_end_timestamp           36083 non-null  object        
 4   trip_seconds                 36083 non-null  int64         
 5   trip_miles                   36083 non-null  float64       
 6   fare                         36083 non-null  float64       
 7   tips                         36083 non-null  float64       
 8   tolls                        36083 non-null  float64       
 9   extras                       36083 non-null  float64       
 10  trip_total                   36083 non-null  float64       
 11  pickup_centroid_latitude     36083 non-nu

In [266]:
trips_full['trip_start_date'].dtypes

dtype('<M8[ns]')

In [267]:
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name,trip_start_date
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18 23:45:00,2024-06-18T23:45:00.000,342,1.12,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card,Near North Side,Near North Side,2024-06-18
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18 23:45:00,2024-06-19T00:00:00.000,777,3.03,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile,Lincoln Park,Lake View,2024-06-18
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18 23:45:00,2024-06-19T00:15:00.000,1570,14.43,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card,O'Hare,North Center,2024-06-18
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18 23:45:00,2024-06-18T23:45:00.000,599,1.94,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,Loop,Near North Side,2024-06-18
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18 23:45:00,2024-06-19T00:00:00.000,991,14.28,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,O'Hare,Logan Square,2024-06-18


In [268]:
# Add the calendar attributes (year, month, day, dayofweek, isweekend) of the
# trip start day. Inner join: trips whose trip_start_date is not in the date
# table are dropped. The date key is removed because it repeats trip_start_date.
trips_full = (
    trips_full
    .merge(date, how="inner", left_on="trip_start_date", right_on="date")
    .drop(columns=["date"])
)
trips_full.head()

Unnamed: 0,trip_id,taxi_id,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,fare,tips,tolls,extras,trip_total,pickup_centroid_latitude,pickup_centroid_longitude,dropoff_centroid_latitude,dropoff_centroid_longitude,datetime_for_weather,temperature,wind_speed,rain,precipitation,company,payment_type,pickup_community_area_name,dropoff_community_area_name,trip_start_date,year,month,day,dayofweek,isweekend
0,498213267d9d8a5b9c55149a1eb8531bd8e8336b,10e8a565ac8829bd9caa44f57e9831fd8b4ff374e4d666...,2024-06-18 23:45:00,2024-06-18T23:45:00.000,342,1.12,6.25,1.0,0.0,0.0,7.75,41.899602,-87.633308,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Credit Card,Near North Side,Near North Side,2024-06-18,2024,6,18,2,False
1,48dbfe8f9c735bbfdce7e35028b45876b9f48469,da1a4de231c9fa53f23463812544aa4af3c4de1cfc4add...,2024-06-18 23:45:00,2024-06-19T00:00:00.000,777,3.03,11.46,3.0,0.0,0.0,14.46,41.922686,-87.649489,41.944227,-87.655998,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Flash Cab,Mobile,Lincoln Park,Lake View,2024-06-18,2024,6,18,2,False
2,439725470678efb2014c7f93d434d37930c64fe8,8e61957eda2e69d68b0bce4add726e0a84827e3f1ec9c9...,2024-06-18 23:45:00,2024-06-19T00:15:00.000,1570,14.43,36.5,6.15,0.0,4.0,47.15,41.980264,-87.913625,41.947792,-87.683835,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Blue Ribbon Taxi Association,Credit Card,O'Hare,North Center,2024-06-18,2024,6,18,2,False
3,4172db2de157874279cc72156d98ab811a94afd6,cfaf3ec1344ffc1a207c1cbe3b1a89ab060081a60450d1...,2024-06-18 23:45:00,2024-06-18T23:45:00.000,599,1.94,9.0,2.34,0.0,0.0,11.34,41.878866,-87.625192,41.899602,-87.633308,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Mobile,Loop,Near North Side,2024-06-18,2024,6,18,2,False
4,3fc0b7810f5a097fc79413e0733e704069a33b82,aee6a1f2577e5f670e300733aca9b27ad8fecd22425d14...,2024-06-18 23:45:00,2024-06-19T00:00:00.000,991,14.28,49.0,6.0,0.0,0.0,55.5,41.980264,-87.913625,41.922761,-87.699155,2024-06-18 23:00:00,31.8,30.6,0.0,0.0,Taxicab Insurance Agency Llc,Credit Card,O'Hare,Logan Square,2024-06-18,2024,6,18,2,False
