# Sampled Dataset exploration, meta-data collection

In [1]:
# Imports go here
import os
import glob
import pandas as pd

**Depending on the mode that you wish to use, you need to start Spark differently**

We first show how to run it in local mode, then how to run it in cluster mode (yarn). Therefore, **only run 1 of the two next cells**

### Convenience functions

What follows are some functions that extract essential meta-data from each file in the dataset, such as:
- the year of the trips data in the file
- the month
- the file size
- the number of records
- the (relational) schema of the file

In [2]:
def get_schema(filename):
    '''Read the schema (column names) from the header line of a data file.

    The first line of the file is assumed to be a comma-separated header.
    Each column name is stripped of surrounding double quotes and spaces
    and converted to lower case.

    Returns a tuple with one normalized column name per header field.
    '''
    with open(filename, 'r') as handle:
        header = handle.readline().strip()

    columns = []
    for raw_name in header.split(','):
        columns.append(raw_name.strip('" ').lower())
    return tuple(columns)
      
def get_month(filename):
    '''Return the month (as an int) that a TLC data file reports on.

       Relies on the TLC file naming convention
       $(fileSource)_tripdata_$(year)-$(month).csv
       in which the two characters just before the ".csv" suffix are the month.
    '''
    month_digits = filename[-6:-4]
    return int(month_digits)

def get_year(filename):
    '''Return the year (as an int) that a TLC data file reports on.

       Relies on the TLC file naming convention
       $(fileSource)_tripdata_$(year)-$(month).csv
       in which the year is the four characters preceding "-$(month).csv".
    '''
    year_digits = filename[-11:-7]
    return int(year_digits)

def get_type(filename):
    '''Return the type of trip that a TLC data file reports on (e.g. yellow, green, fhv, fhvhv).

       Relies on the TLC file naming convention
       $(fileSource)_tripdata_$(year)-$(month).csv
       The type is the part of the file's base name before the first underscore.
    '''
    # Drop the folder information first,
    # e.g. "/home/stijn/foo.txt" becomes "foo.txt"
    basename = os.path.basename(filename)
    # Everything before the first "_" is the trip type; without any "_"
    # the whole base name is returned.
    transport_class, _, _ = basename.partition('_')
    return transport_class

def get_numrecords(filename):
    '''Returns the number of records in a TLC file.

       Equals the number of lines in the file minus one
       (the header, which is the schema, not a record).
       A completely empty file (no header at all) has 0 records;
       the count is never negative.
    '''
    with open(filename) as f:
        # Iterate over the file lazily so that large files are never
        # loaded into memory as a whole.
        line_count = sum(1 for _ in f)
    # Subtract the header line, but do not report -1 for an empty file.
    return max(line_count - 1, 0)
    
def get_metadata(filename):
    '''Collect all metadata of the `filename` datafile in a single tuple.

    The tuple holds, in this order: the filename itself, the trip type,
    the year, the month, the file size as reported by os.path.getsize,
    the number of records, and the schema.
    '''
    trip_type = get_type(filename)
    year = get_year(filename)
    month = get_month(filename)
    file_size = os.path.getsize(filename)
    num_records = get_numrecords(filename)
    schema = get_schema(filename)
    return (filename, trip_type, year, month, file_size, num_records, schema)
    

In [6]:
# Collect the paths of all CSV files of the sampled data set, in sorted order.
# This cell only lists the directory; the file contents are not read here.
# You need to adjust the path of the dataset below to match your own setup.
files = glob.glob("/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/*.csv")
files.sort()

In [9]:
# Show the sorted list of dataset file paths
files

['/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-01.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-02.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-03.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-04.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-05.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-06.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-07.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-08.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-09.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-10.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-11.csv',
 '/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/

In [7]:
# Build one metadata tuple per dataset file with the get_metadata function introduced above.
# NOTE: every file is read to extract its metadata (14 GB of data in total), so be patient ;-)
metadata = list(map(get_metadata, files))

In [8]:
# Show the metadata tuples computed for the dataset files
metadata

[('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-01.csv',
  'fhv',
  2015,
  1,
  4126514,
  136556,
  ('dispatching_base_num', 'pickup_date', 'locationid')),
 ('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-02.csv',
  'fhv',
  2015,
  2,
  4712489,
  155514,
  ('dispatching_base_num', 'pickup_date', 'locationid')),
 ('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-03.csv',
  'fhv',
  2015,
  3,
  4922012,
  163232,
  ('dispatching_base_num', 'pickup_date', 'locationid')),
 ('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-04.csv',
  'fhv',
  2015,
  4,
  5845469,
  195182,
  ('dispatching_base_num', 'pickup_date', 'locationid')),
 ('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-05.csv',
  'fhv',
  2015,
  5,
  6434970,
  214016,
  ('dispatching_base_num', 'pickup_date', 'locationid')),
 ('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/

In [9]:
# Put the metadata in a Pandas dataframe, one row per file.
# The labels follow the order of the fields in the metadata tuples.
metadata_labels = [
    'filename',
    'type',
    'year',
    'month',
    'size',
    'num_records',
    'schema',
]
df = pd.DataFrame.from_records(metadata, columns=metadata_labels)
df

Unnamed: 0,filename,type,year,month,size,num_records,schema
0,/home/marlene/Documents/ULB/CFDS/Final_Project...,fhv,2015,1,4126514,136556,"(dispatching_base_num, pickup_date, locationid)"
1,/home/marlene/Documents/ULB/CFDS/Final_Project...,fhv,2015,2,4712489,155514,"(dispatching_base_num, pickup_date, locationid)"
2,/home/marlene/Documents/ULB/CFDS/Final_Project...,fhv,2015,3,4922012,163232,"(dispatching_base_num, pickup_date, locationid)"
3,/home/marlene/Documents/ULB/CFDS/Final_Project...,fhv,2015,4,5845469,195182,"(dispatching_base_num, pickup_date, locationid)"
4,/home/marlene/Documents/ULB/CFDS/Final_Project...,fhv,2015,5,6434970,214016,"(dispatching_base_num, pickup_date, locationid)"
...,...,...,...,...,...,...,...
256,/home/marlene/Documents/ULB/CFDS/Final_Project...,yellow,2019,2,32109751,350590,"(vendorid, tpep_pickup_datetime, tpep_dropoff_..."
257,/home/marlene/Documents/ULB/CFDS/Final_Project...,yellow,2019,3,35856704,390942,"(vendorid, tpep_pickup_datetime, tpep_dropoff_..."
258,/home/marlene/Documents/ULB/CFDS/Final_Project...,yellow,2019,4,34046518,371197,"(vendorid, tpep_pickup_datetime, tpep_dropoff_..."
259,/home/marlene/Documents/ULB/CFDS/Final_Project...,yellow,2019,5,34643823,377704,"(vendorid, tpep_pickup_datetime, tpep_dropoff_..."


In [10]:
# Save the dataframe for future use (e.g., in later stages)
# NOTE(review): the output path is absolute and machine-specific; adjust it to your own setup.
df.to_csv('/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/dataset-description.csv')

In [14]:
# You can use the `describe` function of a Pandas Series to get the basic statistics.
# Here: the statistics of the file sizes (the values returned by os.path.getsize).
df['size'].describe()

count    2.610000e+02
mean     5.699806e+07
std      5.040927e+07
min      5.787400e+04
25%      1.033738e+07
50%      4.048584e+07
75%      1.111178e+08
max      1.489577e+08
Name: size, dtype: float64

In [15]:
# Calling `describe` on the whole DataFrame gives the basic statistics
# of all numeric columns (year, month, size, num_records) at once
df.describe()

Unnamed: 0,year,month,size,num_records
count,261.0,261.0,261.0,261.0
mean,2015.206897,6.360153,56998060.0,468590.6
std,2.818092,3.441906,50409270.0,328408.0
min,2009.0,1.0,57874.0,390.0
25%,2014.0,3.0,10337380.0,78219.0
50%,2016.0,6.0,40485840.0,514745.0
75%,2017.0,9.0,111117800.0,719293.0
max,2019.0,12.0,148957700.0,1192596.0


In [16]:
# Basic statistics of the number of records per file
df['num_records'].describe()

count    2.610000e+02
mean     4.685906e+05
std      3.284080e+05
min      3.900000e+02
25%      7.821900e+04
50%      5.147450e+05
75%      7.192930e+05
max      1.192596e+06
Name: num_records, dtype: float64

Take the time to inspect the numbers you got above, so that you get an impression of the order of magnitude of files in the dataset. Write this down in your own words

**Conclusion:**

## Analysis of the schema evolution.

Over time, the relational schema associated with each type of trip data (yellow, green, fhv, fhvhv) has changed. Let us analyze the changes.

In [11]:
# Code to help analyze the schema changes goes here

# NOTE: What follows is *a* possible way of doing this. You may have a different way.
# Solution approach:
#   Given a pandas dataframe that contains all the files + their metadata of a single sub-dataset
#   let us create a new dataframe that adds two dataframe columns containing, respectively:
#      (1) the schema columns that were removed since the previous file; and 
#      (2) the schema columns added since the previous file

# You need to complete the  analyze_schema_changes function below !

def diff_schema(schema1, schema2):
    ''' Return, as a tuple, the elements of schema1 that do not occur in schema2.

        The order of schema1 is preserved, and an element that occurs several
        times in schema1 is reported every time it occurs.

        Example: if  schema1= ("a", "b", "c") and schema2 = ("b", "d", "e") the result = ("a", "c")
    '''
    known = list(schema2)  # materialize schema2 once for the membership tests
    return tuple(attr for attr in schema1 if attr not in known)
    

def analyze_schema_changes(dataset):
    '''Analyze schema changes over time for all files in the dataset

    dataset: A dataframe that lists all files belonging to a given sub-dataset (fhv, yellow, green, ...)
             with their metadata. It is sorted on (year, month) here, so the caller
             does not need to pre-sort it.

    output: a dataframe with the columns type, year, month and schema plus two extra
            columns per file:
              removed - tuple of the schema columns of the previous file that are absent in this file
              added   - tuple of the schema columns of this file that are absent in the previous file
            The first file (in year/month order) has nothing to compare against
            and gets an empty tuple in both columns.
    '''
    labels = ['type', 'year', 'month',  'schema', 'removed', 'added'] # The column labels of the resulting dataframe
    no_change = ()  # reported for the first file, which has no predecessor

    # sort_values returns a new dataframe, so the caller's dataframe is left untouched
    ordered = dataset.sort_values(by=['year', 'month'])
    schemas = list(ordered['schema'])

    removed = []  # per file: columns removed since the previous file
    added = []    # per file: columns added since the previous file
    for position, current in enumerate(schemas):
        if position == 0:
            removed.append(no_change)
            added.append(no_change)
            continue
        previous = schemas[position - 1]
        removed.append(diff_schema(previous, current))
        added.append(diff_schema(current, previous))

    ordered['removed'] = removed
    ordered['added'] = added

    # keep only the columns of interest, in the order given by labels
    return pd.DataFrame(ordered, columns=labels)

### Analysis of schema changes for fhv cab data files

Analyze the schema changes for the FHV cab data files. Write down your conclusions

In [16]:
# Compute the dataframe containing all files belonging to the fhv dataset
fhv_files = df[ df['type'] == 'fhv']
# Preview the first rows only
fhv_files.head()

Unnamed: 0,filename,type,year,month,size,num_records,schema
0,/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-01.csv,fhv,2015,1,4126514,136556,"(dispatching_base_num, pickup_date, locationid)"
1,/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-02.csv,fhv,2015,2,4712489,155514,"(dispatching_base_num, pickup_date, locationid)"
2,/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-03.csv,fhv,2015,3,4922012,163232,"(dispatching_base_num, pickup_date, locationid)"
3,/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-04.csv,fhv,2015,4,5845469,195182,"(dispatching_base_num, pickup_date, locationid)"
4,/home/marlene/Documents/ULB/CFDS/Final_Project_CFDS/data/fhv_tripdata_2015-05.csv,fhv,2015,5,6434970,214016,"(dispatching_base_num, pickup_date, locationid)"


In [135]:
# Analyze the schema changes: one row per fhv file, with the schema columns
# removed and added compared to the previous file
fhv_changes = analyze_schema_changes(fhv_files)
fhv_changes

Unnamed: 0,type,year,month,schema,removed,added
0,fhv,2015,1,"(dispatching_base_num, pickup_date, locationid)",(),()
1,fhv,2015,2,"(dispatching_base_num, pickup_date, locationid)",(),()
2,fhv,2015,3,"(dispatching_base_num, pickup_date, locationid)",(),()
3,fhv,2015,4,"(dispatching_base_num, pickup_date, locationid)",(),()
4,fhv,2015,5,"(dispatching_base_num, pickup_date, locationid)",(),()
5,fhv,2015,6,"(dispatching_base_num, pickup_date, locationid)",(),()
6,fhv,2015,7,"(dispatching_base_num, pickup_date, locationid)",(),()
7,fhv,2015,8,"(dispatching_base_num, pickup_date, locationid)",(),()
8,fhv,2015,9,"(dispatching_base_num, pickup_date, locationid)",(),()
9,fhv,2015,10,"(dispatching_base_num, pickup_date, locationid)",(),()


In [136]:
# fhv_changes has one row for each file in the fhv dataset
# The rows where nothing changes are actually not interesting
# Let us focus on only those lines where something is added or removed,
# i.e. where the 'added' or the 'removed' tuple differs from the empty tuple
fhv_delta = fhv_changes[(fhv_changes['added'] != ()) |  (fhv_changes['removed'] != ())]
fhv_delta

Unnamed: 0,type,year,month,schema,removed,added
24,fhv,2017,1,"(dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid)","(pickup_date, locationid)","(pickup_datetime, dropoff_datetime, pulocationid, dolocationid)"
30,fhv,2017,7,"(dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),"(sr_flag,)"
36,fhv,2018,1,"(pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag, dispatching_base_number, dispatching_base_num)",(),"(dispatching_base_number,)"
48,fhv,2019,1,"(dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)","(dispatching_base_number,)",()


Now inspect fhv_delta further, line by line, and describe what the changes are

### Conclusion for fhv data files

**From 2015.01 to 2016.12** we have (dispatching_base_num, pickup_date, locationid) as schema <br>
**In 2017.01** columns (pickup_date, locationid) were removed and columns (pickup_datetime, dropoff_datetime, pulocationid, dolocationid) were added. <br>
**In 2017.07**  column (sr_flag) was added. <br>
**In 2018.01**  column (dispatching_base_number,) was added.  <br>
**In 2019.01**  column (dispatching_base_number,) was removed.  <br>

### Analysis of schema changes for fhvhv data files

Analyze the schema changes for the fhvhv data files. Write down your conclusions

In [142]:
# Select all files belonging to the fhvhv dataset
fhvhv_files = df[df['type'] == 'fhvhv']
# Compute, for each fhvhv file, the schema columns removed/added since the previous file
fhvhv_changes = analyze_schema_changes(fhvhv_files)
fhvhv_changes

Unnamed: 0,type,year,month,schema,removed,added
59,fhvhv,2019,2,"(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),()
60,fhvhv,2019,3,"(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),()
61,fhvhv,2019,4,"(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),()
62,fhvhv,2019,5,"(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),()
63,fhvhv,2019,6,"(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag)",(),()


In [34]:
# Number of files in the fhvhv dataset
len(fhvhv_files)

5

### Conclusion for fhvhv data files

**From 2019.02 to 2019.06** the schema remains the same: <br>
(hvfhs_license_num, dispatching_base_num, pickup_datetime, dropoff_datetime, pulocationid, dolocationid, sr_flag) 	

### Analysis of schema changes for green cab data files

Analyze the schema changes for the green taxi data files. Write down your conclusions

In [12]:
# Widen the column display and raise the row limit to 71 so that the long
# schemas and the rows of the result are shown without truncation.
# The full option names are used: abbreviated names such as 'display.max_row'
# only work through pandas' pattern matching of option names.
pd.set_option('display.max_colwidth', 350)
pd.set_option('display.max_rows', 71)
# Select all files belonging to the green dataset
green_files = df[df['type'] == 'green']
# Compute, for each green file, the schema columns removed/added since the previous file
green_changes = analyze_schema_changes(green_files)
green_changes

Unnamed: 0,type,year,month,schema,removed,added
64,green,2013,8,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
65,green,2013,9,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
66,green,2013,10,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
67,green,2013,11,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
68,green,2013,12,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
69,green,2014,1,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
70,green,2014,2,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
71,green,2014,3,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
72,green,2014,4,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()
73,green,2014,5,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type)",(),()


In [32]:
# Number of files in the green dataset
len(green_files)

71

In [13]:
# Keep only the rows where something changed, i.e. where the 'added'
# or the 'removed' tuple differs from the empty tuple
green_delta = green_changes[(green_changes['added'] != ()) |  (green_changes['removed'] != ())]
green_delta

Unnamed: 0,type,year,month,schema,removed,added
81,green,2015,1,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type)",(),"(improvement_surcharge,)"
99,green,2016,7,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pulocationid, dolocationid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type)","(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude)","(pulocationid, dolocationid)"
129,green,2019,1,"(vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pulocationid, dolocationid, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, improvement_surcharge, total_amount, payment_type, trip_type, congestion_surcharge)",(),"(congestion_surcharge,)"


### Conclusion for green data files

**From 2013.08 to 2014.12** we have as schema: <br>
    (vendorid, lpep_pickup_datetime, lpep_dropoff_datetime, store_and_fwd_flag, ratecodeid, pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, passenger_count, trip_distance, fare_amount, extra, mta_tax, tip_amount, tolls_amount, ehail_fee, total_amount, payment_type, trip_type) <br>
**In 2015.01**  column (improvement_surcharge,) was added. <br>
**In 2016.07**  columns (pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) were removed and 
columns (pulocationid, dolocationid) were added. <br>
**In 2019.01**  column (congestion_surcharge,) was added.  <br>

### Analysis of schema changes for yellow cab data files

Analyze the schema changes for the Yellow taxi data files. Write down your conclusions

In [144]:
# Select all files belonging to the yellow dataset
yellow_files = df[df['type'] == 'yellow']
# Compute, for each yellow file, the schema columns removed/added since the previous file
yellow_changes = analyze_schema_changes(yellow_files)
yellow_changes

Unnamed: 0,type,year,month,schema,removed,added
135,yellow,2009,1,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
136,yellow,2009,2,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
137,yellow,2009,3,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
138,yellow,2009,4,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
139,yellow,2009,5,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
140,yellow,2009,6,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
141,yellow,2009,7,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
142,yellow,2009,8,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
143,yellow,2009,9,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()
144,yellow,2009,10,"(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt)",(),()


In [145]:
# Number of files in the yellow dataset
len(yellow_files)

126

In [146]:
# Keep only the rows where something changed, i.e. where the 'added'
# or the 'removed' tuple differs from the empty tuple
yellow_delta = yellow_changes[(yellow_changes['added'] != ()) |  (yellow_changes['removed'] != ())]
yellow_delta

Unnamed: 0,type,year,month,schema,removed,added
147,yellow,2010,1,"(vendor_id, pickup_datetime, dropoff_datetime, passenger_count, trip_distance, pickup_longitude, pickup_latitude, rate_code, store_and_fwd_flag, dropoff_longitude, dropoff_latitude, payment_type, fare_amount, surcharge, mta_tax, tip_amount, tolls...","(vendor_name, trip_pickup_datetime, trip_dropoff_datetime, start_lon, start_lat, store_and_forward, end_lon, end_lat, fare_amt, tip_amt, tolls_amt, total_amt)","(vendor_id, pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude, store_and_fwd_flag, dropoff_longitude, dropoff_latitude, fare_amount, tip_amount, tolls_amount, total_amount)"
207,yellow,2015,1,"(vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, pickup_longitude, pickup_latitude, ratecodeid, store_and_fwd_flag, dropoff_longitude, dropoff_latitude, payment_type, fare_amount, extra, mta_tax, tip_amount,...","(vendor_id, pickup_datetime, dropoff_datetime, rate_code, surcharge)","(vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, ratecodeid, extra, improvement_surcharge)"
225,yellow,2016,7,"(vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, pulocationid, dolocationid, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_...","(pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude)","(pulocationid, dolocationid)"
255,yellow,2019,1,"(vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, passenger_count, trip_distance, ratecodeid, store_and_fwd_flag, pulocationid, dolocationid, payment_type, fare_amount, extra, mta_tax, tip_amount, tolls_amount, improvement_surcharge, total_...",(),"(congestion_surcharge,)"


### Conclusion for yellow data files

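**From 2009.01 to 2009.12** we have (vendor_name, trip_pickup_datetime, trip_dropoff_datetime, passenger_count, trip_distance, start_lon, start_lat, rate_code, store_and_forward, end_lon, end_lat, payment_type, fare_amt, surcharge, mta_tax, tip_amt, tolls_amt, total_amt) as schema <br>
**In 2010.01**  columns (vendor_name, trip_pickup_datetime, trip_dropoff_datetime, start_lon, start_lat, store_and_forward, end_lon, end_lat, fare_amt, tip_amt, tolls_amt, total_amt) were removed and columns (vendor_id, pickup_datetime, dropoff_datetime, pickup_longitude, pickup_latitude, store_and_fwd_flag, dropoff_longitude, dropoff_latitude, fare_amount, tip_amount, tolls_amount, total_amount) were added; these look like renames of the same attributes. <br>
**In 2015.01**  columns (vendor_id, pickup_datetime, dropoff_datetime, rate_code, surcharge) were removed and columns (vendorid, tpep_pickup_datetime, tpep_dropoff_datetime, ratecodeid, extra, improvement_surcharge) were added. <br>
**In 2016.07**  columns (pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude) were removed and columns (pulocationid, dolocationid) were added. <br>
**In 2019.01**  column (congestion_surcharge,) was added.  <br>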