In [1]:
from sqlalchemy import create_engine
import mta_functions as mta
import pandas as pd
import numpy as np

# DB Preprocessing

In [2]:
# Pull data if necessary
mta.download_files(16, '2020-01-04')

In [3]:
# Create dataframe from unprocessed directory files
mta_data = mta.df_from_directory('../data/turnstiles/processing')

In [4]:
# Create hash for each entry
# mta_data.set_index(pd.util.hash_pandas_object(mta_data), drop=False, inplace=True)

# Create concat string for entries
mta_data['UID'] = mta.concat_columns(mta_data, ['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME', 'DESC'])

In [5]:
# Convert entry string to hash
mta_data['UID'] = mta_data.apply(lambda x: mta.create_hashed_column(x['UID'], 16), axis = 1)

In [6]:
# Create concat string for turnstile
mta_data['TID'] = mta.concat_columns(mta_data, ['C/A', 'UNIT', 'SCP'])

In [7]:
# Create Turnstile ID for each turnstile
mta_data['TID'] = mta_data.apply(lambda x: mta.create_hashed_column(x['TID'], 16), axis = 1)

In [8]:
# Create DateTime column
mta_data['DATETIME'] = pd.to_datetime(mta_data.DATE + ' ' + mta_data.TIME)

In [9]:
#  Fix EXITS column name
mta_data.columns = [column.strip() for column in mta_data.columns]

In [10]:
# Reorganize Columns
mta_data = mta_data[['UID','TID','C/A','UNIT','SCP','STATION','LINENAME','DIVISION','DATETIME', 'DESC','ENTRIES','EXITS']]

In [11]:
mta_data

Unnamed: 0,UID,TID,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATETIME,DESC,ENTRIES,EXITS
0,9A2F381911EE6F7A,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 03:00:00,REGULAR,7402469,2511926
1,7FB9FB3796D02C5D,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 07:00:00,REGULAR,7402480,2511937
2,03DB69A90A1751AB,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 11:00:00,REGULAR,7402559,2512022
3,A51ABAC63D9013B1,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 15:00:00,REGULAR,7402755,2512073
4,B1EBC5155669839F,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 19:00:00,REGULAR,7403040,2512132
...,...,...,...,...,...,...,...,...,...,...,...,...
205616,F96EEC002D2D5CEC,B1A51961660208B4,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2020-01-17 04:00:00,REGULAR,5554,420
205617,6CAC93A8AE5EB49F,B1A51961660208B4,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2020-01-17 08:00:00,REGULAR,5554,420
205618,FD149E9B7C4238FB,B1A51961660208B4,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2020-01-17 12:00:00,REGULAR,5554,420
205619,E39BF10EA18411A6,B1A51961660208B4,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2020-01-17 16:00:00,REGULAR,5554,420


# Table Creation

## Division table

In [12]:
# Create dataframe for divisions
division_df = mta_data['DIVISION'].drop_duplicates().reset_index(drop=True)

In [13]:
# Create dictionary from dataframe
division_dict = division_df.to_dict()

In [14]:
# Create/append table and values
mta.df_append_to_table(division_df, 'divisions')

In [15]:
# Reassign df from updated table data
division_df = mta.df_from_table('divisions').reset_index(drop=True)

In [16]:
division_df

Unnamed: 0,index,DIVISION
0,0,BMT
1,1,IND
2,2,PTH
3,3,IRT
4,4,SRT
5,5,RIT


## Linename table

In [17]:
# Create dataframe for linenames
linename_df = mta_data['LINENAME'].drop_duplicates().reset_index(drop=True)

In [18]:
# Create dictionary from dataframe
linename_dict = linename_df.to_dict()

In [19]:
# Append/create table and values
mta.df_append_to_table(linename_df, 'linenames')

In [20]:
# Reassign dataframe from updated table data
linename_df = mta.df_from_table('linenames')

## Station Table

In [21]:
# Create dataframe for stations
station_df = mta_data['STATION'].drop_duplicates().reset_index(drop=True)

In [22]:
# Create dictionary from dataframe
station_dict = station_df.to_dict()

In [23]:
# Append/create table and values
mta.df_append_to_table(station_df, 'stations')

In [24]:
# Reassign dataframe from updated table data
station_df = mta.df_from_table('stations')

## Description Table

In [25]:
# Create dataframe for descriptions
desc_df = mta_data['DESC'].drop_duplicates().reset_index(drop=True)

In [26]:
# Create dictionary from dataframe
desc_dict = desc_df.to_dict()

In [27]:
# Append/create table and values
mta.df_append_to_table(desc_df, 'descriptions')

In [28]:
# Reassign dataframe from updated table data
desc_df = mta.df_from_table('descriptions')

In [29]:
mta_data.head()

Unnamed: 0,UID,TID,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATETIME,DESC,ENTRIES,EXITS
0,9A2F381911EE6F7A,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 03:00:00,REGULAR,7402469,2511926
1,7FB9FB3796D02C5D,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 07:00:00,REGULAR,7402480,2511937
2,03DB69A90A1751AB,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 11:00:00,REGULAR,7402559,2512022
3,A51ABAC63D9013B1,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 15:00:00,REGULAR,7402755,2512073
4,B1EBC5155669839F,90191926F891F8C7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-03-07 19:00:00,REGULAR,7403040,2512132


In [30]:
mta_dataTimes Sq-42 St

SyntaxError: invalid syntax (<ipython-input-30-3f7d7edd001d>, line 1)

In [29]:
raise Exception('Pause')

Exception: Pause

In [11]:
# Create df for turnstiles
turnstiles_columns = ['TID', 'C/A','UNIT','SCP','STATION','LINENAME','DIVISION']

# Append values to table
mta.df_append_to_table(turnstiles_df, 'turnstiles')

KeyError: Index(['TID'], dtype='object')

In [18]:
# mta_data = mta_data.drop(['TID','C/A','UNIT','SCP','STATION','LINENAME','DIVISION','DATETIME', 'DESC','ENTRIES','EXITS'], axis=1)
# mta_data

In [19]:
# mta.df_append_to_table(mta_data, 'traffic')

In [10]:
# Make copy of db to windows
cp ../data/db/mta.db /mnt/d/Metis/mnt_files/db/mta.db