# Data Wrangling Notebook 2

In [2]:
# Importing relevant libraries
from bs4 import BeautifulSoup as soup
import requests
import re
import boto3
import sys
import os
import pandas as pd
import csv
import s3fs

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, StructType
from pyspark.sql.types import *
from pyspark.sql.functions import col, split, slice, count, when, expr, isnan, isnull
from pyspark.sql.functions import date_format, to_timestamp, concat, unix_timestamp, substring, lit
from pyspark.sql.functions import col, month, quarter, dayofweek, year
from pyspark.sql import functions as f
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import monotonically_increasing_id 
from pyspark.sql.window import Window
from pyspark.sql.functions import regexp_replace

import configparser
import findspark
import lxml
from datetime import timedelta
from pandas.tseries.offsets import BDay
import itertools
import warnings
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Reading From S3 Bucket using boto3

In [None]:
############# This code block is used for pulling data from S3 using boto3 library
############# Note: Not necessary for this project

#getting relevant information for the S3 bucket

# AWS_S3_BUCKET='w210-bucket'
# AWS_S3_REGION='us-east-1'
# AWS_PROFILE_NAME='default'
# session = boto3.Session(profile_name=AWS_PROFILE_NAME)
# s3 = session.resource('s3')
# s3_client = session.client('s3',region_name=AWS_S3_REGION)
# my_bucket = s3.Bucket(AWS_S3_BUCKET)

# #printing all the files in ridership directory 
# for objects in my_bucket.objects.filter(Prefix="ridership/"):
#     print(objects.key)

# #printing all the files in weather directory 
# for objects in my_bucket.objects.filter(Prefix="weather/"):
#     print(objects.key)

# #reading one file only as pandas df
# obj = s3_client.get_object(Bucket= AWS_S3_BUCKET, Key= "ridership/date-hour-soo-dest-2011.csv") 
# # get object and file (key) from bucket
# initial_df = pd.read_csv(obj['Body'], header=None) 
# initial_df

# Reading From S3 Bucket using PySpark

In [4]:
### Starting Pyspark Session

spark = SparkSession.builder\
                    .config('spark.master','local[*]')\
                    .config('spark.add.name','S3app')\
                    .config('spark.jars.packages','org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-common:3.3.4')\
                    .config("spark.driver.memory", "8g") \
                    .getOrCreate()


In [5]:
spark

In [6]:
### Configuring Pyspark to read data from S3 Bucket. 

findspark.init()
config = configparser.ConfigParser()
# AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
# AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

aws_profile = 'default'
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id") 
access_key = config.get(aws_profile, "aws_secret_access_key")

# conf = SparkConf()
# conf.set('spark.executor.memory', '2g')
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
# spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 104857600)
# spark.conf.set("spark.sql.broadcastTimeout", 3000)

# spark.conf.set('spark.executor.memory', '4G')
# spark.conf.set('spark.driver.memory', '16G')
# spark.conf.set('spark.driver.maxResultSize', '10G')

# spark.conf.set("spark.driver.maxResultSize", "8g")




sc=spark.sparkContext
hadoop_conf=sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.committer.name","magic")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3a.awsSecretAccessKey", access_key)
hadoop_conf.set('spark.sql.files.maxPartitionBytes','134217728')

# Sports Event Data

### In this section of Notebook, we will read the Sports Events data and do data wrangling

We will use webscrapping to grab the necessary data

In [7]:
######## Note: This block has been commented out, the webscrapping has already been completed and added to S3 bucket. ########

# # mapping dates
# days = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4,'May':5,'Jun':6, 
#          'Jul':7, 'Aug':8, 'Sep':9,'Oct':10, 'Nov':11, 'Dec':12}

# # generate input dates
# date_range = pd.date_range('2011-01-01', '2022-12-31', freq='D')
# date_range = pd.Series(date_range)
# date_range = pd.to_datetime(date_range)
# date_range_list = date_range.dt.strftime('%Y/%m/%d').tolist()
# date_range_list.sort()
# input_dates = date_range_list

# # input_dates = ['2022/02/27','2022/03/02','2022/11/01','2022/11/04','2022/11/05']

# # starting an empty dataframe
# final_df = pd.DataFrame()

# #web scrapping the relevant information

# # getting data for each date
# for input_date in input_dates:
#     str = 'https://dothebay.com/events/sports/{}'
#     url = str.format(input_date)
#     page = requests.get(url)
#     soup = BeautifulSoup(page.text,"html.parser")
    
#     # starting empty lists
#     event_venues = []
#     venue_location = []
#     event_ts = []
#     latitude = []
#     longitude = []
    
#     overall_groups = soup.find('div', attrs={'class','ds-listings ds-listings-list'})
#     if overall_groups is None:
#         continue
#     if overall_groups.find('div', attrs={'class','ds-events-page'}) is None:
#         continue 
#     item = soup.find('div', attrs={'class': 'ds-events-group'})
#     if item is None:
#         continue
#     upcoming = soup.find('div', attrs={'class': 'upcoming-event-groups'})
#     if upcoming is None:
#         continue
     
#     #getting the venue names
#     for i in item.find_all('div', attrs={'class': 'ds-venue-name'}):
#         event_venues.append(i.text)
#         venue_names = [i.strip() for i in event_venues]
        
#     #getting the timestamps
#     for t in item.find_all('meta', attrs={'itemprop':'startDate'}):
#         time = t.get('content')
#         event_ts.append(time)
        
    
#     #getting the geo_locations    
#     for l in item.find_all('span',attrs={'itemprop':'geo'}):
#         for lat in l.find_all('meta', attrs={'itemprop':'latitude'}):
#             lati = lat.get('content')
#             latitude.append(lati)
#         for lon in l.find_all('meta', attrs={'itemprop':'longitude'}):
#             long = lon.get('content')
#             longitude.append(long)
            
#     #generating a dataframe
#     venue_names = pd.DataFrame(venue_names, columns=['venue_name'])
#     latitude = pd.DataFrame(latitude, columns=['latitude'])
#     longitude = pd.DataFrame(longitude, columns=['longitude'])
#     time = pd.DataFrame(event_ts, columns=['ts'])

#     df_1 = pd.concat([time,venue_names,latitude,longitude,], axis = 1)
#     final_df = pd.concat([final_df,df_1], axis =0)



In [15]:
#### read the sporting events data. 

#read to pandas
sports = pd.read_excel('s3a://w210-bucket/sports_events_full.xlsx')
sports_1 = sports.dropna(subset = ['ts'])

# Filling up the missing longitude and latitude data
# getting rid of NAs
sports_2 = sports_1.drop(columns=['ts']) \
                   .dropna() \
                   .drop_duplicates(subset=['venue_name'], keep='first', inplace=False, ignore_index=True)

# sports_2 = sports_2.dropna()
# sports_2 = sports_2.drop_duplicates(subset=['venue_name'], keep='first', inplace=False, ignore_index=True)
sports_3 = pd.merge(sports_1, sports_2, on = 'venue_name', how ='outer')

sports_4 = sports_3.drop(columns=['latitude_x','longitude_x']) \
                   .drop(columns=['ts']) \
                   .drop_duplicates(subset=['venue_name'], keep='first', inplace=False, ignore_index=True) \
                   .rename(columns={"latitude_y": "latitude", "longitude_y": "longitude"})

data = {'venue_name': ['Civic Center Plaza','Civic Center','Downtown Oakland','Spartan Stadium','Laird Q. Cagan Stadium',
                       'UCB Edwards Stadium','PayPal Park', 'AT&T Park',
                       ],
        'latitude': [37.7801340,37.7799730,37.8043549,37.3197651,37.4332795,37.8691258,37.3513242,37.3350794],
        'longitude': [-122.4177319,-122.4187285,-122.2710607,-121.8682858,-122.1580094,-122.2648963,-121.9245659,-121.880754]}
missing_geos =pd.DataFrame(data)

sports_5 = pd.concat([sports_4,missing_geos]) \
             .dropna()

final_sports = pd.merge(sports_1,sports_5, on = 'venue_name', how = 'outer' )
final_sports = final_sports.drop(columns=['latitude_x','longitude_x'])\
                           .rename(columns={"latitude_y": "latitude", "longitude_y": "longitude"}) \
                           .dropna()

# #convert to PySpark Dataframe
mySchema = StructType([ StructField("ts", StringType(), True)\
                       ,StructField("venue_name", StringType(), True) \
                       ,StructField("latitude", StringType(), True) \
                       ,StructField("longitude", StringType(), True)])
df_sports = spark.createDataFrame(final_sports, schema=mySchema)

#convert to timestamp
df_sports = df_sports.withColumn("ts", concat(substring(col("ts"), 1, 10), lit(' '), substring(col("ts"),12 , 2), lit(':00')))
df_sports = df_sports.withColumn("ts",f.to_timestamp(col('ts')))

#rename venue latitude and longitude
df_sports = df_sports.withColumnRenamed('latitude','latitude_venue') \
                     .withColumnRenamed('longitude','longitude_venue')
df_sports = df_sports.orderBy('ts')
df_sports = df_sports.withColumn("longitude_venue", concat(substring(col("longitude_venue"), 1, 11)))
df_sports = df_sports.withColumn("latitude_venue", concat(substring(col("latitude_venue"), 1, 9)))

# filtering venues
df_sports_filtered = df_sports.toPandas()
df_sports_filtered=df_sports_filtered.replace('Avaya Stadium (SJ)','PayPal Park')
df_sports_filtered=df_sports_filtered.replace('AT&T Park','Oracle Park')
df_sports_filtered = df_sports_filtered[df_sports_filtered.groupby("venue_name")["venue_name"].transform('count') > 4]
filtered = df_sports_filtered['venue_name'] != 'Era Art Bar and Lounge'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != '4th Street Pizza Company'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Sweetwater Music Hall'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Trademark Sports Bar & Eatery'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Madrone Art Bar'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Trademark and Copyright'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Hawthorn'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Jaxson'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Harding Park'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Oakland Technical High School'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Spark Social SF'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'The Phoenix Theater (Petaluma)'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Yellowjacket Stadium'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Golden 1 Center'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'District Six'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'The New Parkway'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Candlestick Park'
df_sports_filtered = df_sports_filtered[filtered]
filtered = df_sports_filtered['venue_name'] != 'Kaiser Permanente Arena'



df_sports_filtered = df_sports_filtered[filtered]
sports_filtered = spark.createDataFrame(df_sports_filtered)
sports_filtered = sports_filtered.dropDuplicates()

sports_filtered.show()
sports_filtered.printSchema()
sports_filtered.count()

+-------------------+--------------------+--------------+---------------+
|                 ts|          venue_name|latitude_venue|longitude_venue|
+-------------------+--------------------+--------------+---------------+
|2013-02-10 19:00:00|Event Center at SJSU|     37.335079|    -121.880754|
|2013-01-16 19:00:00|          Cow Palace|     37.706765|    -122.418738|
|2015-07-05 13:00:00|Oakland-Alameda C...|     37.750342|    -122.202805|
|2012-03-24 19:00:00|          SAP Center|     37.332240|    -121.901650|
|2012-11-16 19:00:00|          Cow Palace|     37.706765|    -122.418738|
|2013-02-15 19:00:00|Event Center at SJSU|     37.335079|    -121.880754|
|2014-01-22 19:00:00|          SAP Center|     37.332240|    -121.901650|
|2013-01-29 19:00:00|          SAP Center|     37.332240|    -121.901650|
|2012-10-03 19:00:00|          SAP Center|     37.332240|    -121.901650|
|2013-01-31 19:00:00|          SAP Center|     37.332240|    -121.901650|
|2014-03-23 17:00:00|          SAP Cen

2940

In [16]:
sports_filtered.select('venue_name', 'latitude_venue', 'longitude_venue').distinct().show(50,truncate=False)

+-------------------------------+--------------+---------------+
|venue_name                     |latitude_venue|longitude_venue|
+-------------------------------+--------------+---------------+
|SAP Center                     |37.332240     |-121.901650    |
|Kezar Stadium                  |37.766948     |-122.456000    |
|Event Center at SJSU           |37.335079     |-121.880754    |
|Oakland-Alameda County Coliseum|37.750342     |-122.202805    |
|Oracle Park                    |37.335079     |-121.880754    |
|Cow Palace                     |37.706765     |-122.418738    |
|Oracle Park                    |37.778419     |-122.390621    |
|The Chapel                     |37.760565     |-122.421188    |
|Levi's Stadium                 |37.402317     |-121.968995    |
|PayPal Park                    |37.351573     |-121.925482    |
|Stanford Stadium               |37.434529     |-122.161122    |
|Spartan Stadium                |37.319765     |-121.868285    |
|Chase Center            

In [17]:
sports_filtered = sports_filtered.withColumn("latitude_venue", \
              when(sports_filtered["venue_name"] == 'Event Center at SJSU', 37.335366).otherwise(sports_filtered["latitude_venue"]))
sports_filtered = sports_filtered.withColumn("longitude_venue", \
              when(sports_filtered["venue_name"] == 'Event Center at SJSU', -121.881039).otherwise(sports_filtered["longitude_venue"]))
# 37.3353665202424, -121.8810393157378

sports_filtered = sports_filtered.withColumn("latitude_venue", \
              when(sports_filtered["venue_name"] == 'Oracle Park', 37.778832).otherwise(sports_filtered["latitude_venue"]))
sports_filtered = sports_filtered.withColumn("longitude_venue", \
              when(sports_filtered["venue_name"] == 'Oracle Park', -122.389344).otherwise(sports_filtered["longitude_venue"]))
# 37.77883251646694, -122.38934490408342

sports_filtered = sports_filtered.withColumn("latitude_venue", \
              when(sports_filtered["venue_name"] == 'Cow Palace', 37.707880).otherwise(sports_filtered["latitude_venue"]))
sports_filtered = sports_filtered.withColumn("longitude_venue", \
              when(sports_filtered["venue_name"] == 'Cow Palace', -122.420308).otherwise(sports_filtered["longitude_venue"]))
# 37.70788067558349, -122.42030831580716

sports_filtered = sports_filtered.withColumn("latitude_venue", \
              when(sports_filtered["venue_name"] == 'Historic BAL Theatre', 37.708800).otherwise(sports_filtered["latitude_venue"]))
sports_filtered = sports_filtered.withColumn("longitude_venue", \
              when(sports_filtered["venue_name"] == 'Historic BAL Theatre', -122.133087).otherwise(sports_filtered["longitude_venue"]))

# 37.708800133893575, -122.1330872483417

In [19]:
#### read the station data to get station abbreviations, Longitude, and Latitude

df_station = spark.read.option("header",True) \
           .csv("s3a://w210-bucket/ridership/station_bart.csv")

#rename columns
df_station = df_station.withColumnRenamed('Abbreviation','abbreviation') \
                        .withColumnRenamed('Location','location')\
                        .withColumnRenamed('Name','station_name').drop('description').drop('name')

#split longitude and latitude into separate columns
df_station = df_station.withColumn("longitude", split(col("location"), ",").getItem(0)) \
                         .withColumn("latitude", split(col("location"), ",").getItem(1)).drop('location')

# adding the missing station data to the df_station data
columns = ['abbreviation', 'station_name', 'longitude', 'latitude']
data = [\
        ('ANTC','Antioch Station', '-121.780320', '37.996012'),\
        ('PCTR','Pittsburg Center Station', '-121.888538', '38.018200'),\
        ('BERY','Berryessa Bart Station', '-121.874689', '37.368572'),\
        ('MLPT','Milpitas Bart Station', '-121.890621', '37.409878')]

missing_stations = spark.createDataFrame(data, columns)

df_station_full = df_station.union(missing_stations)
df_station_full.printSchema()
df_station_full.show(5, truncate = False)
df_station_full.count()

root
 |-- abbreviation: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)

+------------+-----------------------------------+-----------+---------+
|abbreviation|station_name                       |longitude  |latitude |
+------------+-----------------------------------+-----------+---------+
|12TH        |12th St. Oakland City Center (12TH)|-122.271450|37.803768|
|16TH        |16th St. Mission (16TH)            |-122.419694|37.765062|
|19TH        |19th St. Oakland (19TH)            |-122.268602|37.808350|
|24TH        |24th St. Mission (24TH)            |-122.418143|37.752470|
|ASHB        |Ashby (ASHB)                       |-122.270062|37.852803|
+------------+-----------------------------------+-----------+---------+
only showing top 5 rows



50

In [511]:
# # Calculate all the sports venue within a 3 mile radius
# from geopy.distance import geodesic

# @f.udf(returnType=FloatType())

# def geodesic_udf(a, b):
#     return geodesic(a, b).km*0.62

# sports_filtered1= sports_filtered.drop('ts')
# venue_bart_joined = sports_filtered1.join(df_station_full) \
#                                .withColumn('distance_mile', geodesic_udf(f.array('latitude','longitude'),
#                                                                      f.array('latitude_venue','longitude_venue'))).drop('station_name')

# venue_bart_joind_3_miles = venue_bart_joined.filter(col("distance_mile") <= 4).drop('longitude','latitude').dropDuplicates()
# venue_bart_joind_3_miles.show(5)
# venue_bart_joind_3_miles.printSchema()
# # venue_bart_joind_3_miles.count()                             

In [512]:
# saving venue_bart_joind_3_miles to a parquet file 
# venue_bart_joind_3_miles.write.parquet('s3a://w210-bucket/data_wrangling/venue_bart_joind_3_miles.parquet',mode='overwrite')

In [34]:
#Alternatively, calculate the closest distance of each bart station to a sports venue

# station_weather_joined to the station data
sports_filtered1 = sports_filtered.withColumn("index", monotonically_increasing_id())

#parititioning by index 
w=Window().partitionBy("index")

#calculating the Euclidean distance between Bart stations and weather stations
#filtering by min distance from each bart station to weather stations 
bart_venue_min_distance = sports_filtered1.join(df_station_full) \
            .withColumn("venue_distance",f.sqrt(f.pow(f.col("latitude")-f.col("latitude_venue"),2)+\
                                   f.pow(f.col("longitude")-f.col("longitude_venue"),2)))\
            .withColumn("venue_min_distance", f.min("venue_distance").over(w))\
            .filter('venue_distance=venue_min_distance') \
            .drop('station_name','index','venue_distance')
bart_venue_min_distance = bart_venue_min_distance.dropDuplicates()

# fixing the error 
bart_venue_min_distance = bart_venue_min_distance.withColumn("abbreviation", \
              when(bart_venue_min_distance["venue_name"] == 'Cow Palace', 'BALB').otherwise(bart_venue_min_distance["abbreviation"]))
bart_venue_min_distance = bart_venue_min_distance.withColumn("longitude", \
              when(bart_venue_min_distance["venue_name"] == 'BALB', -122.447506).otherwise(bart_venue_min_distance["longitude"]))
bart_venue_min_distance = bart_venue_min_distance.withColumn("latitude", \
              when(bart_venue_min_distance["venue_name"] == 'BALB', 37.721585).otherwise(bart_venue_min_distance["latitude"]))

bart_venue_min_distance = bart_venue_min_distance.withColumn("abbreviation", \
              when(bart_venue_min_distance["venue_name"] == 'Chase Center', '16TH').otherwise(bart_venue_min_distance["abbreviation"]))
bart_venue_min_distance = bart_venue_min_distance.withColumn("longitude", \
              when(bart_venue_min_distance["venue_name"] == '16TH', -122.419694).otherwise(bart_venue_min_distance["longitude"]))
bart_venue_min_distance = bart_venue_min_distance.withColumn("latitude", \
              when(bart_venue_min_distance["venue_name"] == '16TH', 37.765062).otherwise(bart_venue_min_distance["latitude"]))

# 37.76817935620899, -122.3878343023004

bart_venue_min_distance.printSchema()
bart_venue_min_distance.count()

root
 |-- ts: timestamp (nullable = true)
 |-- venue_name: string (nullable = true)
 |-- latitude_venue: string (nullable = true)
 |-- longitude_venue: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- venue_min_distance: double (nullable = true)



2621

In [35]:
bart_venue_min_distance.select('venue_name','abbreviation').distinct().show()

+--------------------+------------+
|          venue_name|abbreviation|
+--------------------+------------+
|Historic BAL Theatre|        BAYF|
|Oakland-Alameda C...|        COLS|
|        Chase Center|        16TH|
|    Stanford Stadium|        UCTY|
|          Cow Palace|        BALB|
|          SAP Center|        BERY|
|         Oracle Park|        MONT|
|      Levi's Stadium|        MLPT|
|         PayPal Park|        BERY|
|       Kezar Stadium|        16TH|
|Event Center at SJSU|        BERY|
|     Spartan Stadium|        BERY|
|          The Chapel|        16TH|
+--------------------+------------+



In [36]:
# saving bart_venue_min_distance to a parquet file in S3 bucket
# bart_venue_min_distance.write.parquet('s3a://w210-bucket/data_wrangling/bart_venue_min_distance.parquet',mode='overwrite')

                                                                                

# Joining Sporting Events Dataset and Bart Station Dataset Based on 3 mile radius

In [37]:
# reading the venue_bart_joind_3_miles date set 
venue_bart_joind = spark.read.parquet("s3a://w210-bucket/data_wrangling/venue_bart_joind_3_miles.parquet")
venue_bart_joind.printSchema()
venue_bart_joind.count()

#converting venue_bart_joind to pandas dataframe
df_venue_bart = venue_bart_joind.toPandas()
df_venue_bart = df_venue_bart.sort_values('abbreviation')
df_venue_bart1 = df_venue_bart.drop(columns=['latitude_venue','longitude_venue','distance_mile'])

# get unique bart station abbreviations
unique_bart_stations = []
for bart in df_venue_bart1['abbreviation']:
    unique_bart_stations.append(bart)
unique_bart_stations = list(set(unique_bart_stations))

new_list = list(enumerate(unique_bart_stations,1))
df_test = pd.DataFrame(new_list,columns=['bart_station_id','abbreviation'])

df_venue_bart1 = pd.merge(df_venue_bart1, df_test, on = 'abbreviation',how = 'left')
df_venue_bart2 = spark.createDataFrame(df_venue_bart1) 

# joining 
df_sports_id = df_sports.join(df_venue_bart2, 'venue_name', how='left')
# df_sports_id.show()

#drop NAs
df_sports_id2 = df_sports_id.na.drop().drop('latitude_venue','longitude_venue')
# df_sports_id2.show()

# reading df_join1 
df_join2 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined2.parquet")
# df_join1.printSchema()

# get unique bart station abbreviations
unique_bart_stations = []
for bart in df_venue_bart1['abbreviation']:
    unique_bart_stations.append(bart)
unique_bart_stations = list(set(unique_bart_stations))

new_list = list(enumerate(unique_bart_stations,1))
df_test = pd.DataFrame(new_list,columns=['bart_station_id','abbreviation'])
bart_id = spark.createDataFrame(df_test)

df_join3 = df_join2.join(bart_id, df_join2.origin == bart_id.abbreviation, how='left').drop('abbreviation')
df_join3.select('origin','bart_station_id').show(5)

df_sports_id_1 = df_sports_id.withColumn('bart_station_id',col('bart_station_id').cast(StringType()))
df_sports_id_1 = df_sports_id_1.withColumnRenamed('bart_station_id','bart_station_id_1')
df_sports_id_1 = df_sports_id_1.withColumnRenamed('ts','ts1')

df_sports_id_2 = df_sports_id_1.na.drop()

# joining df_join2 data to the df_sport data on venue_name and ts 

df_joined3 = df_join3.join(df_sports_id_2, (df_join3['ts'] == df_sports_id_2['ts1']) &
                                    (df_join3['bart_station_id'] == df_sports_id_2['bart_station_id_1']),'left').drop('bart_station_id_1','ts1','abbreviation')

AnalysisException: Path does not exist: s3a://w210-bucket/data_wrangling/venue_bart_joind_3_miles.parquet

In [464]:
# writing to S3 bucket as parquet file
# df_joined3.write.parquet('s3a://w210-bucket/data_wrangling/df_joined3.parquet',mode='overwrite')

In [493]:
df_3 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined3.parquet")

# renaming events columns 
df_4 = df_3.withColumnRenamed('bart_station_id','bart_station_id_origin')\
           .withColumnRenamed('venue_name','venue_name_origin')\
           .withColumnRenamed('latitude_venue','latitude_venue_origin')\
           .withColumnRenamed('longitude_venue','longitude_venue_origin')

## joining based on destination 

In [498]:
df_5 = df_4.join(bart_id, df_4.destination == bart_id.abbreviation, how='left').drop('abbreviation')

In [499]:
df_5.printSchema()

root
 |-- date: date (nullable = true)
 |-- hour: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- ridership_number: integer (nullable = true)
 |-- origin-des: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- quarter: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- ts: timestamp (nullable = true)
 |-- STATION_origin: string (nullable = true)
 |-- SOURCE_origin: string (nullable = true)
 |-- latitude_wthr_origin: string (nullable = true)
 |-- longitude_wthr_origin: string (nullable = true)
 |-- ELEVATION_origin: string (nullable = true)
 |-- NAME_origin: string (nullable = true)
 |-- REPORT_TYPE_origin: string (nullable = true)
 |-- CALL_SIGN_origin: string (nullable = true)
 |-- QUALITY_CONTROL_origin: string (nullable = tru

In [501]:
df_sports_id_1 = df_sports_id.withColumn('bart_station_id',col('bart_station_id').cast(StringType()))
df_sports_id_1 = df_sports_id_1.withColumnRenamed('bart_station_id','bart_station_id_1')
df_sports_id_1 = df_sports_id_1.withColumnRenamed('ts','ts1')
df_sports_id_2 = df_sports_id_1.na.drop()
df_sports_id_2.printSchema()
df_sports_id_2.count()

root
 |-- venue_name: string (nullable = true)
 |-- ts1: timestamp (nullable = true)
 |-- latitude_venue: string (nullable = true)
 |-- longitude_venue: string (nullable = true)
 |-- abbreviation: string (nullable = true)
 |-- bart_station_id_1: string (nullable = true)



8043

In [502]:
# joining df_join3 data to the df_sport data on venue_name and ts 

df_joined4 = df_5.join(df_sports_id_2, (df_5['ts'] == df_sports_id_2['ts1']) &
                                    (df_5['bart_station_id'] == df_sports_id_2['bart_station_id_1']),'left').drop('bart_station_id_1','ts1','abbreviation')

In [504]:
# writing to S3 bucket as parquet file
# df_joined4.write.parquet('s3a://w210-bucket/data_wrangling/df_joined4.parquet',mode='overwrite')

In [7]:
df_4 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined4.parquet")

23/02/28 04:21:45 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

In [8]:
df_4.printSchema()
df_4.count()

root
 |-- date: date (nullable = true)
 |-- hour: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- destination: string (nullable = true)
 |-- ridership_number: integer (nullable = true)
 |-- origin-des: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- quarter: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- ts: timestamp (nullable = true)
 |-- STATION_origin: string (nullable = true)
 |-- SOURCE_origin: string (nullable = true)
 |-- latitude_wthr_origin: string (nullable = true)
 |-- longitude_wthr_origin: string (nullable = true)
 |-- ELEVATION_origin: string (nullable = true)
 |-- NAME_origin: string (nullable = true)
 |-- REPORT_TYPE_origin: string (nullable = true)
 |-- CALL_SIGN_origin: string (nullable = true)
 |-- QUALITY_CONTROL_origin: string (nullable = tru

                                                                                

109769289

In [1]:
# df_4.select('ts','origin','destination','ridership_number','venue_name_origin','bart_station_id','venue_name').show(1000)

# Joining Sporting Events Dataset and Bart Station Dataset Based on minimum distance

In [38]:
# reading the df_join2 and the bart_venue_min_distance
df_join2_1 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined2_1.parquet")
bart_venue_min_distance = spark.read.parquet("s3a://w210-bucket/data_wrangling/bart_venue_min_distance.parquet")

In [39]:
## we will look at two hours before the event time. We will wait 3 hours for the event. And then consider one hour after the event 
sports_filtered = sports_filtered.drop('latitude_venue','longitude_venue')
final_venue_df = sports_filtered.join(bart_venue_min_distance, ['ts','venue_name'],'right').drop('venue_min_distance','longitude','latitude')
final_venue_df= final_venue_df.dropDuplicates()
final_venue_df_1 = final_venue_df.withColumn('venue_name',f.when(f.col("venue_name").isNull(),f.lit(0)).otherwise(f.lit(1)))\
           .withColumn('venue_name',col('venue_name').cast('string'))

ts_extract = final_venue_df_1.orderBy('ts').drop('latitude_venue','longitude_venue')
ts_extract_df = ts_extract.toPandas()
################### getting two hours before event, 3 hours for the duration of event, and 1 hour after event 

########### getting two hours before event ######################3
df_1_hr_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=1),axis=1)
df_1_hr_before_event_df = pd.DataFrame(df_1_hr_before_event, columns=['ts']) 
df_1_hr_before_event_df['event'] = 1 
before_1 = pd.concat([df_1_hr_before_event_df,ts_extract_df])
before_1['event'] = before_1['event'].astype('object')
before_1 = before_1.sort_values(['ts','abbreviation'])
before_1['abbreviation']= before_1['abbreviation'].bfill()
before_1['venue_name'] = before_1['venue_name'].fillna(before_1['event'])
before_1 = before_1.drop(columns=['event'])                                     ########### first df

df_2_hrs_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=2),axis=1)
df_2_hrs_before_event_df = pd.DataFrame(df_2_hrs_before_event, columns=['ts'])
df_2_hrs_before_event_df['event'] = 1 
before_2 = pd.concat([df_2_hrs_before_event_df,ts_extract_df])
before_2['event'] = before_2['event'].astype('object')
before_2 = before_2.sort_values(['ts','abbreviation'])
before_2['abbreviation']= before_2['abbreviation'].bfill()
before_2['venue_name'] = before_2['venue_name'].fillna(before_2['event'])
before_2 = before_2.drop(columns=['event'])                                    ############ second df

# ########### getting three hours during the event ######################
df_1_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
df_1_hr_during_event_df = pd.DataFrame(df_1_hr_during_event, columns=['ts'])
df_1_hr_during_event_df['event'] = 0 
during_1 = pd.concat([df_1_hr_during_event_df,ts_extract_df])
during_1['event'] = during_1['event'].astype('object')
during_1 = during_1.sort_values(['ts','abbreviation'])
during_1['abbreviation']= during_1['abbreviation'].ffill()
during_1['venue_name'] = during_1['venue_name'].fillna(during_1['event'])
during_1 = during_1.drop(columns=['event'])                                   ############ third df

df_2_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=2),axis=1)
df_2_hr_during_event_df = pd.DataFrame(df_2_hr_during_event, columns=['ts'])
df_2_hr_during_event_df['event'] = 0 
during_2 = pd.concat([df_2_hr_during_event_df,ts_extract_df])
during_2['event'] = during_2['event'].astype('object')
during_2 = during_2.sort_values(['ts','abbreviation'])
during_2['abbreviation']= during_2['abbreviation'].ffill()
during_2['venue_name'] = during_2['venue_name'].fillna(during_2['event'])
during_2 = during_2.drop(columns=['event'])                                  ############ 4th df

df_3_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=3),axis=1)
df_3_hr_during_event_df = pd.DataFrame(df_3_hr_during_event, columns=['ts'])
df_3_hr_during_event_df['event'] = 0 
during_3 = pd.concat([df_3_hr_during_event_df,ts_extract_df])
during_3['event'] = during_3['event'].astype('object')
during_3 = during_3.sort_values(['ts','abbreviation'])
during_3['abbreviation']= during_3['abbreviation'].ffill()
during_3['venue_name'] = during_3['venue_name'].fillna(during_3['event'])
during_3 = during_3.drop(columns=['event'])                                 ############ 5th df


# ########### one hours after the event ######################
df_1_hr_after_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=4),axis=1)
df_1_hr_after_event_df = pd.DataFrame(df_1_hr_after_event, columns=['ts'])
df_1_hr_after_event_df['event'] = 1 
after_1 = pd.concat([df_1_hr_after_event_df,ts_extract_df])
after_1['event'] = after_1['event'].astype('object')
after_1 = after_1.sort_values(['ts','abbreviation'])
after_1['abbreviation']= after_1['abbreviation'].ffill()
after_1['venue_name'] = after_1['venue_name'].fillna(after_1['event'])
after_1 = after_1.drop(columns=['event'])                                  ############# 6th df

# combining all datasets
full_event_date = pd.concat([before_1,before_2,during_1,during_2,during_3,after_1])
full_event_date = full_event_date.sort_values(['ts','abbreviation'])
full_event_date = full_event_date.drop_duplicates()

d = {1.0:'1',0.0:'0'}
full_event_date['venue_name'] = full_event_date['venue_name'].map(d)
full_event_date['venue_name'] = full_event_date['venue_name'].fillna(1)
full_event_date['venue_name']= full_event_date['venue_name'].astype('string')
full_event_date['abbreviation']= full_event_date['abbreviation'].astype('string')

# converting to PySpark
full_event_df = spark.createDataFrame(full_event_date)
full_event_df = full_event_df.withColumnRenamed('venue_name','event')

In [40]:
### joining bart_venue_min_distance dataset with the df_join2 dataset
full_event_df_1 = full_event_df.withColumnRenamed('abbreviation','origin')
df_joined_2_2 = df_join2_1.join(full_event_df_1,['ts','origin'],'left')

In [41]:
# saving df_joined_2_2 to a parquet file in S3 bucket
# df_joined_2_2.write.parquet('s3a://w210-bucket/data_wrangling/df_joined_2_2.parquet',mode='overwrite')

23/03/14 05:54:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

## Joining based on destination

In [42]:
# reading datasets
df_joined_2_2 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined_2_2.parquet")

In [43]:
#renaming columns 
df_joined_2_2 = df_joined_2_2.withColumnRenamed('event','event_origin')

In [44]:
#joining based on destination
full_event_df_1 = full_event_df.withColumnRenamed('abbreviation','destination')
df_joined_3_0 = df_joined_2_2.join(full_event_df_1,['ts','destination'],'left')

In [45]:
# saving df_joined_3_3 to a parquet file in S3 bucket
df_joined_3_0.write.parquet('s3a://w210-bucket/data_wrangling/df_joined_3_0.parquet',mode='overwrite')

                                                                                

In [46]:
df_joined_3_0 = spark.read.parquet("s3a://w210-bucket/data_wrangling/df_joined_3_0.parquet")

In [47]:
df_joined_3_0.printSchema()
df_joined_3_0.count()

root
 |-- ts: timestamp (nullable = true)
 |-- destination: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: string (nullable = true)
 |-- ridership_number: integer (nullable = true)
 |-- origin-des: string (nullable = true)
 |-- station_name: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- quarter: string (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- station_origin: string (nullable = true)
 |-- latitude_wthr_origin: string (nullable = true)
 |-- longitude_wthr_origin: string (nullable = true)
 |-- wthr_station_origin: string (nullable = true)
 |-- wind_speed_origin: double (nullable = true)
 |-- air_temp_origin: double (nullable = true)
 |-- precipitation_origin: double (nullable = true)
 |-- wth_type_origin: string (nullable = true)
 |-- station: string (nullable = true)


                                                                                

109242901

In [48]:
#checking null values in df_station data
missing_counts = df_joined_3_0.select([count(when(col(c).isNull(), c)).alias(c) for c in df_joined_3_0.columns]).toPandas()
missing_counts

                                                                                

Unnamed: 0,ts,destination,origin,date,hour,ridership_number,origin-des,station_name,longitude,latitude,...,station,latitude_wthr,longitude_wthr,wthr_station_name,wind_speed,air_temp,precipitation,wth_type,event_origin,event
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,108832579,108841264


In [282]:
bart_venue_min_distance = spark.read.parquet("s3a://w210-bucket/data_wrangling/bart_venue_min_distance.parquet")

In [283]:
bart_venue_min_distance.show()

+-------------------+--------------------+--------------+---------------+------------+-----------+---------+--------------------+
|                 ts|          venue_name|latitude_venue|longitude_venue|abbreviation|  longitude| latitude|  venue_min_distance|
+-------------------+--------------------+--------------+---------------+------------+-----------+---------+--------------------+
|2013-03-22 19:00:00|          Cow Palace|      37.70788|    -122.420308|        BALB|-122.433817|37.733064|0.028578434824177622|
|2014-02-08 19:00:00|          SAP Center|     37.332240|    -121.901650|        BERY|-121.874689|37.368572| 0.04524278666262851|
|2021-08-25 19:00:00|Oakland-Alameda C...|     37.750342|    -122.202805|        COLS|-122.196869|37.753661|0.006800871782343846|
|2022-09-27 19:00:00|      Levi's Stadium|     37.402317|    -121.968995|        MLPT|-121.890621|37.409878| 0.07873787269797004|
|2016-05-20 19:00:00|         Oracle Park|     37.778832|    -122.389344|        MONT|-122

In [505]:
sports_filtered = sports_filtered.drop('latitude_venue','longitude_venue')
final_venue_df = sports_filtered.join(bart_venue_min_distance, ['ts','venue_name'],'right').drop('venue_min_distance','longitude','latitude')
final_venue_df= final_venue_df.dropDuplicates()
final_venue_df.show()
final_venue_df.count()

+-------------------+--------------------+--------------+---------------+------------+
|                 ts|          venue_name|latitude_venue|longitude_venue|abbreviation|
+-------------------+--------------------+--------------+---------------+------------+
|2016-12-18 15:00:00|          SAP Center|     37.332240|    -121.901650|        BERY|
|2019-12-20 19:00:00|        Chase Center|     37.768829|    -122.389416|        MONT|
|2019-01-05 19:00:00|          SAP Center|     37.332240|    -121.901650|        BERY|
|2017-05-12 20:00:00|Event Center at SJSU|     37.335366|    -121.881039|        BERY|
|2022-06-06 19:00:00|         Oracle Park|     37.778832|    -122.389344|        MONT|
|2012-06-23 19:00:00|          SAP Center|     37.332240|    -121.901650|        BERY|
|2018-11-24 12:00:00|Event Center at SJSU|     37.335366|    -121.881039|        BERY|
|2015-12-13 09:00:00|      Levi's Stadium|     37.402317|    -121.968995|        MLPT|
|2012-07-22 19:00:00|          SAP Center| 

2621

In [592]:
sports_filtered = sports_filtered.drop('latitude_venue','longitude_venue')
final_venue_df = sports_filtered.join(bart_venue_min_distance, ['ts','venue_name'],'right').drop('venue_min_distance','longitude','latitude')
final_venue_df= final_venue_df.dropDuplicates()
final_venue_df_1 = final_venue_df.withColumn('venue_name',f.when(f.col("venue_name").isNull(),f.lit(0)).otherwise(f.lit(1)))\
           .withColumn('venue_name',col('venue_name').cast('string'))

ts_extract = final_venue_df_1.orderBy('ts').drop('latitude_venue','longitude_venue')
ts_extract_df = ts_extract.toPandas()
################### getting two hours before event, 3 hours for the duration of event, and 1 hour after event 

########### getting two hours before event ######################3
df_1_hr_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=1),axis=1)
df_1_hr_before_event_df = pd.DataFrame(df_1_hr_before_event, columns=['ts']) 
df_1_hr_before_event_df['event'] = 1 
before_1 = pd.concat([df_1_hr_before_event_df,ts_extract_df])
before_1['event'] = before_1['event'].astype('object')
before_1 = before_1.sort_values(['ts','abbreviation'])
before_1['abbreviation']= before_1['abbreviation'].bfill()
before_1['venue_name'] = before_1['venue_name'].fillna(before_1['event'])
before_1 = before_1.drop(columns=['event'])                                     ########### first df

df_2_hrs_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=2),axis=1)
df_2_hrs_before_event_df = pd.DataFrame(df_2_hrs_before_event, columns=['ts'])
df_2_hrs_before_event_df['event'] = 1 
before_2 = pd.concat([df_2_hrs_before_event_df,ts_extract_df])
before_2['event'] = before_2['event'].astype('object')
before_2 = before_2.sort_values(['ts','abbreviation'])
before_2['abbreviation']= before_2['abbreviation'].bfill()
before_2['venue_name'] = before_2['venue_name'].fillna(before_2['event'])
before_2 = before_2.drop(columns=['event'])                                    ############ second df

# ########### getting three hours during the event ######################
df_1_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
df_1_hr_during_event_df = pd.DataFrame(df_1_hr_during_event, columns=['ts'])
df_1_hr_during_event_df['event'] = 0 
during_1 = pd.concat([df_1_hr_during_event_df,ts_extract_df])
during_1['event'] = during_1['event'].astype('object')
during_1 = during_1.sort_values(['ts','abbreviation'])
during_1['abbreviation']= during_1['abbreviation'].ffill()
during_1['venue_name'] = during_1['venue_name'].fillna(during_1['event'])
during_1 = during_1.drop(columns=['event'])                                   ############ third df

df_2_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=2),axis=1)
df_2_hr_during_event_df = pd.DataFrame(df_2_hr_during_event, columns=['ts'])
df_2_hr_during_event_df['event'] = 0 
during_2 = pd.concat([df_2_hr_during_event_df,ts_extract_df])
during_2['event'] = during_2['event'].astype('object')
during_2 = during_2.sort_values(['ts','abbreviation'])
during_2['abbreviation']= during_2['abbreviation'].ffill()
during_2['venue_name'] = during_2['venue_name'].fillna(during_2['event'])
during_2 = during_2.drop(columns=['event'])                                  ############ 4th df

df_3_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=3),axis=1)
df_3_hr_during_event_df = pd.DataFrame(df_3_hr_during_event, columns=['ts'])
df_3_hr_during_event_df['event'] = 0 
during_3 = pd.concat([df_3_hr_during_event_df,ts_extract_df])
during_3['event'] = during_3['event'].astype('object')
during_3 = during_3.sort_values(['ts','abbreviation'])
during_3['abbreviation']= during_3['abbreviation'].ffill()
during_3['venue_name'] = during_3['venue_name'].fillna(during_3['event'])
during_3 = during_3.drop(columns=['event'])                                 ############ 5th df


# ########### one hours after the event ######################
df_1_hr_after_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=4),axis=1)
df_1_hr_after_event_df = pd.DataFrame(df_1_hr_after_event, columns=['ts'])
df_1_hr_after_event_df['event'] = 1 
after_1 = pd.concat([df_1_hr_after_event_df,ts_extract_df])
after_1['event'] = after_1['event'].astype('object')
after_1 = after_1.sort_values(['ts','abbreviation'])
after_1['abbreviation']= after_1['abbreviation'].ffill()
after_1['venue_name'] = after_1['venue_name'].fillna(after_1['event'])
after_1 = after_1.drop(columns=['event'])                                  ############# 6th df

# combining all datasets
full_event_date = pd.concat([before_1,before_2,during_1,during_2,during_3,after_1])
full_event_date = full_event_date.sort_values(['ts','abbreviation'])
full_event_date = full_event_date.drop_duplicates()

d = {1.0:'1',0.0:'0'}
full_event_date['venue_name'] = full_event_date['venue_name'].map(d)
full_event_date['venue_name'] = full_event_date['venue_name'].fillna(1)



In [578]:
################### getting two hours before event, 3 hours for the duration of event, and 1 hour after event 

########### getting two hours before event ######################3
df_1_hr_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=1),axis=1)
df_1_hr_before_event_df = pd.DataFrame(df_1_hr_before_event, columns=['ts']) 
df_1_hr_before_event_df['event'] = 1 
before_1 = pd.concat([df_1_hr_before_event_df,ts_extract_df])
before_1['event'] = before_1['event'].astype('object')
before_1 = before_1.sort_values(['ts','abbreviation'])
before_1['abbreviation']= before_1['abbreviation'].bfill()
before_1['venue_name'] = before_1['venue_name'].fillna(before_1['event'])
before_1 = before_1.drop(columns=['event'])                                     ########### first df

df_2_hrs_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=2),axis=1)
df_2_hrs_before_event_df = pd.DataFrame(df_2_hrs_before_event, columns=['ts'])
df_2_hrs_before_event_df['event'] = 1 
before_2 = pd.concat([df_2_hrs_before_event_df,ts_extract_df])
before_2['event'] = before_2['event'].astype('object')
before_2 = before_2.sort_values(['ts','abbreviation'])
before_2['abbreviation']= before_2['abbreviation'].bfill()
before_2['venue_name'] = before_2['venue_name'].fillna(before_2['event'])
before_2 = before_2.drop(columns=['event'])                                    ############ second df

# ########### getting three hours during the event ######################
df_1_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
df_1_hr_during_event_df = pd.DataFrame(df_1_hr_during_event, columns=['ts'])
df_1_hr_during_event_df['event'] = 0 
during_1 = pd.concat([df_1_hr_during_event_df,ts_extract_df])
during_1['event'] = during_1['event'].astype('object')
during_1 = during_1.sort_values(['ts','abbreviation'])
during_1['abbreviation']= during_1['abbreviation'].ffill()
during_1['venue_name'] = during_1['venue_name'].fillna(during_1['event'])
during_1 = during_1.drop(columns=['event'])                                   ############ third df

df_2_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=2),axis=1)
df_2_hr_during_event_df = pd.DataFrame(df_2_hr_during_event, columns=['ts'])
df_2_hr_during_event_df['event'] = 0 
during_2 = pd.concat([df_2_hr_during_event_df,ts_extract_df])
during_2['event'] = during_2['event'].astype('object')
during_2 = during_2.sort_values(['ts','abbreviation'])
during_2['abbreviation']= during_2['abbreviation'].ffill()
during_2['venue_name'] = during_2['venue_name'].fillna(during_2['event'])
during_2 = during_2.drop(columns=['event'])                                  ############ 4th df

df_3_hr_during_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=3),axis=1)
df_3_hr_during_event_df = pd.DataFrame(df_3_hr_during_event, columns=['ts'])
df_3_hr_during_event_df['event'] = 0 
during_3 = pd.concat([df_3_hr_during_event_df,ts_extract_df])
during_3['event'] = during_3['event'].astype('object')
during_3 = during_3.sort_values(['ts','abbreviation'])
during_3['abbreviation']= during_3['abbreviation'].ffill()
during_3['venue_name'] = during_3['venue_name'].fillna(during_3['event'])
during_3 = during_3.drop(columns=['event'])                                 ############ 5th df


# ########### one hours after the event ######################
df_1_hr_after_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=4),axis=1)
df_1_hr_after_event_df = pd.DataFrame(df_1_hr_after_event, columns=['ts'])
df_1_hr_after_event_df['event'] = 1 
after_1 = pd.concat([df_1_hr_after_event_df,ts_extract_df])
after_1['event'] = after_1['event'].astype('object')
after_1 = after_1.sort_values(['ts','abbreviation'])
after_1['abbreviation']= after_1['abbreviation'].ffill()
after_1['venue_name'] = after_1['venue_name'].fillna(after_1['event'])
after_1 = after_1.drop(columns=['event'])                                  ############# 6th df

# combining all datasets
full_event_date = pd.concat([before_1,before_2,during_1,during_2,during_3,after_1])
full_event_date = full_event_date.sort_values(['ts','abbreviation'])
full_event_date = full_event_date.drop_duplicates()

d = {1.0:'1',0.0:'0'}
full_event_date['venue_name'] = full_event_date['venue_name'].map(d)
full_event_date['venue_name'] = full_event_date['venue_name'].fillna(1)


In [157]:
# ################### getting two hours before event, 3 hours for the duration of event, and 1 hour after event 

# ########### getting two hours before event ######################3
# df_1_hr_before_event = ts_extract_df.apply(lambda x: x["ts"]-timedelta(hours=1),axis=1)
# df_1_hr_before_event_df = pd.DataFrame(df_1_hr_before_event, columns=['ts']) 
# df_1_hr_before_event_df['event'] = 1 # first df

# df_2_hrs_before_event = df_1_hr_before_event_df.apply(lambda x: x["ts"]-timedelta(hours=1),axis=1)
# df_2_hrs_before_event_df = pd.DataFrame(df_2_hrs_before_event, columns=['ts'])
# df_2_hrs_before_event_df['event'] = 1 # second df


# ########### getting three hours during the event ######################
# df_1_hr_after_event = ts_extract_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
# df_1_hr_after_event_df = pd.DataFrame(df_1_hr_after_event, columns=['ts'])
# df_1_hr_after_event_df['event'] = 0 # third df

# df_2_hr_after_event = df_1_hr_after_event_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
# df_2_hr_after_event_df = pd.DataFrame(df_2_hr_after_event, columns=['ts'])
# df_2_hr_after_event_df['event'] = 0 # fourth df

# df_3_hr_after_event = df_2_hr_after_event_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
# df_3_hr_after_event_df = pd.DataFrame(df_3_hr_after_event, columns=['ts'])
# df_3_hr_after_event_df['event'] = 0 # fifth df

# ########### one hours after the event ######################
# df_1_hr_event_end = df_3_hr_after_event_df.apply(lambda x: x["ts"]+timedelta(hours=1),axis=1)
# df_1_hr_event_end_df = pd.DataFrame(df_1_hr_event_end, columns=['ts'])
# df_1_hr_event_end_df['event']= 1 # sixth df

# full_event_date = pd.concat([ts_extract_df,df_1_hr_before_event_df,df_2_hrs_before_event_df,df_1_hr_after_event_df,
#                              df_2_hr_after_event_df,df_3_hr_after_event_df,df_1_hr_event_end_df])

In [158]:
full_event_date =full_event_date.fillna(1)

In [160]:
full_event_date.sort_values('ts').head()

Unnamed: 0,ts,event
0,2011-12-15 17:00:00,1.0
0,2011-12-15 18:00:00,1.0
0,2011-12-15 19:00:00,1.0
0,2011-12-15 20:00:00,0.0
0,2011-12-15 21:00:00,0.0


In [162]:
final_venue_df_pandas = final_venue_df_1.toPandas()

In [163]:
venue_final_df  = pd.merge(full_event_date,final_venue_df_pandas, on = 'ts')

Unnamed: 0,ts,venue_name,latitude_venue,longitude_venue,abbreviation
0,2016-12-18 15:00:00,1,37.332240,-121.901650,BERY
1,2019-12-20 19:00:00,1,37.768829,-122.389416,MONT
2,2019-01-05 19:00:00,1,37.332240,-121.901650,BERY
3,2017-05-12 20:00:00,1,37.335366,-121.881039,BERY
4,2022-06-06 19:00:00,1,37.778832,-122.389344,MONT
...,...,...,...,...,...
2935,2020-03-04 20:00:00,1,37.750342,-122.202805,COLS
2936,2013-03-14 19:00:00,1,37.332240,-121.901650,BERY
2937,2012-11-10 19:00:00,1,37.70788,-122.420308,BALB
2938,2016-02-29 19:00:00,1,37.332240,-121.901650,BERY


In [165]:
final_venue_df_pandas['ts'].nunique()

2412

In [166]:
sss = final_venue_df_pandas.drop_duplicates(subset=["ts"], keep=False)

In [167]:
sss['ts'].nunique()

1939