In [1]:
import os
import sys
import socket
from timeit import default_timer as timer
from datetime import datetime
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
import pyspark.sql.functions as F
from pyspark.sql.types import MapType, StringType, IntegerType, StructType, StructField, FloatType, ArrayType

In [2]:
# Evaluate the host check once: a hostname containing 'samuel' selects the
# local configuration, anything else selects the cluster configuration.
hostname = socket.gethostname()
running_locally = 'samuel' in hostname.lower()

# Reuse a `spark` name that already exists in the kernel; only build a session
# when the lookup raises NameError.
try:
    spark
except NameError:
    session_builder = SparkSession.builder
    if running_locally:
        print('Create Local SparkSession')
        # The local branch sets the driver host explicitly to localhost.
        session_builder = session_builder.config("spark.driver.host", "localhost")
    else:
        print('Create Cluster SparkSession')
    spark = session_builder.appName("get-checkins-CDMX").getOrCreate()

print('Hostname:', hostname)

# Input/output locations: first tuple is the local layout, second the cluster layout.
path_to_tweets, path_to_pois, path_to_output = (
    (
        '../../twitter/data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/',
        '../data/pois/',
        '../data/',
    )
    if running_locally else
    (
        '/user/spf248/twitter/data/decahose/parsed/tweets/tweets-with-geocoordinates-or-place-extract/',
        '/user/spf248/twitter/data/pois/',
        '/user/spf248/twitter/data/decahose/parsed/tweets/',
    )
)

Hostname: Samuels-MacBook-Pro.local


In [3]:
# Read every '*.snappy.parquet' file under path_to_tweets into `tweets`
# and report how long the read call took.
print('Import Tweets:')
start = timer()

# To try the pipeline on a single shard, read this file instead:
# tweets = spark.read.parquet(path_to_tweets+'part-00117-b7cf4768-c0a8-43db-86bd-d13b5a84ffbb-c000.snappy.parquet')
tweets_glob = f'{path_to_tweets}*.snappy.parquet'
tweets = spark.read.parquet(tweets_glob)

end = timer()
elapsed_sec = round(end - start)  # the timer brackets only the read call above
print('Computing Time:', elapsed_sec, 'sec')

Import Tweets:
Computing Time: 4 sec


In [4]:
# Read the POI CSV (first row used as the header) into `pois`
# and report how long the read call took.
print('Import POIS:')
start = timer()

pois_csv = f'{path_to_pois}pois-twitter-cdmx.csv'
pois = spark.read.csv(pois_csv, header=True)

end = timer()
elapsed_sec = round(end - start)
print('Computing Time:', elapsed_sec, 'sec')

Import POIS:
Computing Time: 2 sec


In [5]:
print('Merge on Place ID')
# Keep only the tweets whose place_id appears in the POI table.
# The POI 'id' column is renamed to 'place_id' and reduced to distinct values
# before joining: an inner join against a repeated key emits each matching
# tweet once per repetition, which would inflate the result (and compound the
# inflation if this cell is re-run, since `tweets` is reassigned in place).
poi_place_ids = (
    pois
    .withColumnRenamed('id', 'place_id')
    .select('place_id')
    .distinct()
)
tweets = tweets.join(poi_place_ids, on='place_id', how='inner')

Merge on Place ID


In [6]:
# Write the joined tweets as parquet under path_to_output, replacing any
# existing 'checkins-cdmx' output, and report how long the write took.
print('Save')
start = timer()

checkins_path = f'{path_to_output}checkins-cdmx'
overwrite_writer = tweets.write.mode("overwrite")
overwrite_writer.parquet(checkins_path)

end = timer()
elapsed_sec = round(end - start)
print('Computing Time:', elapsed_sec, 'sec')

Save
Computing Time: 3 sec
