In [2]:
import datetime
import operator
import os
import sys

import pyspark.ml.feature as feature
import pyspark.ml as ml
import pyspark.ml.clustering as clustering
import pyspark.sql as sql
import pyspark.sql.functions as functions
import pyspark.sql.types as types

# Add all scripts from repository to local path. 
# From https://stackoverflow.com/a/35273613 .
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import twokenize
import grid

# From https://stackoverflow.com/a/36218558 .
def sparkImport(module_name, module_directory):
    """
    Convenience function. 
    
    Tells the SparkContext sc (must already exist) to load
    module module_name on every computational node before
    executing an RDD. 
    
    Args:
        module_name: the name of the module, without ".py". 
        module_directory: the path, absolute or relative, to
                          the directory containing module
                          module_Name. 
    
    Returns: none. 
    """
    module_path = os.path.abspath(
        module_directory + "/" + module_name + ".py")
    sc.addPyFile(module_path)

sparkImport("twokenize", "..")
sparkImport('grid', '..')

ss = sql.SparkSession.builder.appName("TwitterTokenizing")\
                             .getOrCreate()

In [3]:
tweets_schema = types.StructType([
  types.StructField('id', types.LongType()),
  types.StructField('timestamp', types.LongType()),
  types.StructField('postalCode', types.StringType()),
  types.StructField('lon', types.DoubleType()),
  types.StructField('lat', types.DoubleType()),
  types.StructField('tweet', types.StringType()),
  types.StructField('user_id', types.LongType()),
  types.StructField('application', types.StringType()),
  types.StructField('source', types.StringType())
])
tweets_df = ss.read.csv('tweets2.csv',
                         escape='"',
                         header='true',
                         schema=tweets_schema,
                         mode='DROPMALFORMED')
tweets_df = tweets_df.drop('id') \
                     .drop('postalCode') \
                     .drop('user_id') \
                     .drop('application') \
                     .drop('source')

date_column = tweets_df['timestamp'].cast(types.TimestampType()) \
                                    .cast(types.DateType())
tweets_df = tweets_df.withColumn('date', date_column) \
                     .drop('timestamp')

date_to_column = functions.lit(datetime.datetime(2016, 3, 3))
date_from_column = functions.lit(functions.date_sub(date_to_column, 31))
tweets_df = tweets_df.filter(
    ~(tweets_df.date < date_from_column)
    & (tweets_df.date < date_to_column))

sql_tokenize = functions.udf(
    lambda tweet: twokenize.tokenize(tweet),
    returnType=types.ArrayType(types.StringType()))
tweets_df = tweets_df \
    .withColumn("tweet_tokens", sql_tokenize(tweets_df.tweet)) \
    .drop('tweet')

print(tweets_df.count())
print(tweets_df.take(1))

151306
[Row(lon=-74.27470828, lat=40.59844873, date=datetime.date(2016, 2, 1), tweet_tokens=[u'This', u'is', u'me', u',', u'YL', u'a', u'voice', u'from', u'JERSEY', u'..', u'And', u'im', u'pushin', u'this', u'#blackpowermovement', u'!', u'Links', u'in', u'my', u'bio', u'!', u'#BlackHistoryMonth', u'!', u'https://t.co/MLvAS9jIqj'])]


In [4]:
# Southwest corner of New York:
# lat = 40.488320, lon = -74.290739
# Northeast corner of New York:
# lat = 40.957189, lon = -73.635679

latlongrid = grid.LatLonGrid(
    lat_min=40.488320,
    lat_max=40.957189,
    lon_min=-74.290739,
    lon_max=-73.635679,
    lat_step=grid.get_lon_delta(1000, (40.957189 - 40.488320)/2.0),
    lon_step=grid.get_lat_delta(1000))

# The only way to group elements and get a set of data (as far as I know) is by converting the DataFrame into an RDD. 

row_to_gridsquare_tokens = lambda row: (
    latlongrid.grid_square_index(lat=row['lat'], lon=row['lon']),
    row['tweet_tokens'])

tokens_rdd = tweets_df.rdd.map(row_to_gridsquare_tokens) \
                          .reduceByKey(operator.concat)

tokens_df_schema = types.StructType([
    types.StructField('grid_square', types.IntegerType()),
    types.StructField('tokens', types.ArrayType(types.StringType()))
])
tokens_df = ss.createDataFrame(tokens_rdd, schema=tokens_df_schema)

print(tokens_df.count())
print(tokens_df.take(1))

1354
[Row(grid_square=1938, tokens=[u'#Retail', u'#Job', u'alert', u':', u'Assistant', u'Store', u'Manager', u'|', u'The', u'Vitamin', u'Shoppe', u'|', u'#NorthBergen', u',', u'NJ', u'https://t.co/oGkm5a2bWz', u'#Jobs', u'#Hiring', u"I'm", u'at', u'@Walmart', u'Supercenter', u'in', u'North', u'Bergen', u',', u'NJ', u'https://t.co/DvACYYIb5e', u'Want', u'to', u'work', u'in', u'#NorthBergen', u',', u'NJ', u'?', u'View', u'our', u'latest', u'opening', u':', u'https://t.co/JNE3Pv4jQR', u'#Retail', u'#Job', u'#Jobs', u'#Hiring', u'This', u'#Retail', u'#job', u'might', u'be', u'a', u'great', u'fit', u'for', u'you', u':', u'Assistant', u'Store', u'Manager', u'-', u'https://t.co/oGkm5a2bWz', u'#NorthBergen', u',', u'NJ', u'#Hiring', u'Key', u'Holder', u'-', u'The', u'Vitamin', u'Shoppe', u':', u'(', u'#NorthBergen', u',', u'NJ', u')', u'https://t.co/HQUWNVzJfg', u'#Job', u'#Jobs', u'#Hiring', u'#CareerArc', u'The', u'Vitamin', u'Shoppe', u':', u'Store', u'Manager', u'(', u'#NorthBergen', u',',

In [None]:
count_vectorizer = feature.CountVectorizer(inputCol='tokens', outputCol='token_frequencies')
lda = clustering.LDA().setFeaturesCol('token_frequencies').setK(10).setTopicDistributionCol('topic_distributions')
pipeline = ml.Pipeline(stages=[count_vectorizer, lda])

lda_model = pipeline.fit(tokens_df)
topic_distributions = lda_model.transform(tokens_df)

print(topic_distributions.count())
print(topic_distributions.take(1))