In [2]:
import pyspark.sql as sql
import pyspark.sql.types as types

# Obtain the SparkSession used by every later cell; getOrCreate()
# hands back an already-running session when there is one.
ss = (
    sql.SparkSession.builder
       .appName("TwitterTokenizing")
       .getOrCreate()
)

In [3]:
# Explicit schema for unprocessed_tweets.csv, one add() per column
# in file order. Every field keeps the default nullable=True.
tweets_schema = (
    types.StructType()
         .add('id', types.LongType())
         .add('timestamp', types.LongType())
         .add('postalCode', types.StringType())
         .add('lon', types.DoubleType())
         .add('lat', types.DoubleType())
         .add('tweet', types.StringType())
         .add('user_id', types.StringType())
         .add('application', types.StringType())
         .add('source', types.StringType())
)

# Read the CSV with the schema above. The first line is a header,
# and rows that do not fit the schema are discarded (DROPMALFORMED)
# rather than failing the load.
tweets_df = (
    ss.read
      .format("com.databricks.spark.csv")
      .schema(tweets_schema)
      .option("header", "true")
      .option("mode", "DROPMALFORMED")
      .load("unprocessed_tweets.csv")
)
# All nine columns are retained (dropping 'id', 'postalCode',
# 'user_id', 'application' and 'source' is currently disabled).

print(tweets_df.columns)
print(tweets_df.take(1))

['id', 'timestamp', 'postalCode', 'lon', 'lat', 'tweet', 'user_id', 'application', 'source']
[Row(id=616018411009744896, timestamp=1435723208, postalCode=u'83.0', lon=-73.951206, lat=40.79435, tweet=u'Incident on #VariousLocalExpressBuses SB from 5th Avenue:106th Street to 5th Avenue: 57th Street http://t.co/KrLOmkAqcE', user_id=u'52272942', application=u'511NY-Tweets', source=u'511NY-Tweets')]


In [7]:
import os
import sys

# From https://stackoverflow.com/a/36218558 .
def sparkImport(module_name, module_directory, spark_context=None):
    """
    Convenience function. 
    
    Registers the source file of module module_name with a
    SparkContext (via addPyFile) so that the module is loaded
    on every computational node before executing an RDD. 
    
    Args:
        module_name: the name of the module, without ".py". 
        module_directory: the path, absolute or relative, to
                          the directory containing module
                          module_name. An empty string means
                          the current working directory. 
        spark_context: optional object exposing addPyFile(path),
                       normally a SparkContext. When omitted,
                       the context of the notebook's
                       SparkSession ss (must already exist)
                       is used. 
    
    Returns: none. 
    """
    if spark_context is None:
        # Looked up at call time so the function can be defined
        # before the session exists; ss is the only Spark handle
        # this notebook creates itself.
        spark_context = ss.sparkContext
    # os.path.join (rather than gluing with "/") keeps an empty
    # module_directory relative to the current directory instead
    # of silently pointing at the filesystem root.
    module_path = os.path.abspath(
        os.path.join(module_directory, module_name + ".py"))
    spark_context.addPyFile(module_path)

# Make the repository's scripts (one directory up) importable from
# this kernel, without adding the same entry to sys.path twice.
# Approach from https://stackoverflow.com/a/35273613 .
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# Import the tokenizer for the driver, then register the same file
# with Spark through sparkImport.
import twokenize
sparkImport(module_name="twokenize", module_directory="..")

# Quick local check of the tokenizer on a single tweet.
example_tweet = u'Incident on #VariousLocalExpressBuses SB from 5th Avenue:106th Street to 5th Avenue: 57th Street http://t.co/KrLOmkAqcE'
print(twokenize.tokenize(example_tweet))

[u'Incident', u'on', u'#VariousLocalExpressBuses', u'SB', u'from', u'5th', u'Avenue', u':', u'106th', u'Street', u'to', u'5th', u'Avenue', u':', u'57th', u'Street', u'http://t.co/KrLOmkAqcE']


In [8]:
import pyspark.sql.functions as functions

# Spark SQL UDF that tokenizes one tweet into an array of strings.
# The 'tweet' column is nullable and Spark hands null values to a
# Python UDF as None, so None is passed through as a null result
# instead of being fed to twokenize.tokenize (which presumably
# expects text -- a single null row would otherwise fail the job).
sql_tokenize = functions.udf(
    lambda tweet: None if tweet is None else twokenize.tokenize(tweet),
    returnType=types.ArrayType(types.StringType()))

# Append the token list as a new "tokens" column; re-running the
# cell replaces that column rather than adding a second one.
tweets_df = tweets_df.withColumn("tokens",
                                 sql_tokenize(tweets_df.tweet))

print(tweets_df.columns)
print(tweets_df.take(1))

['id', 'timestamp', 'postalCode', 'lon', 'lat', 'tweet', 'user_id', 'application', 'source', 'tokens']
[Row(id=616018411009744896, timestamp=1435723208, postalCode=u'83.0', lon=-73.951206, lat=40.79435, tweet=u'Incident on #VariousLocalExpressBuses SB from 5th Avenue:106th Street to 5th Avenue: 57th Street http://t.co/KrLOmkAqcE', user_id=u'52272942', application=u'511NY-Tweets', source=u'511NY-Tweets', tokens=[u'Incident', u'on', u'#VariousLocalExpressBuses', u'SB', u'from', u'5th', u'Avenue', u':', u'106th', u'Street', u'to', u'5th', u'Avenue', u':', u'57th', u'Street', u'http://t.co/KrLOmkAqcE'])]
