# Concert Tweet Classifier

## Import Necesary Packages

In [166]:
import sparknlp
from pyspark.sql.types import *
from pyspark.sql.functions import count, when, col
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner, PerceptronApproach)
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
import pyspark.sql.functions as F
import pandas as pd
from sparknlp.pretrained import PretrainedPipeline
import re

## Start the spark-NLP session

In [2]:
spark = sparknlp.start()

In [3]:
spark.sparkContext.defaultParallelism

4

In [4]:
# adjust show output format to pandas-like
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# support converting pandas to spark
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Read the data

In the first go, there were 25k rows of null - where the schema did not match the data. I decided to do some quick cleaning.

In [5]:
def remove_extra_seps(in_file, out_file, sep):
    """removes newline characters that come before the line reaches four segments(3 separators)
    and combines "middle sections" with extra separators into a single segment by removing the separators.
    
    Args:
        in_file: path to read file
        out_file: path to write file
        sep: separator/delimitor
    """
    n_chunks = 4
    
    with open(in_file, 'r') as rf:
        with open(out_file, 'w') as wf:
            while True:
                line = rf.readline()
                
                # if end of file
                if line == '':
                    break
                    
                # if line has less than n_sep, strip the newline and add the next line
                if len(line.split(sep)) < n_chunks:
                    line = line.strip('\n')
                    line += rf.readline()
                
                wf.write(line)

In [6]:
remove_extra_seps('../../data/test_set_tweets.txt',
                      '../../data/test_set_tweets_clean.txt',
                     '\t')
remove_extra_seps('../../data/training_set_tweets.txt',
                      '../../data/training_set_tweets_clean.txt',
                     '\t')

In [7]:
# set the schema
tweet_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("t_id", StringType(), True),
    StructField("t_text", StringType(), True),
    StructField("t_dt", TimestampType(), True)
    ])

In [8]:
tweets_test = spark.read.csv('../../data/test_set_tweets_clean.txt', 
                              sep='\t',
                              schema=tweet_schema,
                              header="false")

In [9]:
tweets_training = spark.read.csv('../../data/training_set_tweets_clean.txt', 
                                 sep="\t", 
                                 schema=tweet_schema,
                                 header='false')

Since our data is unlabeled for our task, these test/train splits are not particularly useful, but a vestige of the original data set and purpose. We'll combine them.

In [10]:
tweets = tweets_test.union(tweets_training)

#### Future: Consider reading the data as a single column and then parsing. Compare outcome / number of tweets retrieved to that with the csv reading

## Data Set

### Tweets

In [11]:
tweets.select('*').show(5)

+--------+-----------+--------------------+-------------------+
| user_id|       t_id|              t_text|               t_dt|
+--------+-----------+--------------------+-------------------+
|22077441|10538487904|Ok today I have t...|2010-03-15 17:35:58|
|22077441|10536835844|I am glad I'm hav...|2010-03-15 16:53:44|
|22077441|10536809086|Honestly I don't ...|2010-03-15 16:52:59|
|22077441|10534149786|@LovelyJ_Janelle ...|2010-03-15 15:42:07|
|22077441|10530203659|Sitting infront o...|2010-03-15 13:55:22|
+--------+-----------+--------------------+-------------------+
only showing top 5 rows



In [12]:
tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
        tweets.columns]).show()

# print("""+-------+-----+------+-----+
# |user_id| t_id|t_text| t_dt|
# +-------+-----+------+-----+
# |  33289|33179| 32631|53805|
# +-------+-----+------+-----+""")

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  34555|34490| 34232|56489|
+-------+-----+------+-----+



In [13]:
tweets.count()

# print(8884863)

8884863

In [14]:
# tweets.distinct().count()
print(8850656)

8850656


It looks like the time stamp can be parsed from the end of the tweet text for many of these "null" datetimes.

In [15]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=22398295, t_id='10172355714', t_text='From my vantage point, when it comes to money, women tend to lack confidence in their ability to do the (cont) http://tl.gd/entj8\t2010-03-08 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='10025719159', t_text='The best way to move a mountain is one stone at a time. Nothing is insurmountable if you take one step (cont) http://tl.gd/dt36c\t2010-03-05 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='9828243702', t_text="I believe luck is preparation meeting opportunity. If you hadn't been prepared when the opportunity came (cont) http://tl.gd/cqt73\t2010-03-01 00:00:00", t_dt=None),
 Row(user_id=22398295, t_id='9542436310', t_text='Religion, philosophy, greeting cards, self-help books—they all tout the power of love. Being a chronic and (cont) http://tl.gd/b88fa\t2010-02-23 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='9504909244', t_text=' A Leader is someone who nurtures others and allows them to progress and perform to t

In [16]:
tweets = tweets.withColumn('datetime', 
                           F.when(F.col('t_dt').isNull(), 
                                  F.to_date(F.substring('t_text', -19, 19)))
                           .otherwise(F.col('t_dt'))
                          )

In [17]:
tweets = tweets.withColumn('t_text', 
                           F.when(F.col('t_dt').isNull(), 
                                  F.expr('substring(t_text, 1, length(t_text)-20)'))
                           .otherwise(F.col('t_text'))
                           )

In [18]:
tweets = tweets.withColumn('t_dt', F.col('datetime')).drop('datetime')

In [19]:
# save as parquet and reload
# tweets.write.parquet('../../data/tweets.parquet')
tweets = spark.read.parquet('../../data/tweets.parquet')

In [20]:
tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
        tweets.columns]).show()

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  34555|34490| 34232|54671|
+-------+-----+------+-----+



In [21]:
tweets.count()

8884863

In [22]:
tweets = tweets.dropna(how='any', subset=['t_text'])

In [23]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=25513575, t_id='10334442280', t_text='', t_dt=None),
 Row(user_id=25513575, t_id='10333612651', t_text='', t_dt=None),
 Row(user_id=16198727, t_id='6899029209', t_text='This vid cracked me up! haha I w', t_dt=None),
 Row(user_id=20106865, t_id='10362030419', t_text="Ladies and gentlemen... come and join me.  It'", t_dt=None),
 Row(user_id=20106865, t_id='10005503765', t_text='I am talking #Survivor RIGHT NOW in stickam', t_dt=None)]

Clearly I could do some more/better data engineering here, but for this exercise, I'm going to move on, dropping any records with null values or t_text with empty strings

In [24]:
tweets = tweets.dropna(how='any')

In [25]:
tweets = tweets.filter(~(tweets.t_text == ""))

In [26]:
tweets.count()

8829912

# Concert tweets - Classifier

I am deciding to focus on english tweets for now. (may add spanish, others in the future based on presence in the data set).

In [27]:
eng_stopwords = stopwords.words('english')

setting up the pieces of my pipeline to extract text info from the tweets (we'll use a pretrained pipeline later)

In [28]:
documentAssembler = DocumentAssembler() \
     .setInputCol('t_text') \
     .setOutputCol('document')
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(eng_stopwords)
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [29]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           stopwords_cleaner,
           finisher
     ])

In [30]:
tweets = pipeline.fit(tweets).transform(tweets)

In [31]:
tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

## Basic Classifier: contains the word concert

In [32]:
concert_tweets = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))
concert_tweets = concert_tweets.filter(concert_tweets['concert'] == 'true')

In [33]:
concert_tweets.select('t_text').take(3)

[Row(t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k"),
 Row(t_text='Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.'),
 Row(t_text="@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?")]

In [34]:
# concert_tweets.count()

print(12477)

12477


In [35]:
concert_tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma',
 'concert']

In [36]:
concert_tweets.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?                                                                |
|RT @BoomKack: Janet was at Lady Gaga concert tonight she is everything!!!!!! Can't touch her!                                               |

## Basic Classifier: contains the word concert or similar words

In [37]:
concert_plus = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))\
                     .withColumn('tour', F.array_contains('finished_clean_lemma', 'tour'))\
                     .withColumn('gig', F.array_contains('finished_clean_lemma', 'gig'))\
                     .withColumn('show', F.array_contains('finished_clean_lemma', 'show'))
concert_plus = concert_plus.withColumn('concert_like', col('concert')|col('tour')|col('gig'))
concert_plus = concert_plus.filter(concert_plus.concert_like == True)

In [38]:
concert_plus.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@Lauralu2u yeps I had curve than the tour.   Love my Droid                                                                                  |
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@joeymcintyre You've got to be a LITTLE bit silly on tour or you wouldn't be YOU! ;)                                                        |

Looking at this super small sample, it doesn't seem like these alternate words are adding a lot to our classifier.

#### Future: maybe combination of show/tour/gig and musician/group name in addition to the concert

Since we don't have labeled data, and I'm not sure the best technique for clustering text data in this situation. Or how we would evaluate which techniqes are doing the best job identifying our concert tweets, and whether they are worth the extra complexity/computational requirements.

For now, I'm going to move on using the "concert" lemma classifier

In [39]:
df = concert_tweets.select('user_id', 't_text', 't_dt')

In [40]:
df.cache()

user_id,t_text,t_dt
85691996,@herRoyalStarnes ...,2010-01-22 10:17:15
85691996,Y is me @RandiICa...,2010-01-15 16:22:28
25611870,@beccalexis sup B...,2010-01-30 00:00:00
25611870,RT @BoomKack: Jan...,2010-01-24 00:00:00
30387809,Concert tonight a...,2009-12-09 15:06:12
71702459,They Played #FLEX...,2010-02-22 19:59:44
71702459,My First Concert....,2010-02-22 19:26:40
71702459,In The Library Wi...,2010-02-22 11:32:56
49483366,@RockStarRenRen l...,2009-07-30 11:56:06
28528232,Sooo go b4 u wet ...,2010-01-13 16:37:44


In [41]:
# df.count()
print(12444)

12444


In [42]:
df.show(1)

+--------+--------------------+-------------------+
| user_id|              t_text|               t_dt|
+--------+--------------------+-------------------+
|85691996|@herRoyalStarnes ...|2010-01-22 10:17:15|
+--------+--------------------+-------------------+
only showing top 1 row



In [43]:
sample = df.sample(withReplacement=None, fraction=0.01, seed=5)

In [44]:
sample = sample.withColumnRenamed('t_text', 'text')

In [45]:
sample.columns

['user_id', 'text', 't_dt']

In [46]:
sample.take(20)

[Row(user_id=71702459, text='In The Library With @NickAustinG... He Tryin To Get On His Jigga Shit... Oh We Goin To The Concert Tonite', t_dt=datetime.datetime(2010, 2, 22, 11, 32, 56)),
 Row(user_id=49483366, text='@RockStarRenRen lol is we going to this concert', t_dt=datetime.datetime(2009, 7, 30, 11, 56, 6)),
 Row(user_id=19688989, text='@bizymare   My son lives in Kenosha, I"m down there about a dozen times a year.    The concert was for the Kenosha area home schoolers.', t_dt=datetime.datetime(2009, 12, 11, 22, 16, 24)),
 Row(user_id=20019157, text='@itSHOWTIME how was the concert?', t_dt=datetime.datetime(2010, 1, 19, 0, 0)),
 Row(user_id=49477598, text="@audiobebop what time? I'm suppose to go to a concert tonight with Juda. uhh", t_dt=datetime.datetime(2010, 1, 15, 9, 26, 7)),
 Row(user_id=33814590, text='@MizzDania How was the concert?', t_dt=datetime.datetime(2010, 1, 22, 0, 0)),
 Row(user_id=29299184, text='The A was poppin @S_C_ @SongzYuuup and jeezy held the concert down.

## Entity Recognition

### Who

For the sake of time, I focused on pop and hip hop artists from 2009/2010 (data from wikipedia). This is extra tricky when tweeters use the artist handles (eg @JonasBrothers), again this is an area for future iteration

In [47]:
# import artist list
with open('../../data/musicians.txt', 'r') as f:
     artists = f.read().splitlines()
        
artists = list(set(artists))

In [48]:
pipeline = PretrainedPipeline("explain_document_dl", lang="en")

annotation = pipeline.transform(sample)

annotation.show(1)

explain_document_dl download started this may take some time.
Approx size to download 167.3 MB
[OK!]
+--------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| user_id|                text|               t_dt|            document|            sentence|               token|             checked|               lemma|                stem|                 pos|          embeddings|                 ner|            entities|
+--------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|71702459|In The Library Wi...|2010-02-22 11:32:56|[[document, 0, 10...|[[document, 0, 34...|[[token, 0, 1, In...

In [49]:
annotation.select("entities.result").show(20, truncate=False)

+-------------------------------------------------------+
|result                                                 |
+-------------------------------------------------------+
|[The Library With @NickAustinG, The Concert Tonite]    |
|[@RockStarRenRen]                                      |
|[Kenosha, I"m, Kenosha]                                |
|[]                                                     |
|[Juda]                                                 |
|[@MizzDania]                                           |
|[@S_C_ @SongzYuuup]                                    |
|[]                                                     |
|[I'm, Chile, Haiti, USA]                               |
|[Fred, Radio One, PAJAM]                               |
|[Decemberists]                                         |
|[]                                                     |
|[&]                                                    |
|[@JonasBrothers +, !!!!!*******]                       |
|[Lem, Chrissy

In [50]:
artists[0:5]

['MF DOOM',
 'Joe Budden',
 'Project Pat',
 'Jeremih',
 'Lil Jon',
 'Katy Perry',
 'N.O.R.E.',
 'Souls of Mischief',
 'Akon',
 'The Main',
 'Redman',
 'Famous Playaz',
 'Pink',
 'Mack 10',
 'Stevie Stone',
 'Cali Swag District',
 "Ol' Dirty Bastard",
 'Livestock',
 'Soap Nation',
 'Soul Assassins',
 'Young Money',
 'U-God',
 'Eminem',
 'Crunk Chris',
 'Kottonmouth Kings',
 'Soulja Boy',
 'David Guetta',
 'Method Man & Redman',
 'Nicki Minaj',
 'The Alchemist',
 'Dead Prez',
 'DJ Drama',
 'Drake',
 'Tyga',
 'B.o.B',
 'Kris Allen',
 'Chico DeBarge',
 'Linkin Park',
 'Kesha',
 'Haystak',
 'Bruno Mars',
 'Justin Bieber',
 'Skull Gang',
 'Ray J',
 'DJ Paul',
 'Busta Rhymes',
 'Sean Kingston',
 'Insane Clown Posse',
 'Skyzoo',
 'Shinedown',
 'Owl City',
 'Kings of Leon',
 'Timbaland',
 'Maroon 5',
 'Del the Funky Homosapien',
 'Obie Trice',
 'Playaz Circle',
 'Beyoncé',
 'Maino',
 'Webstar',
 'Eminem featuring Rihanna',
 'Juvenile',
 'Taio Cruz',
 'Lil Wyte',
 'Lady Antebellum',
 'Birdman',


In [51]:
# ugh. pyspark.sql.functions.typedLit doesn't exist yet in pyspark to pass the artists to a udf. 
# So I'm going to switch to pandas for this step

entities_pd = annotation.select('entities.result', 'text').toPandas()

In [52]:
entities_pd['who'] = [[entity for entity in e_list if entity in artists] for e_list in entities_pd['result']]

In [53]:
entities_pd.head(20)

Unnamed: 0,result,text,who
0,"[The Library With @NickAustinG, The Concert To...",In The Library With @NickAustinG... He Tryin T...,[]
1,[@RockStarRenRen],@RockStarRenRen lol is we going to this concert,[]
2,"[Kenosha, I""m, Kenosha]","@bizymare My son lives in Kenosha, I""m down ...",[]
3,[],@itSHOWTIME how was the concert?,[]
4,[Juda],@audiobebop what time? I'm suppose to go to a ...,[]
5,[@MizzDania],@MizzDania How was the concert?,[]
6,[@S_C_ @SongzYuuup],The A was poppin @S_C_ @SongzYuuup and jeezy h...,[]
7,[],@melodyxxx LOL u ladies go hard! Are u guys go...,[]
8,"[I'm, Chile, Haiti, USA]","Look, I'm sorry about Chile and Haiti, but for...",[]
9,"[Fred, Radio One, PAJAM]",U r! had to get Fred cause he was doing a Rad...,[]


In [54]:
who_schema = StructType([
                StructField("result", ArrayType(StringType()), True),
                StructField("text", StringType(), True),
                StructField("who", ArrayType(StringType()), True)
                ])

who = spark.createDataFrame(entities_pd, schema=who_schema)

In [55]:
who.select('*').take(10)

[Row(result=['The Library With @NickAustinG', 'The Concert Tonite'], text='In The Library With @NickAustinG... He Tryin To Get On His Jigga Shit... Oh We Goin To The Concert Tonite', who=[]),
 Row(result=['@RockStarRenRen'], text='@RockStarRenRen lol is we going to this concert', who=[]),
 Row(result=['Kenosha', 'I"m', 'Kenosha'], text='@bizymare   My son lives in Kenosha, I"m down there about a dozen times a year.    The concert was for the Kenosha area home schoolers.', who=[]),
 Row(result=[], text='@itSHOWTIME how was the concert?', who=[]),
 Row(result=['Juda'], text="@audiobebop what time? I'm suppose to go to a concert tonight with Juda. uhh", who=[]),
 Row(result=['@MizzDania'], text='@MizzDania How was the concert?', who=[]),
 Row(result=['@S_C_ @SongzYuuup'], text='The A was poppin @S_C_ @SongzYuuup and jeezy held the concert down.. Back in tally 2 papers and 2 midterms due tues. # backtoboredom', who=[]),
 Row(result=[], text='@melodyxxx LOL u ladies go hard! Are u guys goin

In [56]:
sample = sample.join(who, on='text', how='outer')

In [57]:
sample = sample.drop('result')

In [58]:
sample.show(40, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+--------+-------------------+------------+
|text                                                                                                                                        |user_id |t_dt               |who         |
+--------------------------------------------------------------------------------------------------------------------------------------------+--------+-------------------+------------+
|For free - add your event to our calendar! http://bit.ly/T3lbr #seattle #music #events #event #bands #concerts #calendar #blog              |80949495|2009-11-02 15:10:01|[]          |
|Jason Mraz was terrific at Red Rocks Sat night.  It's a truly unique & amazing concert venue.  @http://twitpic.com/ilfxr                    |42220821|2009-09-21 15:27:57|[Jason Mraz]|
|My Daughter&#39;s First Orchestra Concert | Opensource, Nonprofits ... htt

### WHEN: looking for date-related words

In [59]:
# I'm going to start with the date matcher pretrained pipeline

date_pipe = PretrainedPipeline("match_datetime", lang="en")

date_annotation = date_pipe.transform(sample)

match_datetime download started this may take some time.
Approx size to download 12.8 KB
[OK!]


In [60]:
date_annotation.columns

['text', 'user_id', 't_dt', 'who', 'document', 'sentence', 'token', 'date']

In [61]:
date_annotation.select('text', 't_dt', 'date.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|text                                                                                                                                        |t_dt               |result      |
+--------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|For free - add your event to our calendar! http://bit.ly/T3lbr #seattle #music #events #event #bands #concerts #calendar #blog              |2009-11-02 15:10:01|[]          |
|Jason Mraz was terrific at Red Rocks Sat night.  It's a truly unique & amazing concert venue.  @http://twitpic.com/ilfxr                    |2009-09-21 15:27:57|[]          |
|My Daughter&#39;s First Orchestra Concert | Opensource, Nonprofits ... http://bit.ly/1mcbz8                            

This is cool! It is using day-oriented words, like yesterday! I wonder if there is a way to set a reference date (as opposed to today). At least for the "Radio One concert" tweet... Doesn't look like there is, but I can use the date it outputs, get their relation with today, and apply to the date.

I'm not sure how it got 12/06 from the "Decemberists concert tonight" tweet. - maybe december + the 6 hours later?

In [62]:
date_annotation = date_annotation.select('text', F.col('date.result').alias('date_result'))

In [63]:
date_annotation.select('date_result').collect()

[Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/09/08']),
 Row(date_result=['2020/05/20']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/11/27']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/05/19']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/11/03']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/05/19']),
 Row(date_result=['2020/03/09']),
 Row(date_result=['2020/05/20']),
 Row(date_result=[]),
 Row(date_result=['2020/06/19']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=['2020/05/19']),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),
 Row(date_result=[]),


In [66]:
date_annotation.select(F.size("date_result").alias("no_of_dates")).agg({"no_of_dates": "max"}).show()

+----------------+
|max(no_of_dates)|
+----------------+
|               1|
+----------------+



I'm deciding to take the first date, since in my small sample, no tweet had more than one.

In [67]:
date_annotation = date_annotation.withColumn('date_result', F.col('date_result')[0])

In [69]:
sample = sample.join(date_annotation, on='text')

In [71]:
sample = sample.withColumn('date_result', F.to_date(sample['date_result'],'yyyy/MM/dd'))

In [72]:
sample = sample.withColumn('date_diff', F.datediff(F.current_timestamp(), sample['date_result']))

In [73]:
# if date_result is within two weeks of today, get difference, and apply it to timestamp
# elif date_result has this year's date. reset the year to match the year of the tweet 
# (hardcoeded as 10 years)

sample = sample.withColumn('when', F.when((col('date_diff') > -14),
                                      F.expr("date_add(t_dt, date_diff)"))\
                          .when((F.col('date_diff') < -14) 
                                & (F.year('date_result') == F.year(F.current_timestamp())), 
                                F.date_sub('date_result', 3652))
            )

In [74]:
sample.select('*').show(4)

+--------------------+--------+-------------------+------------+-----------+---------+----------+
|                text| user_id|               t_dt|         who|date_result|date_diff|      when|
+--------------------+--------+-------------------+------------+-----------+---------+----------+
|For free - add yo...|80949495|2009-11-02 15:10:01|          []|       null|     null|      null|
|Jason Mraz was te...|42220821|2009-09-21 15:27:57|[Jason Mraz]|       null|     null|      null|
|My Daughter&#39;s...|18518302|2009-11-18 17:20:16|          []| 2020-09-08|     -112|2010-09-09|
|I have 2 tickets ...|21316433|2009-09-14 20:09:00|[The Sounds]| 2020-05-20|       -1|2009-09-13|
+--------------------+--------+-------------------+------------+-----------+---------+----------+
only showing top 4 rows



In [75]:
sample = sample.drop('date_result', 'date_diff')

In [76]:
sample.show(20)

+--------------------+--------+-------------------+------------+----------+
|                text| user_id|               t_dt|         who|      when|
+--------------------+--------+-------------------+------------+----------+
|For free - add yo...|80949495|2009-11-02 15:10:01|          []|      null|
|Jason Mraz was te...|42220821|2009-09-21 15:27:57|[Jason Mraz]|      null|
|My Daughter&#39;s...|18518302|2009-11-18 17:20:16|          []|2010-09-09|
|I have 2 tickets ...|21316433|2009-09-14 20:09:00|[The Sounds]|2009-09-13|
|music news: Washi...|12135162|2009-11-16 08:37:38|          []|      null|
|Headed over to Bi...|38184021|2009-08-22 11:01:16|          []|      null|
|Zach Deputy = Ama...|13445142|2009-08-29 10:47:30|          []|      null|
|@Stony419 Whoops ...|18991162|2009-11-17 10:42:51|          []|      null|
|@q100Brittany OMG...|73886845|2009-11-13 13:10:54|          []|      null|
|*dizzy* RT @jeske...|17494046|2009-11-05 21:56:26|          []|      null|
|Fri Nov 27 

#### Future: update "when" to have a non-hard-coded version of setting the year.

## WHERE

I'm back to entitiy recognition. Going to try out some of the other Spark-NLP options.

In [77]:
# pipe_entity_rec2 = PretrainedPipeline("recognize_entities_bert", lang="en")
# annotation_entity2 = pipe_entity_rec2.transform(sample)
# annotation_entity2.select('entities.result').show(20, truncate=False)

# Oh no, this pretrained pipeline seems to be broken:

# """IllegalArgumentException: 'requirement failed: Wrong or missing inputCols annotators in BERT_EMBEDDINGS_2f121e7fb129. 
# Received inputCols: sentence. Make sure such annotators exist in your pipeline, with the right output names and that 
# they have following annotator types: document, token"""

In [154]:
pipe_entity_rec4 = PretrainedPipeline("onto_recognize_entities_lg", lang="en")
annotation_entity4 = pipe_entity_rec4.transform(sample)

onto_recognize_entities_lg download started this may take some time.
Approx size to download 2.3 GB
[OK!]


Each has slight differences. None of them seem to be far superior. 

Looking at the NER of the first entries, I'm decided to use Onto_Lg.

In [159]:
# look at ner and tokens together. I'll use any 'FAC', 'GEP' or 'LOC' NER tags as the location.
annotation_entity4.select('ner.result', 'token.result').show(20, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                 |result                                                                                                                                                                      |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------

In [162]:
loc_pd = annotation_entity4.select('text', 
                                   F.col('ner.result').alias('ner'), 
                                   F.col('token.result').alias('token'))\
                            .toPandas()

In [168]:
target_ners = ['FAC', 'GPE', 'LOC']

In [176]:
locations = []
for ners, tokens in zip(loc_pd['ner'], loc_pd['token']):
    location = []
    for ner, token in zip(ners, tokens):
        if any(target_ner in ner for target_ner in target_ners):
            location.append(token)
    location = ' '.join(location)
    locations.append(location)

In [177]:
loc_pd['location'] = locations

In [179]:
loc_pd.head(10)

Unnamed: 0,text,ner,token,location
0,For free - add your event to our calendar! htt...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[For, free, -, add, your, event, to, our, cale...",
1,Jason Mraz was terrific at Red Rocks Sat night...,"[B-PERSON, I-PERSON, O, O, O, B-LOC, I-LOC, B-...","[Jason, Mraz, was, terrific, at, Red, Rocks, S...",Red Rocks
2,My Daughter&#39;s First Orchestra Concert | Op...,"[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG, I-ORG, O...","[My, Daughter&#39;, s, First, Orchestra, Conce...",
3,I have 2 tickets to The Sounds concert tomorro...,"[O, O, B-CARDINAL, O, O, O, O, O, B-TIME, I-TI...","[I, have, 2, tickets, to, The, Sounds, concert...",the House of Blues
4,music news: Washington 'Ring' cycle ends with ...,"[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O]","[music, news, :, Washington, ', Ring, ', cycle...",
5,Headed over to Billy Bobs for the Diva Concert...,"[O, O, O, B-PERSON, I-PERSON, O, B-WORK_OF_ART...","[Headed, over, to, Billy, Bobs, for, the, Diva...",
6,Zach Deputy = Amazing. Love just pours outta t...,"[B-PERSON, O, O, O, O, O, O, O, O, O, O, O, O,...","[Zach, Deputy, =, Amazing, ., Love, just, pour...",
7,@Stony419 Whoops missed your tweet last night....,"[O, O, O, O, O, O, B-TIME, I-TIME, O, B-ORG, I...","[@, Stony419, Whoops, missed, your, tweet, las...",
8,@q100Brittany OMG if i win tickets it would be...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, q100Brittany, OMG, if, i, win, tickets, it...",
9,*dizzy* RT @jeskeets: MTV erects wall to block...,"[O, O, O, O, O, O, O, B-ORG, O, O, O, O, O, O,...","[*, dizzy, *, RT, @, jeskeets, :, MTV, erects,...",Berlin


In [180]:
loc_schema = StructType([
                StructField("text", StringType(), True),
                StructField("ner", ArrayType(StringType()), True),
                StructField("token", ArrayType(StringType()), True),
                StructField("location", StringType(), True)
                ])

In [182]:
loc_spk = spark.createDataFrame(loc_pd, schema=loc_schema)

In [None]:
sample = sample.join(loc_spk, on='text')

In [None]:
sam

## Sentiment

I'm curious about the differences in results from some of the different sentiment algorithms, but for now, we'll just go with the twitter-based sentiment analysis pretrained pipeline.

In [84]:
pipe_sentiment = PretrainedPipeline("analyze_sentiment", lang="en")
annotation_sentiment = pipe_sentiment.transform(sample)
annotation_sentiment.columns

# future note: "analyze_sentimentdl_use_twitter" --> Can not find the model to download please check the name!

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


['text',
 'user_id',
 't_dt',
 'who',
 'when',
 'document',
 'sentence',
 'token',
 'checked',
 'sentiment']

In [113]:
annotation_sentiment.select('text', 'sentiment.result').show(10, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
|text                                                                                                                                      |result                                            |
+------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------+
|For free - add your event to our calendar! http://bit.ly/T3lbr #seattle #music #events #event #bands #concerts #calendar #blog            |[positive, positive]                              |
|Jason Mraz was terrific at Red Rocks Sat night.  It's a truly unique & amazing concert venue.  @http://twitpic.com/ilfxr                  |[positive, positive, na]                          |
|My Daughter&#39;s First Orchestra Conce

This sentiment analysis seems like it is not doing a great job with these tweets (a lot of negative). I wish the twitter-trained one was working! But I'll continue.

Since the sentiment analysis returns a list, I'll switch to pandas to calculate the balance of the sentiments.

In [104]:
sentiment_pd = annotation_sentiment.select('text','sentiment.result').toPandas()

In [114]:
sentiment_pd['sentiment'] = [sum([1 if s == 'positive' else -1 if s == 'negative' else 0 for s in s_list]) for s_list in sentiment_pd['result']]

In [115]:
sentiment_pd.head()

Unnamed: 0,text,result,sentiment
0,For free - add your event to our calendar! htt...,"[positive, positive]",2
1,Jason Mraz was terrific at Red Rocks Sat night...,"[positive, positive, na]",2
2,My Daughter&#39;s First Orchestra Concert | Op...,"[negative, negative, negative, negative, na]",-4
3,I have 2 tickets to The Sounds concert tomorro...,"[positive, positive, positive]",3
4,music news: Washington 'Ring' cycle ends with ...,[negative],-1


In [118]:
sentiment_schema = StructType([
                StructField("text", StringType(), True),
                StructField("result", ArrayType(StringType()), True),
                StructField("sentiment", IntegerType(), True)
                ])

sentiment = spark.createDataFrame(sentiment_pd, schema=sentiment_schema)

In [119]:
sample = sample.join(sentiment.select('text', 'sentiment'), on='text', how='left')

In [123]:
sample = sample.withColumn('sentiment', F.when(F.col('sentiment') > 0, 'positive')\
                                 .when(F.col('sentiment') == 0, 'neutral')\
                                 .when(F.col('sentiment') < 0, 'negative'))

In [124]:
sample.show(5)

+--------------------+--------+-------------------+------------+----------+---------+
|                text| user_id|               t_dt|         who|      when|sentiment|
+--------------------+--------+-------------------+------------+----------+---------+
|For free - add yo...|80949495|2009-11-02 15:10:01|          []|      null| positive|
|Jason Mraz was te...|42220821|2009-09-21 15:27:57|[Jason Mraz]|      null| positive|
|My Daughter&#39;s...|18518302|2009-11-18 17:20:16|          []|2010-09-09| negative|
|I have 2 tickets ...|21316433|2009-09-14 20:09:00|[The Sounds]|2009-09-13| positive|
|music news: Washi...|12135162|2009-11-16 08:37:38|          []|      null| negative|
+--------------------+--------+-------------------+------------+----------+---------+
only showing top 5 rows



## Audience

Short cut. If tweet has I - then the twitter user.

**Future: perhaps use the NER to determine the subject (but not the performer) or POS?

Unfortunately, my attempt to use the POS tagger didn't work for today.

In [133]:
sample_pos = sample.withColumnRenamed('text', 't_text')

In [134]:
# pos_tagger = PerceptronApproach() \
#     .setInputCols(["token", "document"]) \
#     .setOutputCol("pos") \
#     .setNIterations(5)\
#     .fit() # I'm not sure where to get the training data set for this....

# finisher = finisher = Finisher() \
#      .setInputCols(['pos']) \
#      .setCleanAnnotations(True)

In [135]:
# pipe = Pipeline()\
#                .setStages([
#                     documentAssembler,
#                     tokenizer,
#                     pos_tagger,
#                     finisher
#                 ])

In [136]:
# pipe.transform(sample_pos)

So going with a simple solution:

In [138]:
finisher = finisher = Finisher() \
     .setInputCols(['lemma']) \
     .setCleanAnnotations(False)

pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           finisher
     ])

In [139]:
sample_lemma = sample.withColumnRenamed('text', 't_text')
sample_lemma = pipeline.fit(sample_lemma).transform(sample_lemma)

In [140]:
sample_lemma.select('finished_lemma').show(10, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------+
|finished_lemma                                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
|[for, free, add, you, event, to, we, calendar, httpbitlytlbr, seattle, music, event, event, band, concert, calendar, blog]                        |
|[jason, mraz, be, terrific, at, red, rock, sit, night, it, a, truly, unique, amazing, concert, venue, httptwitpiccomilfxr]                        |
|[i, daughter, first, orchestra, concert, opensource, nonprofits, httpbitlymcbz]                                                                   |
|[i, have, ticket, to, the, sound, concert, tomorrow, night, at, the, house, of, blue, for, both, let, i, 

In [141]:
sample_lemma = sample_lemma.withColumn('audience', F.when(F.array_contains(sample_lemma['finished_lemma'],'i'), sample_lemma['user_id'])\
                                 .when(F.array_contains(sample_lemma['finished_lemma'], 'we'), sample_lemma['user_id']))

In [148]:
sample = sample.join(sample_lemma.select(F.col('t_text').alias('text'), 'audience'), on='text', how='inner')

In [149]:
sample.show(10)

+--------------------+--------+-------------------+------------+----------+---------+--------+
|                text| user_id|               t_dt|         who|      when|sentiment|audience|
+--------------------+--------+-------------------+------------+----------+---------+--------+
|For free - add yo...|80949495|2009-11-02 15:10:01|          []|      null| positive|80949495|
|Jason Mraz was te...|42220821|2009-09-21 15:27:57|[Jason Mraz]|      null| positive|    null|
|My Daughter&#39;s...|18518302|2009-11-18 17:20:16|          []|2010-09-09| negative|18518302|
|I have 2 tickets ...|21316433|2009-09-14 20:09:00|[The Sounds]|2009-09-13| positive|21316433|
|music news: Washi...|12135162|2009-11-16 08:37:38|          []|      null| negative|    null|
|Headed over to Bi...|38184021|2009-08-22 11:01:16|          []|      null| negative|    null|
|Zach Deputy = Ama...|13445142|2009-08-29 10:47:30|          []|      null| negative|    null|
|@Stony419 Whoops ...|18991162|2009-11-17 10:42:51

## Last Cleaning