# Concert Tweet Classifier

## Import Necesary Packages

In [1]:
import sparknlp
from pyspark.sql.types import *
from pyspark.sql.functions import count, when, col
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner, PerceptronApproach)
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
import pyspark.sql.functions as F
import pandas as pd
from sparknlp.pretrained import PretrainedPipeline

## Start the spark-NLP session

In [2]:
spark = sparknlp.start()

In [3]:
spark.sparkContext.defaultParallelism

4

In [4]:
# adjust show output format to pandas-like
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# enable pyarrow for toPandas
spark.conf.set("spark.sql.execution.arrow.enabled", True)

## Read the data

Note, I have saved the clean data to parquet and commented out the preprocessing/data cleaning steps to save time when re-running the notebook.

In the first go, there were 25k rows of null - where the schema did not match the data. I decided to do some quick cleaning.

In [5]:
def remove_extra_seps(in_file, out_file, sep):
    """removes newline characters that come before the line reaches four segments(3 separators)
    and combines "middle sections" with extra separators into a single segment by removing the separators.
    
    Args:
        in_file: path to read file
        out_file: path to write file
        sep: separator/delimitor
    """
    n_chunks = 4
    
    with open(in_file, 'r') as rf:
        with open(out_file, 'w') as wf:
            while True:
                line = rf.readline()
                
                # if end of file
                if line == '':
                    break
                    
                # if line has less than n_sep, strip the newline and add the next line
                if len(line.split(sep)) < n_chunks:
                    line = line.strip('\n')
                    line += rf.readline()
                
                wf.write(line)

In [6]:
# remove_extra_seps('../../data/test_set_tweets.txt',
#                       '../../data/test_set_tweets_clean.txt',
#                      '\t')
# remove_extra_seps('../../data/training_set_tweets.txt',
#                       '../../data/training_set_tweets_clean.txt',
#                      '\t')

In [7]:
# set the schema
tweet_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("t_id", StringType(), True),
    StructField("t_text", StringType(), True),
    StructField("t_dt", TimestampType(), True)
    ])

In [8]:
# tweets_test = spark.read.csv('../../data/test_set_tweets_clean.txt', 
#                               sep='\t',
#                               schema=tweet_schema,
#                               header="false")

In [9]:
# tweets_training = spark.read.csv('../../data/training_set_tweets_clean.txt', 
#                                  sep="\t", 
#                                  schema=tweet_schema,
#                                  header='false')

Since our data is unlabeled for our task, these test/train splits are not particularly useful, but a vestige of the original data set and purpose. We'll combine them.

In [10]:
# tweets = tweets_test.union(tweets_training)

#### Future: Consider reading the data as a single column and then parsing. Compare outcome / number of tweets retrieved to that with the csv reading

## Data Set

### Tweets

In [11]:
# tweets.select('*').show(5)

In [12]:
# tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
#         tweets.columns]).show()

print("""+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  33289|33179| 32631|53805|
+-------+-----+------+-----+""")

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  33289|33179| 32631|53805|
+-------+-----+------+-----+


In [13]:
# tweets.count()

print(8884863)

8884863


In [14]:
# tweets.distinct().count()
print(8850656)

8850656


It looks like the time stamp can be parsed from the end of the tweet text for many of these "null" datetimes.

In [15]:
# tweets.filter(col('t_dt').isNull()).take(5)

In [16]:
# when the datetime is null, take the last 19 characters of the tweet text as the datetime

# tweets = tweets.withColumn('datetime', 
#                            F.when(F.col('t_dt').isNull(), 
#                                   F.to_date(F.substring('t_text', -19, 19)))
#                            .otherwise(F.col('t_dt'))
#                           )

In [17]:
# when the datetime is null, remove the last characters (the tab and datetime) from the tweet text 

# tweets = tweets.withColumn('t_text', 
#                            F.when(F.col('t_dt').isNull(), 
#                                   F.expr('substring(t_text, 1, length(t_text)-20)'))
#                            .otherwise(F.col('t_text'))
#                            )

In [18]:
# tweets = tweets.withColumn('t_dt', F.col('datetime')).drop('datetime')

In [19]:
# save as parquet and reload

# tweets.write.parquet('../../data/tweets.parquet')
tweets = spark.read.parquet('../../data/tweets.parquet')

In [20]:
# tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
#         tweets.columns]).show()

print("""+-------+-----+------+-----+
        |user_id| t_id|t_text| t_dt|
        +-------+-----+------+-----+
        |  34555|34490| 34232|54671|
        +-------+-----+------+-----+
        """)

+-------+-----+------+-----+
        |user_id| t_id|t_text| t_dt|
        +-------+-----+------+-----+
        |  34555|34490| 34232|54671|
        +-------+-----+------+-----+
        


In [21]:
# tweets.count()

print(8884863)

8884863


In [22]:
# drop remaining rows with null values in the text column
tweets = tweets.dropna(how='any', subset=['t_text'])

In [23]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=25513575, t_id='10334442280', t_text='', t_dt=None),
 Row(user_id=25513575, t_id='10333612651', t_text='', t_dt=None),
 Row(user_id=16198727, t_id='6899029209', t_text='This vid cracked me up! haha I w', t_dt=None),
 Row(user_id=20106865, t_id='10362030419', t_text="Ladies and gentlemen... come and join me.  It'", t_dt=None),
 Row(user_id=20106865, t_id='10005503765', t_text='I am talking #Survivor RIGHT NOW in stickam', t_dt=None)]

Clearly I could do some more/better data engineering here, but for this exercise, I'm going to move on, dropping any records with null values or t_text with empty strings

In [24]:
tweets = tweets.dropna(how='any')

In [25]:
# remove empty string tweets from the data set
tweets = tweets.filter(~(tweets.t_text == ""))

In [26]:
# tweets.count()

print(8829912)

8829912


We still have 8.83 million from the original 8.88 million. I'll take it for today's exercise. I'm pretty sure some more/better data engineering could extract more tweets from our text file, but that's a challenge for another day.

# Concert tweets - Classifier

I am deciding to focus on english tweets for now. (may add spanish, others in the future based on presence in the data set).

In [27]:
eng_stopwords = stopwords.words('english')

setting up the pieces of my pipeline to extract text info from the tweets (we'll use a pretrained pipeline later)

In [28]:
documentAssembler = DocumentAssembler() \
     .setInputCol('t_text') \
     .setOutputCol('document')
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(eng_stopwords)
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [29]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           stopwords_cleaner,
           finisher
     ])

In [30]:
tweets = pipeline.fit(tweets).transform(tweets)

In [31]:
tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

## Basic Classifier: contains the word concert

In [32]:
concert_tweets = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))
concert_tweets = concert_tweets.filter(concert_tweets['concert'] == 'true')

In [33]:
concert_tweets.select('t_text').take(3)

[Row(t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k"),
 Row(t_text='Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.'),
 Row(t_text="@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?")]

In [34]:
# concert_tweets.count()

print(12477)

12477


In [35]:
concert_tweets.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?                                                                |
|RT @BoomKack: Janet was at Lady Gaga concert tonight she is everything!!!!!! Can't touch her!                                               |

These are looking pretty concert-oriented! I want to see if we could catch some more tweets with a more inclusive filter:

## Basic Classifier: contains the word concert or similar words

In [36]:
concert_plus = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))\
                     .withColumn('tour', F.array_contains('finished_clean_lemma', 'tour'))\
                     .withColumn('gig', F.array_contains('finished_clean_lemma', 'gig'))\
                     .withColumn('show', F.array_contains('finished_clean_lemma', 'show'))
concert_plus = concert_plus.withColumn('concert_like', col('concert')|col('tour')|col('gig'))
concert_plus = concert_plus.filter(concert_plus.concert_like == True)

In [37]:
concert_plus.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@Lauralu2u yeps I had curve than the tour.   Love my Droid                                                                                  |
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@joeymcintyre You've got to be a LITTLE bit silly on tour or you wouldn't be YOU! ;)                                                        |

Looking at this super small sample, it doesn't seem like these alternate words are adding a lot to our classifier.

#### Future: maybe combination of show/tour/gig and musician/group name in addition to the concert

Since we don't have labeled data, and I'm not sure the best technique for clustering text data in this situation. Or how we would evaluate which techniqes are doing the best job identifying our concert tweets, and whether they are worth the extra complexity/computational requirements.

For now, I'm going to move on using the "concert" lemma classifier

In [38]:
df = concert_tweets.select('user_id', 't_text', 't_dt')

In [40]:
# df.count()
print(12444)

12444


In [54]:
# df.take(1)

print("""[Row(user_id=85691996, t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free
concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k", t_dt=datetime.datetime(2010, 1, 22, 10, 17, 15))]"""

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-54-e29f211de91f>, line 4)

In [43]:
# rename t_text to text for use with pretrained Spark-NLP models
df = df.withColumnRenamed('t_text', 'text')

## Entity Recognition

### WHEN: looking for date-related words

#### Future: update "when" to have a non-hard-coded version of setting the year.

In [44]:
# date matcher pretrained pipeline

date_pipe = PretrainedPipeline("match_datetime", lang="en")

date_annotation = date_pipe.transform(df)

match_datetime download started this may take some time.
Approx size to download 12.8 KB
[OK!]


In [45]:
# check the result
date_annotation.select('text', 't_dt', 'date.result').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|text                                                                                                                                        |t_dt               |result      |
+--------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|2010-01-22 10:17:15|[]          |
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |2010-01-15 16:22:28|[]          |
|@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?                                           

This is cool! It is using day-oriented words, like yesterday! I wonder if there is a way to set a reference date (as opposed to today). At least for the "Radio One concert" tweet... Doesn't look like there is, but I can use the date it outputs, get their relation with today, and apply to the date.

I'm not sure how it got 12/06 from the "Decemberists concert tonight" tweet. - maybe december + the 6 hours later?

In [46]:
# rename date.result to date_result
date_annotation = date_annotation.select('text', F.col('date.result').alias('date_result'))

In [47]:
# unfortunately, I'm getting this error that I didn't get when coding on a smaller sample :(
# Py4JJavaError: An error occurred while calling o6798.collectToPython.
# : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 126.0 failed 1 times, 
# most recent failure: Lost task 1.0 in stage 126.0 (TID 1298, localhost, executor driver): 
# org.apache.spark.SparkException: Failed to execute user defined function($anonfun$dfAnnotate$1: 
# (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:
#  array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,
# embeddings:array<float>>>)

# I'm going to move on for today

# date_annotation.select('date_result').collect()

For the exercise's sake, I will continue with the date transformation from this 1% sample.

**Future: Investigate this error

In [48]:
# try with a sample of the dataframe
date_annotation = date_pipe.transform(df.sample(fraction=.01, seed=5))

# rename date.result to date_result
date_annotation = date_annotation.select('text', F.col('date.result').alias('date_result'))

# test for error
# date_annotation.select('date_result').collect()

In [49]:
# date_annotation.select(F.size("date_result").alias("no_of_dates")).agg({"no_of_dates": "max"}).show()

print("""max no of dates: 1""")

max no of dates: 1


I'm deciding to take the first date, since in my small sample, no tweet had more than one.

In [50]:
# get first date from list of dates
date_annotation = date_annotation.withColumn('date_result', F.col('date_result')[0])

# join the extracted date df with the original data frame
df = df.join(date_annotation, on='text', how='left')

# convert to dateType
df = df.withColumn('date_result', F.to_date(df['date_result'],'yyyy/MM/dd'))

# add a column with the difference in date between the date produced by the date extractor and today
df = df.withColumn('date_diff', F.datediff(F.current_timestamp(), df['date_result']))

# if date_result is within two weeks of today, get difference, and apply it to timestamp
# elif date_result has this year's date. reset the year to match the year of the tweet 
# (hardcoeded as 10 years)
df = df.withColumn('when', F.when((col('date_diff') > -14),
                                      F.expr("date_add(t_dt, date_diff)"))\
                          .when((F.col('date_diff') < -14) 
                                & (F.year('date_result') == F.year(F.current_timestamp())), 
                                F.date_sub('date_result', 3652))
            )

# drop the extra columns
df = df.drop('date_result', 'date_diff', 't_dt')

In [53]:
# convert to string for compatibility with pyarrow
df = df.withColumn('when', F.col('when').cast('string'))

# I'm having some errors with toPandas() so I'm going to convert to pandas in stages
date_df = df.toPandas()


# df.write.json('../../data/date_text.json')

## Other entities

Since we only have 12k records and pyspark doesn't support typedLit (passing arrays to udfs) yet, I'm going to collect the text information we need for the rest of the data extraction, and move to pandas.

In [55]:
# use pretrained pipeline for NER, Tokens
pipeline_entities = PretrainedPipeline("onto_recognize_entities_lg", lang="en")
annotation_entities = pipeline_entities.transform(df)

onto_recognize_entities_lg download started this may take some time.
Approx size to download 2.3 GB
[OK!]


In [56]:
# convert the needed columns to pandas
entities_df = annotation_entities.select(F.col('entities.result').alias('entities'),
                                            F.col('ner.result').alias('ners'),
                                            F.col('token.result').alias('tokens'),
                                           'text')\
                                .toPandas()

In [57]:
# use pretrained pipeline for sentiment extraction
pipe_sentiment = PretrainedPipeline("analyze_sentiment", lang="en")
annotation_sentiment = pipe_sentiment.transform(df)

# future note: "analyze_sentimentdl_use_twitter" --> Can not find the model to download please check the name!

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [None]:
annotation_sentiment.select('text', 'sentiment.result').show(10, truncate=False)

This sentiment analysis seems like it is not doing a great job with these tweets (a lot of negative). I wish the twitter-trained one was working! But I'll continue.

In [58]:
# add sentiments to df
# df = df.join(annotation_sentiment.select('text',
#                                         F.col('sentiment.result').alias('sentiments')),
#              on='text',
#              how='inner')

# convert sentiments to pandas
sentiments_df = annotation_sentiment.select('text',
                                            F.col('sentiment.result').alias('sentiments'))\
                                    .toPandas()

In [None]:
# end spark session
spark.stop()

## Combine dataframes in pandas

In [177]:
# confirm head and shape
print(date_df.shape)
date_df.head()

(12444, 3)


Unnamed: 0,text,user_id,when
0,@herRoyalStarnes I just thought of the history...,85691996,
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,
4,Concert tonight at the bellyup! The grouch& mr...,30387809,


In [178]:
print(sentiments_df.shape)
sentiments_df.head()

(12444, 2)


Unnamed: 0,text,sentiments
0,@herRoyalStarnes I just thought of the history...,"[positive, positive]"
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",[positive]
2,@beccalexis sup Bee? How'd the shoot go? Will ...,"[negative, negative, negative]"
3,RT @BoomKack: Janet was at Lady Gaga concert t...,"[negative, positive]"
4,Concert tonight at the bellyup! The grouch& mr...,"[negative, positive]"


In [179]:
print(entities_df.shape)
entities_df.head()

(12444, 4)


Unnamed: 0,entities,ners,tokens,text
0,[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the...",@herRoyalStarnes I just thought of the history...
1,"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi...","Y is me @RandiICandy, @EpitomeOfADiva, and Lei..."
2,[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho...",@beccalexis sup Bee? How'd the shoot go? Will ...
3,"[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag...",RT @BoomKack: Janet was at Lady Gaga concert t...
4,[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g...",Concert tonight at the bellyup! The grouch& mr...


In [180]:
df_pd = date_df.join(sentiments_df.drop(columns='text'))

In [181]:
df_pd = df_pd.join(entities_df.drop(columns='text'))

In [182]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens
0,@herRoyalStarnes I just thought of the history...,85691996,,"[positive, positive]",[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the..."
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,,[positive],"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi..."
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,,"[negative, negative, negative]",[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho..."
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,,"[negative, positive]","[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag..."
4,Concert tonight at the bellyup! The grouch& mr...,30387809,,"[negative, positive]",[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g..."


### Who

For the sake of time, I focused on pop and hip hop artists from 2009/2010 (data from wikipedia). This is extra tricky when tweeters use the artist handles (eg @JonasBrothers), again this is an area for future iteration

In [183]:
# import artist list
with open('../../data/musicians.txt', 'r') as f:
     artists = f.read().splitlines()
        
artists = list(set(artists))

In [184]:
artists[0:5]

['Lloyd', 'Michael Bublé', 'Beyoncé', 'Fat Joe', 'Travie McCoy']

In [185]:
# get 'who' with the intersection of the extracted entities from the tweets and my artist list
df_pd['who'] = [[entity for entity in e_list if entity in artists] for e_list in df_pd['entities']]

In [186]:
df_pd.loc[df_pd['who'].str.len() >0]

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who
24,"Jason Derulo- Helps @ BET's ""SOS: Help for Hai...",71429761,,[positive],[Jason Derulo],"[B-PERSON, I-PERSON, O, O, O, O, O, O, O, O, O...","[Jason, Derulo, -, Helps, @, BET, 's, "", SOS, ...",[Jason Derulo]
35,PAUSE! Now Drake is in concert? How? He doesn'...,46329494,,"[positive, negative, negative, negative]",[Drake],"[O, O, O, B-PERSON, O, O, O, O, O, O, O, O, O,...","[PAUSE, !, Now, Drake, is, in, concert, ?, How...",[Drake]
114,Boooom ! RT @AllThingsFresh: Drake just perfor...,62205707,,[negative],"[AllThingsFresh, Drake, ""Forever, Toronto]","[O, O, O, O, B-FAC, O, B-PERSON, O, O, B-WORK_...","[Boooom, !, RT, @, AllThingsFresh, :, Drake, j...",[Drake]
157,Sooo i'm in love with Jay-Z new cd i really wa...,53309244,,[positive],[Jay-Z],"[O, O, O, O, O, O, B-PERSON, O, O, O, O, O, O,...","[Sooo, i, 'm, in, love, with, Jay-Z, new, cd, ...",[Jay-Z]
168,RT @dlloydthemlmpro: World AIDS Day: Alicia Ke...,65392460,,"[positive, na]","[World AIDS Day, Alicia Keys]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[RT, @, dlloydthemlmpro, :, World, AIDS, Day, ...",[Alicia Keys]
...,...,...,...,...,...,...,...,...
12307,Watching NKOTB on youtube. In need of a concert,26341336,,"[negative, negative]",[NKOTB],"[O, B-ORG, O, O, O, O, O, O, O, O]","[Watching, NKOTB, on, youtube, ., In, need, of...",[NKOTB]
12360,Check out Alicia Keys streaming live concert o...,24575856,,[positive],[Alicia Keys],"[O, O, B-PERSON, I-PERSON, O, O, O, O, O, O, O...","[Check, out, Alicia, Keys, streaming, live, co...",[Alicia Keys]
12390,I was just thinkin.... what if at this Jay-Z c...,18546575,,"[negative, negative, negative, negative, negat...","[Jay-Z, a few weeks, #FTW]","[O, O, O, O, O, O, O, O, O, O, O, O, B-PERSON,...","[I, was, just, thinkin, ., ., ., ., what, if, ...",[Jay-Z]
12417,people getting real live pissed about the Drak...,69156796,,[negative],"[Drake, Drake]","[O, O, O, O, O, O, O, B-PERSON, O, O, O, B-PER...","[people, getting, real, live, pissed, about, t...","[Drake, Drake]"


In [187]:
# replace empty strings with null/None
df_pd['who'] = df_pd['who'].apply(lambda x: None if len(x)==0 else x)

## WHERE

In [188]:
# look at ner and tokens together. I'll use any 'FAC', 'GEP' or 'LOC' NER tags as the location.
df_pd[['ners', 'tokens']].head(20)

Unnamed: 0,ners,tokens
0,"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the..."
1,"[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi..."
2,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho..."
3,"[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag..."
4,"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g..."
5,"[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[They, Played, #FLEX, @, The, Jigga, Concert, ..."
6,"[O, B-ORDINAL, O, O, O, O, O, O, O, O, O, O, O...","[My, First, Concert, ., .., Then, I, 'm, seein..."
7,"[O, B-WORK_OF_ART, I-WORK_OF_ART, I-WORK_OF_AR...","[In, The, Library, With, @, NickAustinG, ., ....."
8,"[O, B-PERSON, O, O, O, O, O, O, O]","[@, RockStarRenRen, lol, is, we, going, to, th..."
9,"[O, O, O, O, O, O, O, O, O, O, B-PERSON, O, O,...","[Sooo, go, b4, u, wet, ur, self, @, ANGELicNES..."


In [189]:
target_ners = ['FAC', 'GPE', 'LOC']

In [190]:
# combining the tokens that are tagged with our target NERs into a cohesive location string
locations = []
for ners, tokens in zip(df_pd['ners'], df_pd['tokens']):
    location = []
    for ner, token in zip(ners, tokens):
        if any(target_ner in ner for target_ner in target_ners):
            location.append(token)
    location = " ".join(location)
    locations.append(location)

In [191]:
# adding our locations to the pandas dataframe
df_pd['where'] = locations

In [192]:
df_pd.head(10)

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where
0,@herRoyalStarnes I just thought of the history...,85691996,,"[positive, positive]",[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the...",,
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,,[positive],"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi...",,EpitomeOfADiva
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,,"[negative, negative, negative]",[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho...",,
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,,"[negative, positive]","[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag...",,BoomKack
4,Concert tonight at the bellyup! The grouch& mr...,30387809,,"[negative, positive]",[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g...",,
5,They Played #FLEX @ The Jigga Concert... And #...,71702459,,"[positive, negative]",[The Jigga Concert],"[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[They, Played, #FLEX, @, The, Jigga, Concert, ...",,
6,My First Concert... Then I'm seeing one of the...,71702459,,"[positive, negative]",[First],"[O, B-ORDINAL, O, O, O, O, O, O, O, O, O, O, O...","[My, First, Concert, ., .., Then, I, 'm, seein...",,
7,In The Library With @NickAustinG... He Tryin T...,71702459,,"[negative, negative, negative]",[The Library With @NickAustinG],"[O, B-WORK_OF_ART, I-WORK_OF_ART, I-WORK_OF_AR...","[In, The, Library, With, @, NickAustinG, ., .....",,
8,@RockStarRenRen lol is we going to this concert,49483366,,[positive],[RockStarRenRen],"[O, B-PERSON, O, O, O, O, O, O, O]","[@, RockStarRenRen, lol, is, we, going, to, th...",,
9,Sooo go b4 u wet ur self @ANGELicNES: Hhuuuuh...,28528232,,[positive],"[Hhuuuuhh, Jayz, SongzYuuup]","[O, O, O, O, O, O, O, O, O, O, B-PERSON, O, O,...","[Sooo, go, b4, u, wet, ur, self, @, ANGELicNES...",,SongzYuuup


## Sentiment

I'm curious about the differences in results from some of the different sentiment algorithms, but for now, we'll just go with the twitter-based sentiment analysis pretrained pipeline.

In [193]:
# combining the sentiment readings for each row: +1 for positive, -1 for negative, then sum
df_pd['sentiments'] = [sum([1 if s == 'positive' else -1 if s == 'negative' else 0 for s in s_list]) 
                             for s_list in df_pd['sentiments']]

In [194]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where
0,@herRoyalStarnes I just thought of the history...,85691996,,2,[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the...",,
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,,1,"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi...",,EpitomeOfADiva
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,,-3,[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho...",,
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,,0,"[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag...",,BoomKack
4,Concert tonight at the bellyup! The grouch& mr...,30387809,,0,[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g...",,


In [195]:
# convert sentiment numbers to strings
df_pd['sentiment'] = ['positive' if s > 0 else 'neutral' if s==0 else 'negative' for s in df_pd['sentiments']]

In [196]:
df_pd.head(5)

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where,sentiment
0,@herRoyalStarnes I just thought of the history...,85691996,,2,[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the...",,,positive
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,,1,"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi...",,EpitomeOfADiva,positive
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,,-3,[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho...",,,negative
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,,0,"[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag...",,BoomKack,neutral
4,Concert tonight at the bellyup! The grouch& mr...,30387809,,0,[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g...",,,neutral


### Audience

I'll add the audience column.

**Future: perhaps use the NER to determine the subject (but not the performer) or POS?

Unfortunately, my attempt to use the POS tagger didn't work for today.

I'm going with the basic solution - contains I or we, then the audience is the tweeter

In [197]:
# pos_tagger = PerceptronApproach() \
#     .setInputCols(["token", "document"]) \
#     .setOutputCol("pos") \
#     .setNIterations(5)\
#     .fit() # I'm not sure where to get the training data set for this....

# finisher = finisher = Finisher() \
#      .setInputCols(['pos']) \
#      .setCleanAnnotations(True)

In [198]:
# pipe = Pipeline()\
#                .setStages([
#                     documentAssembler,
#                     tokenizer,
#                     pos_tagger,
#                     finisher
#                 ])

In [199]:
# pipe.transform(df_pos)

**Future: perhaps use the NER to determine the subject (but not the performer) or POS?

Unfortunately, my attempt to use the POS tagger didn't work for today.


In [200]:
# pos_tagger = PerceptronApproach() \
#     .setInputCols(["token", "document"]) \
#     .setOutputCol("pos") \
#     .setNIterations(5)\
#     .fit() # I'm not sure where to get the training data set for this....

# finisher = finisher = Finisher() \
#      .setInputCols(['pos']) \
#      .setCleanAnnotations(True)

In [201]:
# pipe = Pipeline()\
#                .setStages([
#                     documentAssembler,
#                     tokenizer,
#                     pos_tagger,
#                     finisher
#                 ])

In [202]:
# pipe.transform(df_pos)

So going with a simple solution:

In [203]:
lower_tokens_list = []
for token_list in df_pd['tokens']:
    lower_tokens_list.append([token.lower() for token in token_list])

In [204]:
df_pd['audience'] = [u if ('i' in t or 'we' in t) 
                     else None 
                     for u, t in zip(df_pd['user_id'], lower_tokens_list)]

## Last Cleaning

In [205]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where,sentiment,audience
0,@herRoyalStarnes I just thought of the history...,85691996,,2,[bmw],"[O, O, O, O, O, O, O, O, O, O, B-PRODUCT, O, O...","[@, herRoyalStarnes, I, just, thought, of, the...",,,positive,85691996.0
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",85691996,,1,"[RandiICandy, EpitomeOfADiva, Leila Bunny, Mar...","[O, O, O, O, B-ORG, O, O, B-FAC, O, O, B-PERSO...","[Y, is, me, @, RandiICandy, ,, @, EpitomeOfADi...",,EpitomeOfADiva,positive,85691996.0
2,@beccalexis sup Bee? How'd the shoot go? Will ...,25611870,,-3,[tonight],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[@, beccalexis, sup, Bee, ?, How, 'd, the, sho...",,,negative,
3,RT @BoomKack: Janet was at Lady Gaga concert t...,25611870,,0,"[BoomKack, Janet, Gaga, tonight]","[O, O, B-FAC, O, B-PERSON, O, O, O, B-PERSON, ...","[RT, @, BoomKack, :, Janet, was, at, Lady, Gag...",,BoomKack,neutral,
4,Concert tonight at the bellyup! The grouch& mr...,30387809,,0,[tonight],"[O, B-TIME, O, O, O, O, O, O, O, O, O]","[Concert, tonight, at, the, bellyup, !, The, g...",,,neutral,


In [206]:
df_pd = df_pd[['text', 'who', 'when', 'where', 'audience', 'sentiment']]

In [210]:
df_pd.head(30)

Unnamed: 0,text,who,when,where,audience,sentiment
0,@herRoyalStarnes I just thought of the history...,,,,85691996.0,positive
1,"Y is me @RandiICandy, @EpitomeOfADiva, and Lei...",,,EpitomeOfADiva,85691996.0,positive
2,@beccalexis sup Bee? How'd the shoot go? Will ...,,,,,negative
3,RT @BoomKack: Janet was at Lady Gaga concert t...,,,BoomKack,,neutral
4,Concert tonight at the bellyup! The grouch& mr...,,,,,neutral
5,They Played #FLEX @ The Jigga Concert... And #...,,,,,neutral
6,My First Concert... Then I'm seeing one of the...,,,,71702459.0,neutral
7,In The Library With @NickAustinG... He Tryin T...,,,,71702459.0,negative
8,@RockStarRenRen lol is we going to this concert,,,,49483366.0,positive
9,Sooo go b4 u wet ur self @ANGELicNES: Hhuuuuh...,,,SongzYuuup,28528232.0,positive


I'd love to add more artists to my artist list, to make this more satisfying, and to figure out the issue with the date recognition and twitter sentiment detector. Another day.