# Concert Tweet Classifier

## Import Necesary Packages

In [1]:
import sparknlp
from pyspark.sql.types import *
from pyspark.sql.functions import count, when, col
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner, PerceptronApproach)
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
import pyspark.sql.functions as F
import pandas as pd
from sparknlp.pretrained import PretrainedPipeline

## Start the spark-NLP session

In [2]:
spark = sparknlp.start()

In [3]:
spark.sparkContext.defaultParallelism

4

In [4]:
# adjust show output format to pandas-like
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# enable pyarrow for toPandas
spark.conf.set("spark.sql.execution.arrow.enabled", True)

## Read the data

Note, I have saved the clean data to parquet and commented out the preprocessing/data cleaning steps to save time when re-running the notebook.

In the first go, there were 25k rows of null - where the schema did not match the data. I decided to do some quick cleaning.

In [5]:
def remove_extra_seps(in_file, out_file, sep):
    """removes newline characters that come before the line reaches four segments(3 separators)
    and combines "middle sections" with extra separators into a single segment by removing the separators.
    
    Args:
        in_file: path to read file
        out_file: path to write file
        sep: separator/delimitor
    """
    n_chunks = 4
    
    with open(in_file, 'r') as rf:
        with open(out_file, 'w') as wf:
            while True:
                line = rf.readline()
                
                # if end of file
                if line == '':
                    break
                    
                # if line has less than n_sep, strip the newline and add the next line
                if len(line.split(sep)) < n_chunks:
                    line = line.strip('\n')
                    line += rf.readline()
                
                wf.write(line)

In [6]:
# remove_extra_seps('../data/test_set_tweets.txt',
#                       '../data/test_set_tweets_clean.txt',
#                      '\t')
# remove_extra_seps('../data/training_set_tweets.txt',
#                       '../data/training_set_tweets_clean.txt',
#                      '\t')

In [7]:
# set the schema
tweet_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("t_id", StringType(), True),
    StructField("t_text", StringType(), True),
    StructField("t_dt", TimestampType(), True)
    ])

In [8]:
# tweets_test = spark.read.csv('../data/test_set_tweets_clean.txt', 
#                               sep='\t',
#                               schema=tweet_schema,
#                               header="false")

In [9]:
# tweets_training = spark.read.csv('../data/training_set_tweets_clean.txt', 
#                                  sep="\t", 
#                                  schema=tweet_schema,
#                                  header='false')

Since our data is unlabeled for our task, these test/train splits are not particularly useful, but a vestige of the original data set and purpose. We'll combine them.

In [10]:
# tweets = tweets_test.union(tweets_training)

#### Future: Consider reading the data as a single column and then parsing. Compare outcome / number of tweets retrieved to that with the csv reading

## Data Set

### Tweets

In [11]:
# tweets.select('*').show(5)

In [12]:
# tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
#         tweets.columns]).show()

print("""+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  33289|33179| 32631|53805|
+-------+-----+------+-----+""")

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  33289|33179| 32631|53805|
+-------+-----+------+-----+


In [13]:
# tweets.count()

print(8884863)

8884863


In [14]:
# tweets.distinct().count()
print(8850656)

8850656


It looks like the time stamp can be parsed from the end of the tweet text for many of these "null" datetimes.

In [15]:
# tweets.filter(col('t_dt').isNull()).take(5)

In [16]:
# when the datetime is null, take the last 19 characters of the tweet text as the datetime

# tweets = tweets.withColumn('datetime', 
#                            F.when(F.col('t_dt').isNull(), 
#                                   F.to_date(F.substring('t_text', -19, 19)))
#                            .otherwise(F.col('t_dt'))
#                           )

In [17]:
# when the datetime is null, remove the last characters (the tab and datetime) from the tweet text 

# tweets = tweets.withColumn('t_text', 
#                            F.when(F.col('t_dt').isNull(), 
#                                   F.expr('substring(t_text, 1, length(t_text)-20)'))
#                            .otherwise(F.col('t_text'))
#                            )

In [18]:
# tweets = tweets.withColumn('t_dt', F.col('datetime')).drop('datetime')

In [20]:
# save as parquet and reload

# tweets.write.parquet('../data/tweets.parquet')
tweets = spark.read.parquet('../data/tweets.parquet')

In [21]:
# tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
#         tweets.columns]).show()

print("""+-------+-----+------+-----+
        |user_id| t_id|t_text| t_dt|
        +-------+-----+------+-----+
        |  34555|34490| 34232|54671|
        +-------+-----+------+-----+
        """)

+-------+-----+------+-----+
        |user_id| t_id|t_text| t_dt|
        +-------+-----+------+-----+
        |  34555|34490| 34232|54671|
        +-------+-----+------+-----+
        


In [22]:
# tweets.count()

print(8884863)

8884863


In [23]:
# drop remaining rows with null values in the text column
tweets = tweets.dropna(how='any', subset=['t_text'])

In [24]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=25513575, t_id='10334442280', t_text='', t_dt=None),
 Row(user_id=25513575, t_id='10333612651', t_text='', t_dt=None),
 Row(user_id=16198727, t_id='6899029209', t_text='This vid cracked me up! haha I w', t_dt=None),
 Row(user_id=20106865, t_id='10362030419', t_text="Ladies and gentlemen... come and join me.  It'", t_dt=None),
 Row(user_id=20106865, t_id='10005503765', t_text='I am talking #Survivor RIGHT NOW in stickam', t_dt=None)]

Clearly I could do some more/better data engineering here, but for this exercise, I'm going to move on, dropping any records with null values or t_text with empty strings

In [25]:
tweets = tweets.dropna(how='any')

In [26]:
# remove empty string tweets from the data set
tweets = tweets.filter(~(tweets.t_text == ""))

In [27]:
# tweets.count()

print(8829912)

8829912


We still have 8.83 million from the original 8.88 million. I'll take it for today's exercise. I'm pretty sure some more/better data engineering could extract more tweets from our text file, but that's a challenge for another day.

# Concert tweets - Classifier

I am deciding to focus on english tweets for now. (may add spanish, others in the future based on presence in the data set).

In [28]:
eng_stopwords = stopwords.words('english')

setting up the pieces of my pipeline to extract text info from the tweets (we'll use a pretrained pipeline later)

In [29]:
documentAssembler = DocumentAssembler() \
     .setInputCol('t_text') \
     .setOutputCol('document')
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(eng_stopwords)
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [30]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           stopwords_cleaner,
           finisher
     ])

In [31]:
tweets = pipeline.fit(tweets).transform(tweets)

In [32]:
tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

## Basic Classifier: contains the word concert

In [33]:
concert_tweets = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))
concert_tweets = concert_tweets.filter(concert_tweets['concert'] == 'true')

In [34]:
concert_tweets.select('t_text').take(3)

[Row(t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k"),
 Row(t_text='Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.'),
 Row(t_text="@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?")]

In [35]:
# concert_tweets.count()

print(12477)

12477


In [36]:
concert_tweets.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?                                                                |
|RT @BoomKack: Janet was at Lady Gaga concert tonight she is everything!!!!!! Can't touch her!                                               |

These are looking pretty concert-oriented! I want to see if we could catch some more tweets with a more inclusive filter:

## Basic Classifier: contains the word concert or similar words

In [37]:
concert_plus = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))\
                     .withColumn('tour', F.array_contains('finished_clean_lemma', 'tour'))\
                     .withColumn('gig', F.array_contains('finished_clean_lemma', 'gig'))\
                     .withColumn('show', F.array_contains('finished_clean_lemma', 'show'))
concert_plus = concert_plus.withColumn('concert_like', col('concert')|col('tour')|col('gig'))
concert_plus = concert_plus.filter(concert_plus.concert_like == True)

In [38]:
concert_plus.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@Lauralu2u yeps I had curve than the tour.   Love my Droid                                                                                  |
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@joeymcintyre You've got to be a LITTLE bit silly on tour or you wouldn't be YOU! ;)                                                        |

Looking at this super small sample, it doesn't seem like these alternate words are adding a lot to our classifier.

#### Future: maybe combination of show/tour/gig and musician/group name in addition to the concert

Since we don't have labeled data, and I'm not sure the best technique for clustering text data in this situation. Or how we would evaluate which techniqes are doing the best job identifying our concert tweets, and whether they are worth the extra complexity/computational requirements.

For now, I'm going to move on using the "concert" lemma classifier

In [39]:
concert_tweets.select('user_id', 't_text', 't_dt').write.parquet('../data/concert_tweets.parquet')

In [40]:
df = spark.read.parquet('../data/concert_tweets.parquet')

In [41]:
df.count()
# print(12444)

12444


In [42]:
# df.take(1)

print("""[Row(user_id=85691996, t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free
concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k", t_dt=datetime.datetime(2010, 1, 22, 10, 17, 15))]""")

[Row(user_id=85691996, t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free
concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k", t_dt=datetime.datetime(2010, 1, 22, 10, 17, 15))]


In [43]:
# rename t_text to text for use with pretrained Spark-NLP models
df = df.withColumnRenamed('t_text', 'text')

## Entity Recognition

### WHEN: looking for date-related words

#### Future: update "when" to have a non-hard-coded version of setting the year.

In [44]:
# date matcher pretrained pipeline

date_pipe = PretrainedPipeline("match_datetime", lang="en")

date_annotation = date_pipe.transform(df)

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [45]:
# check the result
date_annotation.select('text', 't_dt', 'date.result').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|text                                                                                                                                      |t_dt               |result      |
+------------------------------------------------------------------------------------------------------------------------------------------+-------------------+------------+
|The Air Up There 11/28/2009 at Bob's Classic Kicks - Doors open at 9 concert at 10. $3 with kicks $5 without. http://bit.ly/kF6rG         |2009-11-19 10:22:31|[2009/11/28]|
|At Maxwell concert! Common on stage.                                                                                                      |2009-10-02 19:49:37|[]          |
|Heading out to film Cartel and The Summer Set! We will Twitpic all of you tweeters some live pictures from the concert tonight!  

This is cool! It is using day-oriented words, like yesterday! I wonder if there is a way to set a reference date (as opposed to today). At least for the "Radio One concert" tweet... Doesn't look like there is, but I can use the date it outputs, get their relation with today, and apply to the date.

I'm not sure how it got 12/06 from the "Decemberists concert tonight" tweet. - maybe december + the 6 hours later?

In [46]:
# rename date.result to date_result
date_annotation = date_annotation.select('text', F.col('date.result').alias('date_result'))

In [47]:
# unfortunately, I'm getting this error that I didn't get when coding on a smaller sample :(


# Py4JJavaError: An error occurred while calling o6798.collectToPython.
# : org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 126.0 failed 1 times, 
# most recent failure: Lost task 1.0 in stage 126.0 (TID 1298, localhost, executor driver): 
# org.apache.spark.SparkException: Failed to execute user defined function($anonfun$dfAnnotate$1: 
# (array<array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:
#  array<float>>>>) => array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,
# embeddings:array<float>>>)

# I'm going to move on for today

# date_annotation.select('date_result').collect()


For the exercise's sake, I will continue with the date transformation from this 1% sample.

**Future: Investigate this error

In [52]:
# try with a sample of the dataframe
date_annotation = date_pipe.transform(df.sample(fraction=.01, seed=5))

# rename date.result to date_result
date_annotation = date_annotation.select('text', F.col('date.result').alias('date_result'))

# test for error
# date_annotation.select('date_result').collect()

In [54]:
# date_annotation.select(F.size("date_result").alias("no_of_dates")).agg({"no_of_dates": "max"}).show()

print("""+----------------+
|max(no_of_dates)|
+----------------+
|               1|
+----------------+""")

+----------------+
|max(no_of_dates)|
+----------------+
|               1|
+----------------+


I'm deciding to take the first date, since in my small sample, no tweet had more than one.

In [55]:
# get first date from list of dates
date_annotation = date_annotation.withColumn('date_result', F.col('date_result')[0])

# join the extracted date df with the original data frame
df = df.join(date_annotation, on='text', how='left')

# convert to dateType
df = df.withColumn('date_result', F.to_date(df['date_result'],'yyyy/MM/dd'))

# add a column with the difference in date between the date produced by the date extractor and today
df = df.withColumn('date_diff', F.datediff(F.current_timestamp(), df['date_result']))

# if date_result is within two weeks of today, get difference, and apply it to timestamp
# elif date_result has this year's date. reset the year to match the year of the tweet 
# (hardcoeded as 10 years)
df = df.withColumn('when', F.when((col('date_diff') > -14),
                                      F.expr("date_add(t_dt, date_diff)"))\
                          .when((F.col('date_diff') < -14) 
                                & (F.year('date_result') == F.year(F.current_timestamp())), 
                                F.date_sub('date_result', 3652))
            )

# drop the extra columns
df = df.drop('date_result', 'date_diff', 't_dt')

In [56]:
# convert to string for compatibility with pyarrow
df = df.withColumn('when', F.col('when').cast('string'))

# I'm having some errors with toPandas() so I'm going to convert to pandas in stages
date_df = df.toPandas()

## Other entities

Since we only have 12k records and pyspark doesn't support typedLit (passing arrays to udfs) yet, I'm going to collect the text information we need for the rest of the data extraction, and move to pandas.

After a look at several entity models on a small sample, I decided to go with the OntoNotes entities large.

In [57]:
# use pretrained pipeline for NER, Tokens
pipeline_entities = PretrainedPipeline("onto_recognize_entities_lg", lang="en")
annotation_entities = pipeline_entities.transform(df)

onto_recognize_entities_lg download started this may take some time.
Approx size to download 2.3 GB
[OK!]


In [58]:
# convert the needed columns to pandas
entities_df = annotation_entities.select(F.col('entities.result').alias('entities'),
                                            F.col('ner.result').alias('ners'),
                                            F.col('token.result').alias('tokens'),
                                           'text')\
                                .toPandas()

In [59]:
# use pretrained pipeline for sentiment extraction
pipe_sentiment = PretrainedPipeline("analyze_sentimentdl_use_twitter", lang="en")
annotation_sentiment = pipe_sentiment.transform(df)

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 928.3 MB
[OK!]


In [60]:
annotation_sentiment.select('text', 'sentiment.result').show(10, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------+----------+
|text                                                                                                                                      |result    |
+------------------------------------------------------------------------------------------------------------------------------------------+----------+
|The Air Up There 11/28/2009 at Bob's Classic Kicks - Doors open at 9 concert at 10. $3 with kicks $5 without. http://bit.ly/kF6rG         |[positive]|
|At Maxwell concert! Common on stage.                                                                                                      |[positive]|
|Heading out to film Cartel and The Summer Set! We will Twitpic all of you tweeters some live pictures from the concert tonight!           |[positive]|
|super pumped for the Kelly Clarkson concert tonight!                                   

In [61]:
# convert sentiments to pandas
sentiments_df = annotation_sentiment.select('text',
                                            F.col('sentiment.result').alias('sentiments'))\
                                    .toPandas()

In [62]:
# end spark session
spark.stop()

## Combine dataframes in pandas

In [63]:
# confirm head and shape
print(date_df.shape)
date_df.head()

(12444, 3)


Unnamed: 0,text,user_id,when
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,
1,At Maxwell concert! Common on stage.,26083271,
2,Heading out to film Cartel and The Summer Set!...,38535159,
3,super pumped for the Kelly Clarkson concert to...,2057421,
4,heres the thing... we started off friends... i...,2057421,


In [64]:
print(sentiments_df.shape)
sentiments_df.head()

(12444, 2)


Unnamed: 0,text,sentiments
0,The Air Up There 11/28/2009 at Bob's Classic K...,[positive]
1,At Maxwell concert! Common on stage.,[positive]
2,Heading out to film Cartel and The Summer Set!...,[positive]
3,super pumped for the Kelly Clarkson concert to...,[positive]
4,heres the thing... we started off friends... i...,[negative]


In [66]:
print(entities_df.shape)
entities_df.head(20)

(12444, 4)


Unnamed: 0,entities,ners,tokens,text
0,"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,...",The Air Up There 11/28/2009 at Bob's Classic K...
1,[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]",At Maxwell concert! Common on stage.
2,"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum...",Heading out to film Cartel and The Summer Set!...
3,"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con...",super pumped for the Kelly Clarkson concert to...
4,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,...",heres the thing... we started off friends... i...
5,"[YEAAAAAAAAA "", 3 MONTHS]","[O, O, O, O, O, O, O, O, O, O, O, B-WORK_OF_AR...","[Listening, to, kelly, clarkson, getting, read...",Listening to kelly clarkson getting ready for ...
6,"[Kelly Clarkson, 12 year old]","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[It, was, easy, to, get, right, in, front, of,...",It was easy to get right in front of the stage...
7,"[Kelly Clarkson, last night]","[O, O, O, O, B-PERSON, I-PERSON, O, B-TIME, I-...","[Yea, I, rocked, the, Kelly, Clarkson, shirt, ...",Yea I rocked the Kelly Clarkson shirt last nig...
8,"[three-day, Austin]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[i, hope, this, cold, i, got, over, the, weeke...",i hope this cold i got over the weekend if ove...
9,"[Juli 2010, SUBWAY, SALLY, #concerts #nightlife]","[O, O, O, O, O, O, B-PERSON, I-PERSON, B-ORG, ...","[via, @, Festival_Watch, :, #mm, 8.-10., Juli,...",via @Festival_Watch: #mm 8.-10. Juli 2010 SUBW...


In [67]:
df_pd = date_df.join(sentiments_df.drop(columns='text'))

In [68]:
df_pd = df_pd.join(entities_df.drop(columns='text'))

In [69]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,,[positive],"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,..."
1,At Maxwell concert! Common on stage.,26083271,,[positive],[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]"
2,Heading out to film Cartel and The Summer Set!...,38535159,,[positive],"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum..."
3,super pumped for the Kelly Clarkson concert to...,2057421,,[positive],"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con..."
4,heres the thing... we started off friends... i...,2057421,,[negative],[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,..."


### Who

For the sake of time, I focused on pop and hip hop artists from 2009/2010 (data from wikipedia). This is extra tricky when tweeters use the artist handles (eg @JonasBrothers), again this is an area for future iteration

In [70]:
# import artist list
with open('../data/musicians.txt', 'r') as f:
     artists = f.read().splitlines()
        
artists = list(set(artists))

In [71]:
artists[0:5]

['Iyaz', 'Drake', 'Guru', 'Soul Assassins', 'Obie Trice']

In [72]:
# get 'who' with the intersection of the extracted entities from the tweets and my artist list
df_pd['who'] = [[entity for entity in e_list if entity in artists] for e_list in df_pd['entities']]

In [73]:
df_pd.loc[df_pd['who'].str.len() >0]

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who
6,It was easy to get right in front of the stage...,2057421,,[negative],"[Kelly Clarkson, 12 year old]","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[It, was, easy, to, get, right, in, front, of,...",[Kelly Clarkson]
7,Yea I rocked the Kelly Clarkson shirt last nig...,2057421,,[positive],"[Kelly Clarkson, last night]","[O, O, O, O, B-PERSON, I-PERSON, O, B-TIME, I-...","[Yea, I, rocked, the, Kelly, Clarkson, shirt, ...",[Kelly Clarkson]
55,"So, what about the Jason Mraz concert, huh?!! ...",22549826,,[positive],[Jason Mraz],"[O, O, O, O, O, B-PERSON, I-PERSON, O, O, O, O...","[So, ,, what, about, the, Jason, Mraz, concert...",[Jason Mraz]
56,Post pictures of the Jason Mraz concert in Rap...,22549826,,[positive],"[Jason Mraz, Rapid City, last night, the Civic...","[O, O, O, O, B-PERSON, I-PERSON, O, O, B-GPE, ...","[Post, pictures, of, the, Jason, Mraz, concert...",[Jason Mraz]
57,Thanks to fans who uploaded pictures to my pho...,22549826,,[positive],[Jason Mraz],"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[Thanks, to, fans, who, uploaded, pictures, to...",[Jason Mraz]
...,...,...,...,...,...,...,...,...
12182,Booked at 4am. Think concert ended way earlier...,32793540,,[negative],"[4am, Lil Wayne, last night]","[O, O, B-TIME, O, O, O, O, O, O, O, O, O, O, O...","[Booked, at, 4am, ., Think, concert, ended, wa...",[Lil Wayne]
12190,@ToxicHotGirl84 You could also do a vlog about...,20411431,,[positive],[Britney Spears],"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[@, ToxicHotGirl84, You, could, also, do, a, v...",[Britney Spears]
12254,They are giving away Jay-Z concert tickets for...,25163968,,[negative],"[Jay-Z, tomorrow, night]","[O, O, O, O, B-PERSON, O, O, O, B-DATE, B-TIME...","[They, are, giving, away, Jay-Z, concert, tick...",[Jay-Z]
12364,Doing hair @ JAY Z concert in Philip! Having a...,29734209,,[positive],"[JAY Z, Philip]","[O, O, O, B-ORG, I-ORG, O, O, B-ORG, O, O, O, ...","[Doing, hair, @, JAY, Z, concert, in, Philip, ...",[JAY Z]


In [74]:
# replace empty strings with null/None
df_pd['who'] = df_pd['who'].apply(lambda x: None if len(x)==0 else x)

## WHERE

In [75]:
# look at ner and tokens together. I'll use any 'FAC', 'GEP' or 'LOC' NER tags as the location.
df_pd[['ners', 'tokens']].head(20)

Unnamed: 0,ners,tokens
0,"[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,..."
1,"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]"
2,"[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum..."
3,"[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con..."
4,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,..."
5,"[O, O, O, O, O, O, O, O, O, O, O, B-WORK_OF_AR...","[Listening, to, kelly, clarkson, getting, read..."
6,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[It, was, easy, to, get, right, in, front, of,..."
7,"[O, O, O, O, B-PERSON, I-PERSON, O, B-TIME, I-...","[Yea, I, rocked, the, Kelly, Clarkson, shirt, ..."
8,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[i, hope, this, cold, i, got, over, the, weeke..."
9,"[O, O, O, O, O, O, B-PERSON, I-PERSON, B-ORG, ...","[via, @, Festival_Watch, :, #mm, 8.-10., Juli,..."


In [76]:
target_ners = ['FAC', 'GPE', 'LOC']

In [77]:
# combining the tokens that are tagged with our target NERs into a cohesive location string
locations = []
for ners, tokens in zip(df_pd['ners'], df_pd['tokens']):
    location = []
    for ner, token in zip(ners, tokens):
        if any(target_ner in ner for target_ner in target_ners):
            location.append(token)
    location = " ".join(location)
    locations.append(location)

In [78]:
# adding our locations to the pandas dataframe
df_pd['where'] = locations

In [79]:
df_pd.head(10)

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,,[positive],"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,...",,
1,At Maxwell concert! Common on stage.,26083271,,[positive],[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]",,Maxwell
2,Heading out to film Cartel and The Summer Set!...,38535159,,[positive],"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum...",,
3,super pumped for the Kelly Clarkson concert to...,2057421,,[positive],"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con...",,
4,heres the thing... we started off friends... i...,2057421,,[negative],[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,...",,
5,Listening to kelly clarkson getting ready for ...,2057421,,[positive],"[YEAAAAAAAAA "", 3 MONTHS]","[O, O, O, O, O, O, O, O, O, O, O, B-WORK_OF_AR...","[Listening, to, kelly, clarkson, getting, read...",,
6,It was easy to get right in front of the stage...,2057421,,[negative],"[Kelly Clarkson, 12 year old]","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PERS...","[It, was, easy, to, get, right, in, front, of,...",[Kelly Clarkson],
7,Yea I rocked the Kelly Clarkson shirt last nig...,2057421,,[positive],"[Kelly Clarkson, last night]","[O, O, O, O, B-PERSON, I-PERSON, O, B-TIME, I-...","[Yea, I, rocked, the, Kelly, Clarkson, shirt, ...",[Kelly Clarkson],
8,i hope this cold i got over the weekend if ove...,15663100,,[negative],"[three-day, Austin]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[i, hope, this, cold, i, got, over, the, weeke...",,Austin
9,via @Festival_Watch: #mm 8.-10. Juli 2010 SUBW...,42475511,,[positive],"[Juli 2010, SUBWAY, SALLY, #concerts #nightlife]","[O, O, O, O, O, O, B-PERSON, I-PERSON, B-ORG, ...","[via, @, Festival_Watch, :, #mm, 8.-10., Juli,...",,


## Sentiment

I'm curious about the differences in results from some of the different sentiment algorithms, but for now, we'll just go with the twitter-based sentiment analysis pretrained pipeline.

In [80]:
# combining the sentiment readings for each row: +1 for positive, -1 for negative, then sum
df_pd['sentiments'] = [sum([1 if s == 'positive' else -1 if s == 'negative' else 0 for s in s_list]) 
                             for s_list in df_pd['sentiments']]

In [81]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,,1,"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,...",,
1,At Maxwell concert! Common on stage.,26083271,,1,[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]",,Maxwell
2,Heading out to film Cartel and The Summer Set!...,38535159,,1,"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum...",,
3,super pumped for the Kelly Clarkson concert to...,2057421,,1,"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con...",,
4,heres the thing... we started off friends... i...,2057421,,-1,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,...",,


In [82]:
# convert sentiment numbers to strings
df_pd['sentiment'] = ['positive' if s > 0 else 'neutral' if s==0 else 'negative' for s in df_pd['sentiments']]

In [83]:
df_pd.head(5)

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where,sentiment
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,,1,"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,...",,,positive
1,At Maxwell concert! Common on stage.,26083271,,1,[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]",,Maxwell,positive
2,Heading out to film Cartel and The Summer Set!...,38535159,,1,"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum...",,,positive
3,super pumped for the Kelly Clarkson concert to...,2057421,,1,"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con...",,,positive
4,heres the thing... we started off friends... i...,2057421,,-1,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,...",,,negative


### Audience

I'll add the audience column.

**Future: perhaps use the NER to determine the subject (but not the performer) or POS?

Unfortunately, my attempt to use the POS tagger didn't work for today.

I'm going with the basic solution - contains I or we, then the audience is the tweeter

In [84]:
# pos_tagger = PerceptronApproach() \
#     .setInputCols(["token", "document"]) \
#     .setOutputCol("pos") \
#     .setNIterations(5)\
#     .fit() # I'm not sure where to get the training data set for this....

# finisher = finisher = Finisher() \
#      .setInputCols(['pos']) \
#      .setCleanAnnotations(True)

In [85]:
# pipe = Pipeline()\
#                .setStages([
#                     documentAssembler,
#                     tokenizer,
#                     pos_tagger,
#                     finisher
#                 ])

In [86]:
# pipe.transform(df_pos)

**Future: perhaps use the NER to determine the subject (but not the performer) or POS?

Unfortunately, my attempt to use the POS tagger didn't work for today.


In [87]:
# pos_tagger = PerceptronApproach() \
#     .setInputCols(["token", "document"]) \
#     .setOutputCol("pos") \
#     .setNIterations(5)\
#     .fit() # I'm not sure where to get the training data set for this....

# finisher = finisher = Finisher() \
#      .setInputCols(['pos']) \
#      .setCleanAnnotations(True)

In [88]:
# pipe = Pipeline()\
#                .setStages([
#                     documentAssembler,
#                     tokenizer,
#                     pos_tagger,
#                     finisher
#                 ])

In [89]:
# pipe.transform(df_pos)

So going with a simple solution:

In [90]:
lower_tokens_list = []
for token_list in df_pd['tokens']:
    lower_tokens_list.append([token.lower() for token in token_list])

In [91]:
df_pd['audience'] = [u if ('i' in t or 'we' in t) 
                     else None 
                     for u, t in zip(df_pd['user_id'], lower_tokens_list)]

## Last Cleaning

In [92]:
df_pd.head()

Unnamed: 0,text,user_id,when,sentiments,entities,ners,tokens,who,where,sentiment,audience
0,The Air Up There 11/28/2009 at Bob's Classic K...,33947545,,1,"[The Air Up, 11/28/2009, Bob, 9, 10, $3, $5]","[B-PRODUCT, I-PRODUCT, I-PRODUCT, O, B-DATE, O...","[The, Air, Up, There, 11/28/2009, at, Bob, 's,...",,,positive,
1,At Maxwell concert! Common on stage.,26083271,,1,[Maxwell],"[O, B-FAC, O, O, O, O, O, O]","[At, Maxwell, concert, !, Common, on, stage, .]",,Maxwell,positive,
2,Heading out to film Cartel and The Summer Set!...,38535159,,1,"[Cartel and The Summer Set, tonight]","[O, O, O, O, B-WORK_OF_ART, I-WORK_OF_ART, I-W...","[Heading, out, to, film, Cartel, and, The, Sum...",,,positive,38535159.0
3,super pumped for the Kelly Clarkson concert to...,2057421,,1,"[the Kelly Clarkson, tonight]","[O, O, O, B-ORG, I-ORG, I-ORG, O, B-TIME, O]","[super, pumped, for, the, Kelly, Clarkson, con...",,,positive,
4,heres the thing... we started off friends... i...,2057421,,-1,[],"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[heres, the, thing, ., ., ., we, started, off,...",,,negative,2057421.0


In [93]:
df_pd = df_pd[['text', 'who', 'when', 'where', 'audience', 'sentiment']]

In [94]:
df_pd.head(30)

Unnamed: 0,text,who,when,where,audience,sentiment
0,The Air Up There 11/28/2009 at Bob's Classic K...,,,,,positive
1,At Maxwell concert! Common on stage.,,,Maxwell,,positive
2,Heading out to film Cartel and The Summer Set!...,,,,38535159.0,positive
3,super pumped for the Kelly Clarkson concert to...,,,,,positive
4,heres the thing... we started off friends... i...,,,,2057421.0,negative
5,Listening to kelly clarkson getting ready for ...,,,,,positive
6,It was easy to get right in front of the stage...,[Kelly Clarkson],,,,negative
7,Yea I rocked the Kelly Clarkson shirt last nig...,[Kelly Clarkson],,,2057421.0,positive
8,i hope this cold i got over the weekend if ove...,,,Austin,15663100.0,negative
9,via @Festival_Watch: #mm 8.-10. Juli 2010 SUBW...,,,,,positive


I'd love to add more artists to my artist list, to make this more satisfying, and to figure out the issue with the date recognition. Test the POS for audience detecting and concert-tweet identification. Compare the different entity detection methods. Another day.