# Concert Tweet Classifier

## Import Necesary Packages

In [28]:
import sparknlp
from pyspark.sql.types import *
from pyspark.sql.functions import count, when, col
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
import pyspark.sql.functions as F
import pandas as pd

## Start the spark-NLP session

In [3]:
spark = sparknlp.start()

In [4]:
spark.sparkContext.defaultParallelism

4

In [5]:
# adjust show output format to pandas-like
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

## Read the data

In the first go, there were 25k rows of null - where the schema did not match the data. I decided to do some quick cleaning.

In [109]:
def remove_extra_seps(in_file, out_file, sep):
    """removes newline characters that come before the line reaches four segments(3 separators)
    and combines "middle sections" with extra separators into a single segment by removing the separators.
    
    Args:
        in_file: path to read file
        out_file: path to write file
        sep: separator/delimitor
    """
    n_chunks = 4
    
    with open(in_file, 'r') as rf:
        with open(out_file, 'w') as wf:
            while True:
                line = rf.readline()
                
                # if end of file
                if line == '':
                    break
                    
                # if line has less than n_sep, strip the newline and add the next line
                if len(line.split(sep)) < n_chunks:
                    line = line.strip('\n')
                    line += rf.readline()
                
                wf.write(line)

In [110]:
remove_extra_seps('../../data/test_set_tweets.txt',
                      '../../data/test_set_tweets_clean.txt',
                     '\t')
remove_extra_seps('../../data/training_set_tweets.txt',
                      '../../data/training_set_tweets_clean.txt',
                     '\t')

In [111]:
# set the schema
tweet_schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("t_id", StringType(), True),
    StructField("t_text", StringType(), True),
    StructField("t_dt", TimestampType(), True)
    ])

In [112]:
tweets_test = spark.read.csv('../../data/test_set_tweets_clean.txt', 
                              sep='\t',
                              schema=tweet_schema,
                              header="false")

In [113]:
tweets_training = spark.read.csv('../../data/training_set_tweets_clean.txt', 
                                 sep="\t", 
                                 schema=tweet_schema,
                                 header='false')

Since our data is unlabeled for our task, these test/train splits are not particularly useful, but a vestige of the original data set and purpose. We'll combine them.

In [114]:
tweets = tweets_test.union(tweets_training)

#### Future: Consider reading the data as a single column and then parsing. Compare outcome / number of tweets retrieved to that with the csv reading

## Basic Info About the Data Set

### Tweets

In [115]:
tweets.select('*').show(5)

+--------+-----------+--------------------+-------------------+
| user_id|       t_id|              t_text|               t_dt|
+--------+-----------+--------------------+-------------------+
|22077441|10538487904|Ok today I have t...|2010-03-15 17:35:58|
|22077441|10536835844|I am glad I'm hav...|2010-03-15 16:53:44|
|22077441|10536809086|Honestly I don't ...|2010-03-15 16:52:59|
|22077441|10534149786|@LovelyJ_Janelle ...|2010-03-15 15:42:07|
|22077441|10530203659|Sitting infront o...|2010-03-15 13:55:22|
+--------+-----------+--------------------+-------------------+
only showing top 5 rows



In [116]:
tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
        tweets.columns]).show()

# print("""+-------+-----+------+-----+
# |user_id| t_id|t_text| t_dt|
# +-------+-----+------+-----+
# |  33289|33179| 32631|53805|
# +-------+-----+------+-----+""")

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  34555|34490| 34232|56489|
+-------+-----+------+-----+



In [117]:
tweets.count()

# print(8884863)

8884863

In [118]:
# tweets.distinct().count()
print(8850656)

8850656


It looks like the time stamp can be parsed from the end of the tweet text for many of these "null" datetimes.

In [120]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=22398295, t_id='10172355714', t_text='From my vantage point, when it comes to money, women tend to lack confidence in their ability to do the (cont) http://tl.gd/entj8\t2010-03-08 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='10025719159', t_text='The best way to move a mountain is one stone at a time. Nothing is insurmountable if you take one step (cont) http://tl.gd/dt36c\t2010-03-05 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='9828243702', t_text="I believe luck is preparation meeting opportunity. If you hadn't been prepared when the opportunity came (cont) http://tl.gd/cqt73\t2010-03-01 00:00:00", t_dt=None),
 Row(user_id=22398295, t_id='9542436310', t_text='Religion, philosophy, greeting cards, self-help books—they all tout the power of love. Being a chronic and (cont) http://tl.gd/b88fa\t2010-02-23 00:00:00', t_dt=None),
 Row(user_id=22398295, t_id='9504909244', t_text=' A Leader is someone who nurtures others and allows them to progress and perform to t

In [121]:
tweets = tweets.withColumn('datetime', 
                           F.when(F.col('t_dt').isNull(), 
                                  F.to_date(F.substring('t_text', -19, 19)))
                           .otherwise(F.col('t_dt'))
                          )

In [122]:
tweets = tweets.withColumn('t_text', 
                           F.when(F.col('t_dt').isNull(), 
                                  F.expr('substring(t_text, 1, length(t_text)-20)'))
                           .otherwise(F.col('t_text'))
                           )

In [123]:
tweets = tweets.withColumn('t_dt', F.col('datetime')).drop('datetime')

In [124]:
# save as parquet and reload
tweets.write.parquet('../../data/tweets.parquet')
tweets = spark.read.parquet('../../data/tweets.parquet')

In [125]:
tweets.select([count(when(col(c).isNull(), c)).alias(c) for c in 
        tweets.columns]).show()

+-------+-----+------+-----+
|user_id| t_id|t_text| t_dt|
+-------+-----+------+-----+
|  34555|34490| 34232|54671|
+-------+-----+------+-----+



In [126]:
tweets.count()

8884863

In [127]:
tweets = tweets.dropna(how='any', subset=['t_text'])

In [128]:
tweets.filter(col('t_dt').isNull()).take(5)

[Row(user_id=25513575, t_id='10334442280', t_text='', t_dt=None),
 Row(user_id=25513575, t_id='10333612651', t_text='', t_dt=None),
 Row(user_id=16198727, t_id='6899029209', t_text='This vid cracked me up! haha I w', t_dt=None),
 Row(user_id=20106865, t_id='10362030419', t_text="Ladies and gentlemen... come and join me.  It'", t_dt=None),
 Row(user_id=20106865, t_id='10005503765', t_text='I am talking #Survivor RIGHT NOW in stickam', t_dt=None)]

Clearly I could do some more/better data engineering here, but for this exercise, I'm going to move on, dropping any records with null values or t_text with empty strings

In [129]:
tweets = tweets.dropna(how='any')

In [130]:
tweets = tweets.filter(~(tweets.t_text == ""))

In [131]:
tweets.count()

8829912

# Concert tweets - Classifier

I am deciding to focus on english tweets for now. (may add spanish, others in the future based on presence in the data set).

In [132]:
eng_stopwords = stopwords.words('english')

setting up the pieces of my pipeline to extract text info from the tweets (we'll use a pretrained pipeline later)

In [133]:
documentAssembler = DocumentAssembler() \
     .setInputCol('t_text') \
     .setOutputCol('document')
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(eng_stopwords)
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [134]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           stopwords_cleaner,
           finisher
     ])

In [135]:
tweets = pipeline.fit(tweets).transform(tweets)

In [136]:
tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

## Basic Classifier: contains the word concert

In [137]:
concert_tweets = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))
concert_tweets = concert_tweets.filter(concert_tweets['concert'] == 'true')

In [138]:
concert_tweets.select('t_text').take(3)

[Row(t_text="@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k"),
 Row(t_text='Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.'),
 Row(t_text="@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?")]

In [139]:
# concert_tweets.count()

print(12477)

12477


In [140]:
concert_tweets.columns

['user_id',
 't_id',
 't_text',
 't_dt',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma',
 'concert']

In [146]:
concert_tweets.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@beccalexis sup Bee? How'd the shoot go? Will you be at the concert tonight?                                                                |
|RT @BoomKack: Janet was at Lady Gaga concert tonight she is everything!!!!!! Can't touch her!                                               |

## Basic Classifier: contains the word concert or similar words

In [165]:
concert_plus = tweets.withColumn('concert', F.array_contains('finished_clean_lemma', 'concert'))\
                     .withColumn('tour', F.array_contains('finished_clean_lemma', 'tour'))\
                     .withColumn('gig', F.array_contains('finished_clean_lemma', 'gig'))\
                     .withColumn('show', F.array_contains('finished_clean_lemma', 'show'))
concert_plus = concert_plus.withColumn('concert_like', col('concert')|col('tour')|col('gig'))
concert_plus = concert_plus.filter(concert_plus.concert_like == True)

In [166]:
concert_plus.select("t_text").show(30, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|t_text                                                                                                                                      |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|@Lauralu2u yeps I had curve than the tour.   Love my Droid                                                                                  |
|@herRoyalStarnes I just thought of the history broke down bmw's on bdays free concert tickets in the nose bleeds p (cont) http://tl.gd/4pp7k|
|Y is me @RandiICandy, @EpitomeOfADiva, and Leila Bunny n here singing Mary J like we Mary J. We in concert yall buy a ticket yall.          |
|@joeymcintyre You've got to be a LITTLE bit silly on tour or you wouldn't be YOU! ;)                                                        |

Looking at this super small sample, it doesn't seem like these alternate words are adding a lot to our classifier.

#### Future: maybe combination of show/tour/gig and musician/group name in addition to the concert

Since we don't have labeled data, and I'm not sure the best technique for clustering text data in this situation. Or how we would evaluate which techniqes are doing the best job identifying our concert tweets, and whether they are worth the extra complexity/computational requirements.

For now, I'm going to move on using the "concert" lemma classifier