In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 900)

In [27]:
tweets = pd.read_csv('data/twitter-airline/Tweets.csv', usecols=['text'])
tweets.head(10)

Unnamed: 0,text
0,@VirginAmerica What @dhepburn said.
1,@VirginAmerica plus you've added commercials to the experience... tacky.
2,@VirginAmerica I didn't today... Must mean I need to take another trip!
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse"
4,@VirginAmerica and it's a really big bad thing about it
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)"
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP"
8,"@virginamerica Well, I didn't…but NOW I DO! :-D"
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me."


In [28]:
import re

HANDLE = '@\w+'
LINK = 'https?://t\.co/\w+'
SPECIAL_CHARS = '&lt;|&lt;|&amp;|#'
def clean(text):
    text = re.sub(HANDLE, ' ', text)
    text = re.sub(LINK, ' ', text)
    text = re.sub(SPECIAL_CHARS, ' ', text)
    return text

tweets['text'] = tweets.text.apply(clean)
tweets.head(10)

Unnamed: 0,text
0,What said.
1,plus you've added commercials to the experience... tacky.
2,I didn't today... Must mean I need to take another trip!
3,"it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces they have little recourse"
4,and it's a really big bad thing about it
5,seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA
6,"yes, nearly every time I fly VX this “ear worm” won’t go away :)"
7,"Really missed a prime opportunity for Men Without Hats parody, there."
8,"Well, I didn't…but NOW I DO! :-D"
9,"it was amazing, and arrived an hour early. You're too good to me."


## LDA

In [29]:
from gensim.parsing.preprocessing import preprocess_string

tweets = tweets.text.apply(preprocess_string).tolist()

In [30]:
len(tweets)

14640

In [31]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel

dictionary = corpora.Dictionary(tweets)
corpus = [dictionary.doc2bow(text) for text in tweets]

In [32]:
corpus

[[(0, 1)],
 [(1, 1), (2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1), (9, 1)],
 [(10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)],
 [(18, 1), (19, 1), (20, 1)],
 [(18, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)],
 [(22, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1)],
 [(34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1)],
 [(40, 1)],
 [(41, 1), (42, 1), (43, 1), (44, 1), (45, 1)],
 [(46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1)],
 [(53, 1), (54, 1), (55, 1), (56, 1), (57, 1)],
 [(9, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1)],
 [(22, 1), (27, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1)],
 [(68, 1)],
 [(69, 1), (70, 1), (71, 1), (72, 1)],
 [(20, 1),
  (21, 1),
  (60, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1)],
 [(25, 1),
  (72, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1)

In [33]:
list(dictionary.values())

['said',
 'ad',
 'commerci',
 'experi',
 'plu',
 'tacki',
 'mean',
 'need',
 'todai',
 'trip',
 'aggress',
 'blast',
 'entertain',
 'face',
 'guest',
 'littl',
 'obnoxi',
 'recours',
 'bad',
 'big',
 'thing',
 'flight',
 'fly',
 'pai',
 'plai',
 'seat',
 'serious',
 'awai',
 'nearli',
 'time',
 'won’t',
 'worm”',
 'ye',
 '“ear',
 'hat',
 'men',
 'miss',
 'opportun',
 'parodi',
 'prime',
 't…but',
 'amaz',
 'arriv',
 'earli',
 'good',
 'hour',
 'caus',
 'death',
 'know',
 'lead',
 'second',
 'suicid',
 'teen',
 'better',
 'graphic',
 'iconographi',
 'minim',
 'pretti',
 'deal',
 'gone',
 'great',
 'haven',
 'think',
 'fabul',
 'seduct',
 'ski',
 'stress',
 'travel',
 'thank',
 'mia',
 'pdx',
 'schedul',
 'sfo',
 'america',
 'countri',
 'cross',
 'daystogo',
 'excit',
 'heard',
 'lax',
 'mco',
 'virgin',
 'couldn',
 'flew',
 'fulli',
 'gentleman',
 'help',
 'larg',
 'nyc',
 'sit',
 'week',
 '☺️👍',
 'amazingli',
 'awesom',
 'bo',
 'fll',
 'want',
 'avail',
 'carrier',
 'fare',
 'select',


In [34]:
NUM_TOPICS = 10
ldamodel = LdaModel(corpus, 
                    num_topics = NUM_TOPICS, 
                    id2word=dictionary, passes=15)

In [35]:
ldamodel.print_topics(num_words=6)

[(0,
  '0.049*"help" + 0.037*"hold" + 0.029*"phone" + 0.028*"hour" + 0.024*"minut" + 0.022*"chang"'),
 (1,
  '0.092*"servic" + 0.088*"custom" + 0.017*"like" + 0.016*"rude" + 0.015*"agent" + 0.015*"bad"'),
 (2,
  '0.051*"flight" + 0.038*"book" + 0.034*"problem" + 0.024*"morn" + 0.021*"travel" + 0.014*"seat"'),
 (3,
  '0.071*"bag" + 0.029*"luggag" + 0.029*"check" + 0.024*"airlin" + 0.022*"baggag" + 0.020*"lost"'),
 (4,
  '0.176*"flight" + 0.074*"cancel" + 0.037*"flightl" + 0.027*"tomorrow" + 0.021*"rebook" + 0.019*"help"'),
 (5,
  '0.036*"seat" + 0.030*"right" + 0.029*"ticket" + 0.028*"trip" + 0.020*"nice" + 0.019*"thing"'),
 (6,
  '0.027*"ye" + 0.026*"time" + 0.024*"got" + 0.020*"tri" + 0.019*"answer" + 0.017*"email"'),
 (7,
  '0.083*"flight" + 0.040*"plane" + 0.040*"delai" + 0.031*"hour" + 0.029*"gate" + 0.027*"wait"'),
 (8,
  '0.154*"thank" + 0.030*"great" + 0.027*"respons" + 0.021*"gui" + 0.018*"appreci" + 0.018*"good"'),
 (9,
  '0.069*"know" + 0.023*"abl" + 0.018*"pick" + 0.017*"let