In [2]:
import pandas as pd
import numpy as np
import re

In [12]:
# Load the raw tweets from disk
INPUT_PATH = '../twitter-airline-sentiment/Tweets.csv'
raw_data = pd.read_csv(INPUT_PATH, header=0)

# Work on an independent two-column copy (tweet text + label) so that
# raw_data stays untouched; the label column is shortened to 'sentiment'
df = (
    raw_data[['text', 'airline_sentiment']]
    .copy()
    .rename(columns={'airline_sentiment': 'sentiment'})
)

print("DATA SIZE: " + str(df.shape))
df.head()

DATA SIZE: (14640, 2)


Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [89]:
# get class counts: number of non-null values of each remaining column,
# grouped by sentiment label
class_counts = df.groupby('sentiment').count()
class_counts

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


In [128]:
# size of the smallest sentiment class
class_counts['text'].min()

2363

In [69]:
# let's find the handlers
from collections import Counter

def get_text_sequence(_df):
    """Lazily yield the `text` value of every row of `_df`, in row order.

    The column is read directly instead of going through `iterrows()`,
    which builds a whole Series per row only to take one field from it.

    Args:
        _df: DataFrame with a `text` column.

    Returns:
        A generator over the values of `_df['text']`.
    """
    return (text for text in _df['text'])

def get_all_handlers(text_it, tokenizer=None):
    """Count every @handle mentioned in a collection of tweets.

    Args:
        text_it: iterable of tweet texts.
        tokenizer: optional object exposing `tokenize(text)` and returning a
            list of string tokens. When omitted, a lower-casing
            `TweetTokenizer` is used, so handles are counted regardless of
            their original casing.

    Returns:
        A `Counter` mapping each handle (leading '@' included) to the number
        of times it occurs across all texts.
    """
    if tokenizer is None:
        # Imported locally: the notebook's NLTK import cell comes after this
        # one, so relying on it breaks a top-to-bottom run on a fresh kernel.
        from nltk.tokenize.casual import TweetTokenizer
        tokenizer = TweetTokenizer(preserve_case=False)

    handler_counts = Counter()

    for text in text_it:
        tokens = tokenizer.tokenize(text)
        # a lone '@' is not a handle, hence the length check
        handler_counts.update(t for t in tokens if len(t) > 1 and t.startswith('@'))

    return handler_counts

# Tally the handles over every tweet and list them from most to least frequent
handler_counts = get_all_handlers(get_text_sequence(df))
handler_counts.most_common()

[('@united', 3893),
 ('@usairways', 2998),
 ('@americanair', 2961),
 ('@southwestair', 2458),
 ('@jetblue', 2248),
 ('@virginamerica', 518),
 ('@delta', 68),
 ('@imaginedragons', 45),
 ('@phlairport', 20),
 ('@dfwairport', 17),
 ('@wsj', 13),
 ('@ladygaga', 12),
 ('@carrieunderwood', 12),
 ('@fortunemagazine', 12),
 ('@love_dragonss', 10),
 ('@virginatlantic', 9),
 ('@flytpa', 9),
 ('@cowboycerrone', 9),
 ('@staralliance', 8),
 ('@gg8929', 8),
 ('@spiritairlines', 8),
 ('@velourlive', 8),
 ('@aircanada', 7),
 ('@dulles_airport', 6),
 ('@fly2ohare', 6),
 ('@cnn', 6),
 ('@bostonlogan', 6),
 ('@triflight', 6),
 ('@nytimes', 5),
 ('@flylaxairport', 5),
 ('@dallaslovefield', 5),
 ('@tsa', 5),
 ('@annricord', 5),
 ('@expedia', 5),
 ('@silverairways', 4),
 ('@southwest', 4),
 ('@deltaassist', 4),
 ('@ntrustopen', 4),
 ('@perfectomobile', 4),
 ('@british_airways', 4),
 ('@ny_njairports', 4),
 ('@mco', 4),
 ('@kylejudah', 4),
 ('@jayvig', 4),
 ('@askpaypal', 4),
 ('@derekc21', 4),
 ('@faanews',

### APPROACH 1: Naive Bayes, all classes with full size

In [285]:
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
#from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

# A token is a URL when it starts with an http(s) scheme or with "www."
# (the dot is escaped so ordinary words that merely begin with "www" do not match)
REGEX_URL = re.compile(r'https?://|www\.')
#REGEX_HANDLE = re.compile(r'@\w+')
# A digit, optionally preceded and/or followed by a digit, '.' or '-'
REGEX_NUM = re.compile(r'[0-9.-]?[0-9][0-9.-]?')

# Placeholder tokens substituted for the matched token types
TOKEN_URL = '__URL'
#TOKEN_HANDLE = '__HANDLE'
TOKEN_NUM = '__NUM'

# NLTK's English stopword list, extended with the contraction "i've"
STOPWORDS_EN = set(stopwords.words('english')) | {"i've"}
# Individual punctuation characters, stored as a set of one-character strings
STOP_PUNCT = set('."\'&”“’,:;/*()[]{}')

def normalize_token(token):
    """Return the placeholder for a recognised token type, else the token itself.

    Only URLs are recognised at the moment (replaced by TOKEN_URL); the
    pattern is tested at the start of the token.
    """
    # (pattern, placeholder) pairs, tried in order; the first match wins
    substitutions = ((REGEX_URL, TOKEN_URL),)
    placeholder = next(
        (marker for pattern, marker in substitutions if pattern.match(token)),
        None,
    )
    return token if placeholder is None else placeholder

def get_wordnet_pos(pos):
    """Translate a POS tag into the WordNet category passed to lemmatize().

    Only the upper-cased first character of `pos` is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.
    """
    initial = pos[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag
    return wordnet.NOUN

def normalize_text(text, remove_stopwords=True, use_stemmer=True, remove_prefixes=True):
    """Normalize a tweet into a space-separated string of lemmatized tokens.

    Steps:
      1. Tokenize with a lower-casing TweetTokenizer and drop a leading
         @handle.
      2. Replace URLs by TOKEN_URL and, if requested, strip the leading
         '@' / '#' from handles and hashtags.
      3. POS-tag the remaining tokens.
      4. Drop punctuation (and optionally stopwords), replace numbers by
         TOKEN_NUM and lemmatize everything else using its POS tag.

    Args:
        text: raw tweet text.
        remove_stopwords: drop tokens found in STOPWORDS_EN.
        use_stemmer: currently unused; kept so existing callers keep working.
        remove_prefixes: strip the leading '@' or '#' from tokens.

    Returns:
        The normalized text; '' when no token survives the cleanup
        (including empty or whitespace-only input).
    """
    tokenizer = TweetTokenizer(preserve_case=False)
    lemmatizer = WordNetLemmatizer()

    tokens = tokenizer.tokenize(text)

    # if the first token is a handler, it's normally irrelevant
    # (guarded: empty input produces no tokens at all)
    if tokens and tokens[0].startswith('@'):
        tokens.pop(0)

    # STEP 1: cleanup, formatting
    ntokens = []
    for token in tokens:
        ntoken = normalize_token(token)

        if remove_prefixes and token.startswith(('@', '#')):
            ntoken = ntoken[1:]
        if ntoken:
            ntokens.append(ntoken)

    if not ntokens:
        return ''

    # STEP 2: NLP tagging
    pos_tags = nltk.pos_tag(ntokens)

    # STEP 3: "semantic" cleaning
    ltokens = []
    for token, pos in pos_tags:
        if token in STOP_PUNCT:
            continue
        if remove_stopwords and token in STOPWORDS_EN:
            continue
        # cardinal-number tag, or a token that starts like a number
        if pos == 'CD' or REGEX_NUM.match(token):
            ltokens.append(TOKEN_NUM)
        else:
            ltokens.append(lemmatizer.lemmatize(token, get_wordnet_pos(pos)))

    return ' '.join(ltokens)

In [286]:
import string

def normalize_trivial(tweet):
    """Baseline normalization that uses only string operations and regexes.

    Removes ASCII punctuation, replaces URL-like words by 'urlsite' and
    digit runs by 'contnum', lower-cases, splits on whitespace and drops
    stopwords.

    Args:
        tweet: raw tweet text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # NOTE(review): punctuation is removed before the stopword filter, so
    # stopwords that contain an apostrophe (e.g. "i've") can no longer match
    # their stripped form ("ive") - confirm whether that is intended.
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # raw strings: `\S` / `\d` are regex escapes, not (invalid) string escapes
    tweet = re.sub(r'((www\S+)|(http\S+))', 'urlsite', tweet)
    tweet = re.sub(r'\d+', 'contnum', tweet)
    tokens = re.split(r'\s+', tweet.lower().strip())
    ntokens = [t for t in tokens if len(t) > 0 and t not in STOPWORDS_EN]
    return ' '.join(ntokens)

In [287]:
def show_normalization(_df):
    """Print every tweet of `_df` with its label, followed by its normalized form."""
    for _, tweet in _df.iterrows():
        header = tweet.sentiment.upper() + ": " + tweet.text
        normalized = '\n>> ' + normalize_text(tweet.text) + '\n'
        print(header + normalized)

show_normalization(df.head(30))

NEUTRAL: @VirginAmerica What @dhepburn said.
>> dhepburn say

POSITIVE: @VirginAmerica plus you've added commercials to the experience... tacky.
>> plus added commercial experience ... tacky

NEUTRAL: @VirginAmerica I didn't today... Must mean I need to take another trip!
>> today ... must mean need take another trip !

NEGATIVE: @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse
>> really aggressive blast obnoxious entertainment guest face little recourse

NEGATIVE: @VirginAmerica and it's a really big bad thing about it
>> really big bad thing

NEGATIVE: @VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA
>> seriously would pay $ __NUM flight seat playing really bad thing fly va

POSITIVE: @VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)
>> yes nearly every time fly vx ear worm go away :)

NEUTRA

In [288]:
# Fixed seed so the same 20 tweets are shown every time the notebook is re-run
show_normalization(df.sample(20, random_state=0))

NEGATIVE: @SouthwestAir pls help me get this resolved &amp; reimbursement made.  gracias, rico; pls send me private msg with phone number or email address
>> pls help get resolve reimbursement make gracias rico pls send private msg phone number email address

NEGATIVE: @USAirways you guys suck at JFK tonight -- oh this MORNING!!!
>> guy suck jfk tonight - - oh morning ! ! !

NEUTRAL: @JetBlue Domestic.  To be clear -- I do not have to sit in (your lovely) terminal, just need to be there 1 hour prior to boarding?  Thanks!
>> domestic clear - - sit lovely terminal need __NUM hour prior board ? thanks !

NEGATIVE: @united they held the flight for our group of nearly 20 people.
>> hold flight group nearly __NUM people

NEGATIVE: @USAirways Gave up after more than 2 hours on hold. Still need that receipt; last time it was promised, it never arrived. Help?!
>> give __NUM hour hold still need receipt last time promise never arrive help ? !

NEGATIVE: @united Without baggage for 5 days and can

Let's just do a binary classification for now

In [289]:
# apply normalization to ALL text
df['norm_text'] = df.text.apply(normalize_text)
df.head(30)

Unnamed: 0,text,sentiment,norm_text
0,@VirginAmerica What @dhepburn said.,neutral,dhepburn say
1,@VirginAmerica plus you've added commercials t...,positive,plus added commercial experience ... tacky
2,@VirginAmerica I didn't today... Must mean I n...,neutral,today ... must mean need take another trip !
3,@VirginAmerica it's really aggressive to blast...,negative,really aggressive blast obnoxious entertainmen...
4,@VirginAmerica and it's a really big bad thing...,negative,really big bad thing
5,@VirginAmerica seriously would pay $30 a fligh...,negative,seriously would pay $ __NUM flight seat playin...
6,"@VirginAmerica yes, nearly every time I fly VX...",positive,yes nearly every time fly vx ear worm go away :)
7,@VirginAmerica Really missed a prime opportuni...,neutral,really miss prime opportunity men without hat ...
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",positive,well … ! :-D
9,"@VirginAmerica it was amazing, and arrived an ...",positive,amaze arrive hour early good


In [290]:
# binary task: keep everything except the neutral tweets
df_binary = df.loc[df['sentiment'] != 'neutral']
print(df_binary.shape)
df_binary.head()

(11541, 3)


Unnamed: 0,text,sentiment,norm_text
1,@VirginAmerica plus you've added commercials t...,positive,plus added commercial experience ... tacky
3,@VirginAmerica it's really aggressive to blast...,negative,really aggressive blast obnoxious entertainmen...
4,@VirginAmerica and it's a really big bad thing...,negative,really big bad thing
5,@VirginAmerica seriously would pay $30 a fligh...,negative,seriously would pay $ __NUM flight seat playin...
6,"@VirginAmerica yes, nearly every time I fly VX...",positive,yes nearly every time fly vx ear worm go away :)


In [291]:
from sklearn.model_selection import train_test_split

# hold out 33% of the binary dataset for testing; the seed is fixed
text_train, text_test, label_train, label_test = train_test_split(
    df_binary['norm_text'],
    df_binary['sentiment'],
    test_size=0.33,
    random_state=0,
)

print("Training: %d; test: %d" % (len(text_train), len(text_test)))

Training: 7732; test: 3809


### Multinomial NB using counts

In [292]:
# numeric encoding of the sentiment labels
LABELS = {'positive': 1, 'neutral': 0, 'negative': -1}

# encode both splits; an unknown label raises KeyError
y_train = list(map(LABELS.__getitem__, label_train))
y_test = list(map(LABELS.__getitem__, label_test))

In [293]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: the vocabulary is fitted on the training texts only,
# and the test texts are encoded with that same fitted vectorizer
counter = CountVectorizer()
X_train = counter.fit_transform(text_train)
X_test = counter.transform(text_test)

In [294]:
# peek at the first ten normalized training tweets
for sample in text_train.iloc[:10]:
    print(sample)

boyfriend love imaginedragons since __NUM would awesome go vegas event way help ?
finally thx
flight delay __NUM hour insane every flight make nyc
thanks much !
haha boarding pas __NUM board plane gate agent tell go __NUM i'm worry
__NUM smf jfk !
joanna wonderful job ! thank ?
enough staff rude ignored passenger think accept whatever reason
delay understandable look time flight boarding time weird !
seriously attendant go awol __NUM min flight delayed lite match either __URL


In [295]:
# dimensions of the training feature matrix
X_train.shape

(7732, 7378)

In [296]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on the current X_train / y_train
clf = MultinomialNB(alpha=1.0)
clf.fit(X_train, y_train)

# score on the training split first, then on the held-out test split
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9389549922400414
0.8994486741927015


In [225]:
def balance_class_counts(df):
    """Down-sample every sentiment class to the size of the smallest one.

    For each distinct value of `df['sentiment']` (in sorted label order) the
    first `min_size` rows of that class are kept, in their original order;
    no shuffling is performed. The input frame is not modified.

    Args:
        df: DataFrame with a `sentiment` column. No other column is required.

    Returns:
        A new DataFrame with the kept rows of each class stacked one class
        after another and a fresh 0..n-1 index. A frame without any
        labelled row yields an empty frame with the same columns.
    """
    # rows per class; does not depend on any other column being present
    class_sizes = df['sentiment'].value_counts().sort_index()
    if class_sizes.empty:
        return df.iloc[0:0].reset_index(drop=True)

    min_size = int(class_sizes.min())
    # boolean filtering already yields a new frame, so no extra copy is needed
    classes = [
        df[df['sentiment'] == label].iloc[:min_size]
        for label in class_sizes.index
    ]
    return pd.concat(classes, ignore_index=True)

In [226]:
# balanced version of the binary (positive / negative) dataset
df_bin_bal = balance_class_counts(df_binary)
df_bin_bal.shape

(4726, 3)

### Gaussian NB using TF-IDF, direct

In [298]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: fitted on the training texts only, then applied to the test texts
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
v_train = vectorizer.fit_transform(text_train)
v_test = vectorizer.transform(text_test)

# NOTE: this rebinds X_train / X_test (previously the CountVectorizer
# matrices) to the TF-IDF features, converted with toarray()
X_train = v_train.toarray()
X_test = v_test.toarray()

print(X_train.shape)

(7732, 7173)


In [299]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the current X_train / y_train
clf = GaussianNB()
clf.fit(X_train, y_train)

# score on the training split first, then on the held-out test split
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.8043197102948785
0.639012864268837


### Gaussian NB using TF-IDF and feature selection

In [300]:
from sklearn.feature_selection import SelectPercentile, f_classif

# keep the top 10% of the TF-IDF features, ranked by f_classif against the labels;
# the selector is fitted on the training split only
selector = SelectPercentile(f_classif, percentile=10)
vs_train = selector.fit_transform(v_train, y_train)
vs_test = selector.transform(v_test)

# rebind X_train / X_test to the reduced feature sets
X_train, X_test = vs_train.toarray(), vs_test.toarray()

print(X_train.shape)

(7732, 717)


In [301]:
# Gaussian Naive Bayes again, this time on the current (feature-selected) X_train
clf = GaussianNB()
clf.fit(X_train, y_train)

# score on the training split first, then on the held-out test split
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.8625193998965339
0.8049356786558152


### SVM

In [304]:
from sklearn.svm import SVC
from time import time

# Hyper-parameter grid: every kernel is tried with every value of C
kernels = ['linear', 'rbf']
Cs = [1.0, 10.0, 100.0, 1000.0]

# one tuple per run:
# (kernel, c, train score, test score, train time, score time)
results = []

for kernel, cc in ((k, c) for k in kernels for c in Cs):
    print('Trying with kernel=%s, C=%f' % (kernel, cc))
    # gamma is set explicitly to 'auto' to avoid a warning
    clf = SVC(kernel=kernel, C=cc, gamma='auto')

    t0 = time()
    clf.fit(X_train, y_train)
    t1 = time()

    # the "score time" covers scoring both the training and the test split
    score_train = clf.score(X_train, y_train)
    score_test = clf.score(X_test, y_test)
    t2 = time()

    results.append((kernel, cc, score_train, score_test, t1 - t0, t2 - t1))

Trying with kernel=linear, C=1.000000
Trying with kernel=linear, C=10.000000
Trying with kernel=linear, C=100.000000
Trying with kernel=linear, C=1000.000000
Trying with kernel=rbf, C=1.000000
Trying with kernel=rbf, C=10.000000
Trying with kernel=rbf, C=100.000000
Trying with kernel=rbf, C=1000.000000


### Decision Tree

In [204]:
# Tabulate the SVM sweep results, one row per (kernel, C) run.
# NOTE(review): this cell carries an older execution count than the SVM sweep
# and the identical cell at the end of the notebook, so the table shown here
# presumably comes from an earlier run - confirm and drop one of the two copies.
svc_df = pd.DataFrame(results, columns=['kernel', 'C', 'score_train', 'score_test', 'time_train', 'time_score'])
svc_df

Unnamed: 0,kernel,C,score_train,score_test,time_train,time_score
0,linear,1.0,0.919814,0.904962,19.71032,25.384943
1,linear,10.0,0.947362,0.9068,15.906641,19.04755
2,linear,100.0,0.952535,0.899974,18.055134,16.144613
3,linear,1000.0,0.95344,0.898399,35.974997,15.673101
4,rbf,1.0,0.797982,0.789709,26.160089,37.076061
5,rbf,10.0,0.818676,0.813599,26.876825,36.623412
6,rbf,100.0,0.883989,0.878971,23.059513,30.147097
7,rbf,1000.0,0.935334,0.909163,17.150419,22.355724


In [302]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/test split) so the fitted tree, and
# therefore both scores, are reproducible from run to run
clf = DecisionTreeClassifier(min_samples_split=40, random_state=0)

clf.fit(X_train, y_train)

# score on the training split first, then on the held-out test split
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))


0.9496896016554578
0.8724074560252034


#### Notes about pre-processing:
- stemming did not seem to help
- in fact, lemmatization using POS tags gave _slightly worse_ results than just simple normalization
- the exception is the Decision Tree, it actually got better with lemmatization

In [305]:
# Tabulate the SVM sweep results, one row per (kernel, C) run:
# train/test scores plus the fit and scoring times recorded in `results`
svc_df = pd.DataFrame(results, columns=['kernel', 'C', 'score_train', 'score_test', 'time_train', 'time_score'])
svc_df

Unnamed: 0,kernel,C,score_train,score_test,time_train,time_score
0,linear,1.0,0.918391,0.900761,17.343747,21.201815
1,linear,10.0,0.945939,0.901286,13.586512,15.813359
2,linear,100.0,0.948267,0.898661,16.264316,13.488832
3,linear,1000.0,0.94969,0.898661,36.859171,12.856801
4,rbf,1.0,0.797982,0.789709,22.111068,31.139696
5,rbf,10.0,0.821133,0.8157,21.581112,30.20004
6,rbf,100.0,0.889938,0.885272,18.664486,25.444301
7,rbf,1000.0,0.938308,0.902862,15.291044,18.857582
