In [47]:
import pandas as pd
import numpy as np
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [48]:
train_df = pd.read_csv('train_twitter.csv')
test_df = pd.read_csv('test_twitter.csv')

In [49]:
train_df

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [50]:
train_df = train_df[['airline_sentiment','text']]

In [51]:
train_df

Unnamed: 0,airline_sentiment,text
0,negative,"@SouthwestAir I am scheduled for the morning, ..."
1,positive,@SouthwestAir seeing your workers time in and ...
2,positive,@united Flew ORD to Miami and back and had gr...
3,negative,@SouthwestAir @dultch97 that's horse radish 😤🐴
4,negative,@united so our flight into ORD was delayed bec...
...,...,...
10975,neutral,@AmericanAir followback
10976,positive,@united thanks for the help. Wish the phone re...
10977,negative,@usairways the. Worst. Ever. #dca #customerser...
10978,negative,@nrhodes85: look! Another apology. DO NOT FLY ...


In [52]:
training_data = train_df.values

In [53]:
training_data

array([['negative',
        '@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'],
       ['positive',
        '@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!'],
       ['positive',
        '@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS'],
       ...,
       ['negative', '@usairways the. Worst. Ever. #dca #customerservice'],
       ['negative',
        '@nrhodes85: look! Another apology. DO NOT FLY @USAirways'],
       ['negative',
        '@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.']],
      dtype=object)

# Forming word tokens

In [54]:
from nltk.tokenize import word_tokenize
document = []
for sentiment,text in training_data:
    document.append([sentiment,word_tokenize(text)])
document[0]
    

['negative',
 ['@',
  'SouthwestAir',
  'I',
  'am',
  'scheduled',
  'for',
  'the',
  'morning',
  ',',
  '2',
  'days',
  'after',
  'the',
  'fact',
  ',',
  'yes',
  '..',
  'not',
  'sure',
  'why',
  'my',
  'evening',
  'flight',
  'was',
  'the',
  'only',
  'one',
  'Cancelled',
  'Flightled']]

In [55]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punc = list(string.punctuation)
stops.update(punc)

In [56]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [57]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
lemmatizer = WordNetLemmatizer()
def clean_rev(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_words = lemmatizer.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_words.lower())
    return output_words

In [58]:
document = [[clean_rev(text),sentiment] for sentiment,text in document]

In [59]:
document[0]

[['southwestair',
  'schedule',
  'morning',
  '2',
  'day',
  'fact',
  'yes',
  '..',
  'sure',
  'even',
  'flight',
  'one',
  'cancelled',
  'flightled'],
 'negative']

In [60]:
y_train = []
x_train = []
for tweet,sentiment in document:
    x_train.append(" ".join(tweet))
    y_train.append(sentiment)

In [61]:
x_train

['southwestair schedule morning 2 day fact yes .. sure even flight one cancelled flightled',
 'southwestair see worker time time go beyond love fly guy thank',
 'united flew ord miami back great crew service leg thanks',
 "southwestair dultch97 's horse radish 😤🐴",
 'united flight ord delayed air force one last flight sbn 8:20 5 min land',
 'united load u fly sardine knew pilot 2 hour late flight incompetent beyond belief',
 "jetblue stock response delays frustrate poor cust serv amp told 3 ppl wait amp 'd come back",
 "jetblue 'd nice hoping rack enough mile take trip seattle enjoy perfect latte city coffee",
 'united frankly bad customer service ever problems happen deal defines company never united',
 "southwestair yeah haha never one 's expensive 😂😂 much fun destinationdragons",
 "southwestair mco- gt dca flight almost full people screw msy-dca cancelled flightation united usairways n't cancelled flight swa=mistake",
 "jetblue 's easy way get ticket receipt get one check get one on

In [78]:
count_vec = CountVectorizer(max_features=2200)
x_train_ = count_vec.fit_transform(x_train)

# testing the dataset

In [79]:
test_df

Unnamed: 0,text
0,@AmericanAir In car gng to DFW. Pulled over 1h...
1,"@AmericanAir after all, the plane didn’t land ..."
2,@SouthwestAir can't believe how many paying cu...
3,@USAirways I can legitimately say that I would...
4,@AmericanAir still no response from AA. great ...
...,...
3655,@USAirways Been stuck for 40+ minutes due to l...
3656,@USAirways 4 hours... 4 hours... FOUR HOURS. ...
3657,Nice RT @VirginAmerica: The man of steel might...
3658,@AmericanAir Aww Thanks AA..DFW was on GMA up ...


In [80]:
test_df = test_df[['text']]

In [81]:
test_df

Unnamed: 0,text
0,@AmericanAir In car gng to DFW. Pulled over 1h...
1,"@AmericanAir after all, the plane didn’t land ..."
2,@SouthwestAir can't believe how many paying cu...
3,@USAirways I can legitimately say that I would...
4,@AmericanAir still no response from AA. great ...
...,...
3655,@USAirways Been stuck for 40+ minutes due to l...
3656,@USAirways 4 hours... 4 hours... FOUR HOURS. ...
3657,Nice RT @VirginAmerica: The man of steel might...
3658,@AmericanAir Aww Thanks AA..DFW was on GMA up ...


In [82]:
test_tweet = []
for i in test_df['text']:
    print(i)
    clean = clean_rev(word_tokenize(i))
    test_tweet.append(" ".join(clean))
test_tweet

@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?
@AmericanAir after all, the plane didn’t land in identical or worse) conditions at GRK according to METARs.
@SouthwestAir can't believe how many paying customers you left high and dry with no reason for flight Cancelled Flightlations Monday out of BDL! Wow.
@USAirways I can legitimately say that I would have rather driven cross country than flown on US Airways.
@AmericanAir still no response from AA. great job guys!
@united we have developers flying down tmrw morn. w/45 min layover, there is an earlier flight to have 1.5hr layover, can move them up?
@USAirways hello??? Anyone there?
@USAirways @husainhaqqani Mr. Husain u shld protest as well when one of ur party member Rehman Malik delayed a PIA flight for hours..???
@USAirways not likely, flightaware says plane is still in Durango and hasn't departed.
@AmericanAir they don't even give an option to hol

["americanair car gng dfw pulled 1hr ago icy road on-hold aa since 1hr ca n't reach arpt aa2450 wat 2",
 'americanair plane ’ land identical bad condition grk accord metars',
 "southwestair ca n't believe many pay customer left high dry reason flight cancelled flightlations monday bdl wow",
 'usairways legitimately say would rather driven cross country flown us airways',
 'americanair still response aa great job guy',
 'united developer fly tmrw morn w/45 min layover earlier flight 1.5hr layover move',
 'usairways hello anyone',
 'usairways husainhaqqani mr. husain u shld protest well one ur party member rehman malik delayed pia flight hour ..',
 "usairways likely flightaware say plane still durango n't depart",
 "americanair n't even give option hold .. say line busy plz try late flightr",
 'united announcement pre boarding address mobility disability require travel lot stuff preboard',
 'usairways really embarrass ask complimentary drink/snack detailed http //t.co/9za6xb1h89 amp argu

In [83]:
test_data = count_vec.fit_transform(test_tweet)

In [84]:
test_data

<3660x2200 sparse matrix of type '<class 'numpy.int64'>'
	with 32841 stored elements in Compressed Sparse Row format>

# Testing various classification algorithms

## SVC

In [85]:
svc = SVC()
svc.fit(x_train_,y_train)

SVC()

In [86]:
y_pred_svc = svc.predict(test_data)

In [87]:
y_pred_svc

array(['negative', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype='<U8')

In [88]:
df = pd.DataFrame(y_pred_svc)
df.to_csv('predictions_svc.csv', index = False, header = False)


## Multibinomial Naive Bayes Theorem

In [89]:
mnv = MultinomialNB()
mnv.fit(x_train_,y_train)

MultinomialNB()

In [90]:
y_pred_mnv = mnv.predict(test_data)

In [91]:
df = pd.DataFrame(y_pred_mnv)
df.to_csv('predictions_mnv.csv', index = False, header = False)

## Random Forest Classifier

In [92]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train_,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [93]:
y_pred_rf = clf.predict(test_data)
df = pd.DataFrame(y_pred_rf)
df.to_csv('predictions_rf.csv', index = False, header = False)