In [1]:
import nltk
from nltk.corpus import stopwords
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import re

In [46]:
trainingData = pd.read_csv("twitterTrain.csv")
x_train = trainingData[["text", "retweet_count"]]
y_train = trainingData[["airline_sentiment"]]

In [3]:
testingData = pd.read_csv("twitterTest.csv")
x_test = testingData[["text", "retweet_count"]]

In [4]:
x_train

Unnamed: 0,text,retweet_count
0,"@SouthwestAir I am scheduled for the morning, ...",0
1,@SouthwestAir seeing your workers time in and ...,0
2,@united Flew ORD to Miami and back and had gr...,0
3,@SouthwestAir @dultch97 that's horse radish üò§üê¥,0
4,@united so our flight into ORD was delayed bec...,0
...,...,...
10975,@AmericanAir followback,0
10976,@united thanks for the help. Wish the phone re...,0
10977,@usairways the. Worst. Ever. #dca #customerser...,0
10978,@nrhodes85: look! Another apology. DO NOT FLY ...,0


In [5]:
x_test

Unnamed: 0,text,retweet_count
0,@AmericanAir In car gng to DFW. Pulled over 1h...,0
1,"@AmericanAir after all, the plane didn‚Äôt land ...",0
2,@SouthwestAir can't believe how many paying cu...,1
3,@USAirways I can legitimately say that I would...,0
4,@AmericanAir still no response from AA. great ...,0
...,...,...
3655,@USAirways Been stuck for 40+ minutes due to l...,0
3656,@USAirways 4 hours... 4 hours... FOUR HOURS. ...,0
3657,Nice RT @VirginAmerica: The man of steel might...,1
3658,@AmericanAir Aww Thanks AA..DFW was on GMA up ...,0


In [47]:
y_train = y_train.values
y_train

array([['negative'],
       ['positive'],
       ['positive'],
       ...,
       ['negative'],
       ['negative'],
       ['negative']], dtype=object)

In [8]:
y_train.shape

(10980, 1)

In [48]:
y_train = y_train[:, 0]

In [49]:
y_train.shape

(10980,)

In [50]:
y_train

array(['negative', 'positive', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [11]:
trainingText = x_train['text'].values
testingText = x_test['text'].values

In [12]:
from nltk.corpus import wordnet

def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    
    elif tag.startswith('V'):
        return wordnet.VERB
    
    elif tag.startswith('N'):
        return wordnet.NOUN
    
    elif tag.startswith('R'):
        return wordnet.ADV
    
    else:
        return wordnet.NOUN

In [13]:
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [14]:
lemmatizer = WordNetLemmatizer()

In [17]:
def clean_review(text):
    
    new_words = []
    
    text = re.split(r'\W+', text)
    for word in text:
        if word.lower() not in stops:
            pos = pos_tag([word])[0][1]
            realPos = get_simple_pos(pos)
            cleaned_word = lemmatizer.lemmatize(word, pos=realPos)
            new_words.append(cleaned_word.lower())
    
    return " ".join(new_words)

In [18]:
trainText = [clean_review(doc) for doc in trainingText]
testText = [clean_review(doc) for doc in testingText]

In [20]:
trainText[0]

'southwestair schedule morning day fact yes sure even flight one cancelled flightled'

In [21]:
trainingText[0]

'@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [27]:
count_vec = CountVectorizer(max_features = 2500, ngram_range=(1,2), max_df=0.8)
tfidf_vec = TfidfVectorizer(max_features = 2500, ngram_range=(1,2), max_df=0.8)

In [44]:
x_train_count_vec = count_vec.fit_transform(trainText)
x_train_tfidf_vec = tfidf_vec.fit_transform(trainText)

In [45]:
count_vec.get_feature_names()

['00',
 '000',
 '000 mile',
 '03',
 '10',
 '10 day',
 '10 hour',
 '10 hr',
 '10 min',
 '10 minute',
 '100',
 '1000',
 '10pm',
 '11',
 '12',
 '12 hour',
 '13',
 '130',
 '14',
 '15',
 '15 min',
 '15 minute',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '1st class',
 '20',
 '20 min',
 '20 minute',
 '200',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '24 hour',
 '24 hr',
 '24hrs',
 '25',
 '25 min',
 '26',
 '27',
 '28',
 '2day',
 '2hrs',
 '2nd',
 '2nd time',
 '2x',
 '30',
 '30 min',
 '30 minute',
 '300',
 '30am',
 '30pm',
 '32',
 '35',
 '36',
 '3hrs',
 '3rd',
 '40',
 '40 min',
 '40 minute',
 '400',
 '42',
 '45',
 '45 min',
 '45 minute',
 '48',
 '4th',
 '50',
 '50 min',
 '50 minute',
 '500',
 '55',
 '5hrs',
 '60',
 '70',
 '700',
 '728',
 '737',
 '75',
 '777',
 '7am',
 '80',
 '800',
 '800 number',
 '90',
 '90 min',
 'a320',
 'aa',
 'able',
 'able get',
 'absolute',
 'absolutely',
 'absurd',
 'accept',
 'acceptable',
 'access',
 'accommodate',
 'accommodation',
 'account',
 'acct',
 'a

In [51]:
tfidf_vec.get_feature_names()

['00',
 '000',
 '000 mile',
 '03',
 '10',
 '10 day',
 '10 hour',
 '10 hr',
 '10 min',
 '10 minute',
 '100',
 '1000',
 '10pm',
 '11',
 '12',
 '12 hour',
 '13',
 '130',
 '14',
 '15',
 '15 min',
 '15 minute',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1hr',
 '1k',
 '1st',
 '1st class',
 '20',
 '20 min',
 '20 minute',
 '200',
 '2015',
 '21',
 '22',
 '23',
 '24',
 '24 hour',
 '24 hr',
 '24hrs',
 '25',
 '25 min',
 '26',
 '27',
 '28',
 '2day',
 '2hrs',
 '2nd',
 '2nd time',
 '2x',
 '30',
 '30 min',
 '30 minute',
 '300',
 '30am',
 '30pm',
 '32',
 '35',
 '36',
 '3hrs',
 '3rd',
 '40',
 '40 min',
 '40 minute',
 '400',
 '42',
 '45',
 '45 min',
 '45 minute',
 '48',
 '4th',
 '50',
 '50 min',
 '50 minute',
 '500',
 '55',
 '5hrs',
 '60',
 '70',
 '700',
 '728',
 '737',
 '75',
 '777',
 '7am',
 '80',
 '800',
 '800 number',
 '90',
 '90 min',
 'a320',
 'aa',
 'able',
 'able get',
 'absolute',
 'absolutely',
 'absurd',
 'accept',
 'acceptable',
 'access',
 'accommodate',
 'accommodation',
 'account',
 'acct',
 'a

In [56]:
x_train_count_vec = x_train_count_vec.toarray()
x_train_tfidf_vec = x_train_tfidf_vec.toarray()

In [62]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [84]:
clf1 = MultinomialNB()
clf1.fit(x_train_count_vec, y_train)
clf1.score(x_train_count_vec, y_train)

0.8068306010928962

In [66]:
clf1 = MultinomialNB()
clf1.fit(x_train_tfidf_vec, y_train)
clf1.score(x_train_tfidf_vec, y_train)

0.7872495446265938

In [70]:
retweet_train = x_train['retweet_count'].values.reshape(-1, 1)
retweet_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [1]], dtype=int64)

In [76]:
x_test_count_vec = count_vec.transform(testText)
x_test_tfidf_vec = tfidf_vec.transform(testText)

In [78]:
x_test_count_vec = x_test_count_vec.toarray()
x_test_tfidf_vec = x_test_tfidf_vec.toarray()

In [74]:
x_train_with_retweet_count = np.concatenate((x_train_count_vec, retweet_train), axis = 1)
x_train_with_retweet_tfidf = np.concatenate((x_train_tfidf_vec, retweet_train), axis = 1)

In [81]:
retweet_test = x_test['retweet_count'].values.reshape(-1, 1)
retweet_test

array([[0],
       [0],
       [1],
       ...,
       [1],
       [0],
       [0]], dtype=int64)

In [89]:
clf1 = MultinomialNB()
clf1.fit(x_train_with_retweet_count, y_train)
clf1.score(x_train_with_retweet_count, y_train)

0.8063752276867031

In [83]:
clf1 = MultinomialNB()
clf1.fit(x_train_with_retweet_tfidf, y_train)
clf1.score(x_train_with_retweet_tfidf, y_train)

0.7879781420765027

In [90]:
x_test_with_retweet_count = np.concatenate((x_test_count_vec, retweet_test), axis = 1)
x_test_with_retweet_tfidf = np.concatenate((x_test_tfidf_vec, retweet_test), axis = 1)

In [105]:
y_pred = clf1.predict(x_test_with_retweet_count)
y_pred

array(['negative', 'neutral', 'negative', ..., 'negative', 'positive',
       'negative'], dtype=object)

In [106]:
np.savetxt("TwitterResult.csv", y_pred, delimiter=",", fmt='%s' )

In [100]:
from sklearn.linear_model import LogisticRegression

In [101]:
clf1 = LogisticRegression()
clf1.fit(x_train_count_vec, y_train)
clf1.score(x_train_count_vec, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8883424408014572

In [104]:
clf1 = LogisticRegression()
clf1.fit(x_train_with_retweet_count, y_train)
clf1.score(x_train_with_retweet_count, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8891621129326047

In [96]:
from sklearn.ensemble import RandomForestClassifier

In [97]:
clf1 = RandomForestClassifier()
clf1.fit(x_train_count_vec, y_train)
clf1.score(x_train_count_vec, y_train)

0.992167577413479

In [102]:
y_pred = clf1.predict(x_test_count_vec)
y_pred

array(['negative', 'neutral', 'negative', ..., 'negative', 'positive',
       'negative'], dtype=object)