In [14]:
#Imports
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
tweets=pd.read_csv('airline_Tweets.csv')
tweets.sample(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
5520,568926716644745216,positive,1.0,,,Southwest,,fair_island,,0,@SouthwestAir Thank you!,,2015-02-20 16:14:34 -0800,Somewhere in Tennessee,Quito
12213,570261511781941251,neutral,1.0,,,American,,RockUrHumanity,,0,@AmericanAir Thank you for responding so quick...,,2015-02-24 08:38:34 -0800,East Coast/New England,Eastern Time (US & Canada)


We are mainly interested in the columns 'airline_sentiment' and 'text'

But first, we will drop all tweets (rows) which have a low ‘airline_sentiment_confidence’, ie. Tweets where the confidence that the sentiment is correct is low

### Dropping rows with low confidence

In [3]:
cleaned_tweets=tweets.drop(tweets[tweets['airline_sentiment_confidence']<0.5].index, axis=0) #axis=0 drops rows

print(f'In total, {len(tweets)-len(cleaned_tweets)} tweets was removed')

In total, 236 tweets was removed


### Defining the input X and outcome y

In [4]:
X=cleaned_tweets['text']
y=cleaned_tweets['airline_sentiment']

### Simplifying text

First, we download English stopwords and define them

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Anders\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
stop_words=stopwords.words('english')
punct=string.punctuation
stemmer=PorterStemmer()

In [7]:
cleaned_data=[]                 # Empty list
words = re.compile('[^a-zA-Z]') # Regular Expression for everything that is NOT (^) letters

for i in range(len(X)):
    tweet=re.sub(words,' ',X.iloc[i])   # Takes each element in X and removes everything that is not a-z and A-Z letters
    tweet=tweet.lower().split()         # Makes all letters lower-case, and splits by word
    
    #list compr. that checks every word in tweet, to see if its a stopword or a punctuation, and converts the word into the stem
    tweet=[stemmer.stem(word) for word in tweet if (word not in stop_words) and (word not in punct)] 
    
    tweet=' '.join(tweet)               # Joins the sentence back together
    cleaned_data.append(tweet)          # Puts the cleaned tweet into the cleaned_data list


# A comparison of before and after for one of the tweets
print(X[3])  
print(cleaned_data[2])

@VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse
virginamerica realli aggress blast obnoxi entertain guest face amp littl recours


### Converting words into numbers by the ‘Bag of Words’ approach

For the input $X$:

In [8]:
#Creates bags of words for each word in each tweet. Exclude airline names
count = CountVectorizer(max_features=3000,stop_words=['virginamerica','unit'])

X_fin=count.fit_transform(cleaned_data).toarray()


For the output $y$,  we convert each value in y into the index that the value has in the list 'sentiment_order'.

This means that:
* negative = 0
* neutral  = 1
* positive = 2

In [9]:
sentiment_order = ['negative', 'neutral', 'positive']


#Below we convert each value in y into the index that the value has in the above list.
#This means that
y_fin = y.apply(lambda x: sentiment_order.index(x))





## Time to go!

In [15]:
#Multinomial model is used as it is generally good with text based data (according to google anyways)
model=MultinomialNB()

X_train,X_test,y_train,y_test=train_test_split(X_fin,y_fin,test_size=0.3)
model.fit(X_train,y_train)

y_pred=model.predict(X_test)
cf=classification_report(y_test,y_pred)
print(cf)

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      2736
           1       0.61      0.49      0.54       916
           2       0.72      0.68      0.70       670

    accuracy                           0.77      4322
   macro avg       0.72      0.69      0.70      4322
weighted avg       0.76      0.77      0.77      4322

