In [1]:
# Import libraries
import pandas as pd
import collections
import re
import nltk
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()
import numpy as np
from scipy.sparse import hstack
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
def normalizer(tweet):
    only_letters = re.sub("[^a-zA-Z]", " ",tweet) 
    tokens = nltk.word_tokenize(only_letters)[2:]
    lower_case = [l.lower() for l in tokens]
    filtered_result = list(filter(lambda l: l not in stop_words, lower_case))
    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in filtered_result]
    return lemmas

In [3]:
def ngrams(input_list):
    #onegrams = input_list
    bigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:]))]
    trigrams = [' '.join(t) for t in list(zip(input_list, input_list[1:], input_list[2:]))]
    return bigrams+trigrams

In [4]:
def count_words(input):
    cnt = collections.Counter()
    for row in input:
        for word in row:
            cnt[word] += 1
    return cnt

In [5]:
def sentiment2target(sentiment):
    return {
        'negative': 0,
        'neutral': 1,
        'positive' : 2
    }[sentiment]

In [6]:
### Preprocessing ###

# To see the full content of cells
pd.set_option('display.max_colwidth', -1)

# Read data
data = pd.read_csv("./Tweets.csv")

# Normalize tweets
data['normalized_tweet'] = data.text.apply(normalizer)

# Construct bigrams and trigrams
data['grams'] = data.normalized_tweet.apply(ngrams)


In [11]:
### Data preparation ###

# Create inputs and outputs
count_vectorizer = CountVectorizer(ngram_range=(1,2))
vectorized_data = count_vectorizer.fit_transform(data.text)
X = hstack((np.array(range(0,vectorized_data.shape[0]))[:,None], vectorized_data))
y = data.airline_sentiment.apply(sentiment2target)

# Split training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)


In [12]:
### Model training ###

# Create a Naive Bayes Classifier
clf_nb = MultinomialNB()

# Train the classifier
clf_nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
### Model testing ###

y_pred = clf_nb.predict(X_test)

# Accuracy
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))

# F1 score
print("F1 Score: {:.2f}".format(f1_score(y_test, y_pred, average='micro') * 100))

# Confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 65.81%
F1 Score: 65.81
Confusion Matrix:
 [[1717   74   79]
 [ 456  138   20]
 [ 351   21   72]]
