# Sentiment Analysis Project

## Data Collection and Preprocessing

In [4]:
import pandas as pd
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re
from langdetect import detect_langs

In [5]:
df = pd.read_csv("Tweets.csv")

In [6]:
def removeUrls(text):
    text = re.sub(r'\s?([@#][\w_-]+)', '', text)
    return re.sub(r'http\S+', '', text)

In [7]:
labels = df['airline_sentiment']
text = df['text']
text_processed = df['text'].apply(removeUrls)

## Stemming

#### Stemming resulted in a lower F1 score. As a result, its usage was discarded.

In [8]:
# ps = PorterStemmer()
# print(text_tokenized[29])
# for i in range(len(text_tokenized)):
#     new_sentence = ''
#     for token in (text_tokenized[i].split(' ')):
#         new_sentence += ps.stem(token) + ' '
#     text_tokenized[i] = new_sentence
# print(text_tokenized[29])

## Filtering the tweets

In [9]:
def getEnProb(text):
    langs = detect_langs(text)
    en_prob = 0.0
    for lang in langs:
        if lang.lang == 'en':
            en_prob = lang.prob
            break
    return en_prob

    #Remove RT
    #Remove Tweets with lenght < 20
    #Remove Tweets that contain less than 85% of its content not in the English Language
def applyFilters(features, classes):
    filtered_features = []
    filtered_classes = []

    for i in range(len(features)):
        rt_filter = ' RT ' not in features[i]
        length_filter = len(features[i]) >= 20
        #this filter takes a little bit more time (about 4 mins)
        en_prob_filter = getEnProb(features[i]) >= 0.85
        if rt_filter and length_filter and en_prob_filter:
            filtered_features.append(features[i])
            filtered_classes.append(classes[i])
    return filtered_features, filtered_classes

##  MultinomialNB Model Fitting and Evaluating

In [10]:
def create_mnb_model_and_eval(features, classes, apply_filter):
    if apply_filter:
        features, classes = applyFilters(features, classes)
        
    tweet_tokenizer = TweetTokenizer()
    tfidf_vectorizer = TfidfVectorizer(norm = None, tokenizer = tweet_tokenizer.tokenize ,analyzer = 'word', stop_words = 'english')
    X = tfidf_vectorizer.fit_transform(features)
    Y = classes
    
    split_index = int(0.8*df.shape[0])
    training_X = X[:split_index]
    training_Y = Y[:split_index]
    testing_X = X[split_index:]
    testing_Y = Y[split_index:]
    
    multinomialNB_model = MultinomialNB()
    multinomialNB_model.fit(training_X, training_Y)
    predictionNB = multinomialNB_model.predict(testing_X)
    return f1_score(testing_Y, predictionNB, average='micro')


In [13]:
mnb_unfiltered = create_mnb_model_and_eval(text_processed, labels, False)
mnb_filtered = create_mnb_model_and_eval(text_processed, labels, True)
print("MultinomialNB Model w/o filtering f1-measure:", mnb_unfiltered)
print("MultinomialNB Model w/ filtering f1-measure:", mnb_filtered)

MultinomialNB Model w/o filtering f1-measure: 0.7237021857923497
MultinomialNB Model w/ filtering f1-measure: 0.739297475301866


##  kNN Model Fitting and Evaluating

In [11]:
def create_knn_model_and_eval(features, classes, apply_filter):
    if apply_filter:
        features, classes = applyFilters(features, classes)
        
    tweet_tokenizer = TweetTokenizer()
    tfidf_vectorizer = TfidfVectorizer(norm = None, tokenizer = tweet_tokenizer.tokenize ,analyzer = 'word', stop_words = 'english')
    X = tfidf_vectorizer.fit_transform(features)
    Y = classes
    
    split_index = int(0.8*df.shape[0])
    training_X = X[:split_index]
    training_Y = Y[:split_index]
    testing_X = X[split_index:]
    testing_Y = Y[split_index:]
    
    kNeighborsClassifier_model = KNeighborsClassifier(n_neighbors=3)
    kNeighborsClassifier_model.fit(training_X, training_Y) 
    kNeighborsClassifier_model.score(testing_X, testing_Y)
    predictionKNN = kNeighborsClassifier_model.predict(testing_X)
    return f1_score(testing_Y, predictionKNN, average='micro')

In [14]:
knn_unfiltered = create_knn_model_and_eval(text_processed, labels, False)
knn_filtered = create_knn_model_and_eval(text_processed, labels, True)
print("kNN Model w/o filtering f1-measure:", knn_unfiltered)
print("kNN Model w/ filtering f1-measure:", knn_filtered)

kNN Model w/o filtering f1-measure: 0.5420081967213115
kNN Model w/ filtering f1-measure: 0.6118166758696852


##  Random Forest Model Fitting and Evaluating

In [12]:
def create_rfc_model_and_eval(features, classes, apply_filter):
    if apply_filter:
        features, classes = applyFilters(features, classes)
        
    tweet_tokenizer = TweetTokenizer()
    tfidf_vectorizer = TfidfVectorizer(norm = None, tokenizer = tweet_tokenizer.tokenize ,analyzer = 'word', stop_words = 'english')
    X = tfidf_vectorizer.fit_transform(features)
    Y = classes
    
    split_index = int(0.8*df.shape[0])
    training_X = X[:split_index]
    training_Y = Y[:split_index]
    testing_X = X[split_index:]
    testing_Y = Y[split_index:]
    
    randomForestClassifier_model = RandomForestClassifier(random_state=0)
    randomForestClassifier_model.fit(training_X, training_Y)
    predictionRF = randomForestClassifier_model.predict(testing_X)
    return f1_score(testing_Y, predictionRF, average='micro')

In [15]:
rfc_unfiltered = create_rfc_model_and_eval(text_processed, labels, False)
rfc_filtered = create_rfc_model_and_eval(text_processed, labels, True)
print("RFC Model w/o filtering f1-measure:", rfc_unfiltered)
print("RFC Model w/ filtering f1-measure:", rfc_filtered)

RFC Model w/o filtering f1-measure: 0.7780054644808743
RFC Model w/ filtering f1-measure: 0.8170663692136084
