In [1]:
# Load in training dataset (Airline sentiment tweet data)
import pandas as pd
import numpy as np
from datasets import load_dataset

airline_df = pd.read_csv("hf://datasets/osanseviero/twitter-airline-sentiment/Tweets.csv")



# Load in the main Twitter corpus
# twitter_df = pd.read_csv("training.1600000.processed.noemoticon.csv")
#  Error: 'utf-8' codec can't decode bytes in position 232719-232720: invalid continuation byte

In [2]:
display(airline_df)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)


In [3]:
# Helpful resource for multinomial regression: 
# https://machinelearningmastery.com/multinomial-logistic-regression-with-python/
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer 


# Split the airline data into training and test splits

air_feats = airline_df["text"]
air_labels = airline_df["airline_sentiment"]

feat_train, feat_test, label_train, label_test = train_test_split(air_feats, air_labels, test_size = 0.8)

# Tokenize the airline tweets in either split

def _remove_airline_tok(tokens):
    return tokens[1:] if tokens[0].startswith('@') else tokens    

tokenizer = TweetTokenizer()

#Tokenizing training airline tweets
train_tweets = feat_train.values
train_tweets_tokenized = [tokenizer.tokenize(tweet) for tweet in train_tweets]
train_tweets_clean = [_remove_airline_tok(tokens) for tokens in train_tweets_tokenized]

#Tokenizing testing airline tweets
test_tweets = feat_test.values
test_tweets_tokenized = [tokenizer.tokenize(tweet) for tweet in test_tweets]
test_tweets_clean = [_remove_airline_tok(tokens) for tokens in test_tweets_tokenized]


In [4]:
# loading word2vec vectors

# Useful resource on word2vec 
# https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca

import os
from gensim.models import KeyedVectors

w2v_file = 'GoogleNews-vectors-negative300.bin.gz'
w2v_file = os.path.expanduser(w2v_file)


w2v_vectors = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
print('done loading.')

done loading.


In [6]:
# Training multinomial regression using word2vec vectors

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Helpful guide: https://medium.com/swlh/sentiment-classification-using-word-embeddings-word2vec-aedf28fbb8ca



# Functioning for creating the average word2vec vector for each tweet 
def tweet_to_avg_vector(tweet_tokens):
    vectors = []
    for word in tweet_tokens:
        if word in w2v_vectors:
            vectors.append(w2v_vectors[word])
        else:
            vectors.append(np.zeros(300))  # If word is not in word2Vec, use zero vector
    return np.mean(vectors, axis=0)  # Average word vectors to represent the whole sentence 

# Convert training tweets to their average Word2Vec vector representations
x_train_vectors = np.array([tweet_to_avg_vector(tweet) for tweet in train_tweets_clean])

# Convert testing tweets to their average Word2Vec vector representations
x_test_vectors = np.array([tweet_to_avg_vector(tweet) for tweet in test_tweets_clean])




# Train a multinomial logistic regression model (using word2vec vectors)
mr = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
mr.fit(x_train_vectors, label_train)

# Get predictions then evaluate performance on test set
label_pred = mr.predict(x_test_vectors)
print(classification_report(label_test, label_pred))



              precision    recall  f1-score   support

    negative       0.77      0.96      0.86      7322
     neutral       0.68      0.35      0.46      2486
    positive       0.81      0.52      0.63      1904

    accuracy                           0.76     11712
   macro avg       0.75      0.61      0.65     11712
weighted avg       0.76      0.76      0.74     11712



In [12]:
# Training multinomial regression using TF-IDF vectors 

# Get TF-IDF vectors after tokenizing
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()

# Convert tokenized tweets into strings that tf-idf vectorizer can actually use
train_tweets_clean_joined = [' '.join(tokens) for tokens in train_tweets_clean]
test_tweets_clean_joined = [' '.join(tokens) for tokens in test_tweets_clean]

train_tweets_tfidf = vectorizer.fit_transform(train_tweets_clean_joined)
test_tweets_tfidf = vectorizer.transform(test_tweets_clean_joined)


# Train a multinomial logistic regression model
mr = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
mr.fit(train_tweets_tfidf, label_train)

# Get predictions then evaluate performance on test set
label_pred = mr.predict(test_tweets_tfidf)
print(classification_report(label_test, label_pred))


              precision    recall  f1-score   support

    negative       0.75      0.97      0.85      7322
     neutral       0.71      0.36      0.48      2486
    positive       0.85      0.45      0.59      1904

    accuracy                           0.75     11712
   macro avg       0.77      0.59      0.64     11712
weighted avg       0.76      0.75      0.72     11712



In [13]:
# Comparing the averages between the two models, it seems that word2vec performs slightly better
# but it's a very small difference 

"""
word2vec:
    accuracy                           0.76     11712
   macro avg       0.75      0.61      0.65     11712
weighted avg       0.76      0.76      0.74     11712


tf-idf:
    accuracy                           0.75     11712
   macro avg       0.77      0.59      0.64     11712
weighted avg       0.76      0.75      0.72     11712
"""


'\nword2vec:\n    accuracy                           0.76     11712\n   macro avg       0.75      0.61      0.65     11712\nweighted avg       0.76      0.76      0.74     11712\n\n\ntf-idf:\n    accuracy                           0.75     11712\n   macro avg       0.77      0.59      0.64     11712\nweighted avg       0.76      0.75      0.72     11712\n'