In [10]:
# Load in training dataset (Airline sentiment tweet data)
import pandas as pd

airline_sentiment_df = pd.read_csv("hf://datasets/osanseviero/twitter-airline-sentiment/Tweets.csv")

In [6]:
airline_sentiment_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [13]:
#make the test and training splits
from sklearn.model_selection import train_test_split

airline_feats = airline_sentiment_df["text"]
airline_labels = airline_sentiment_df["airline_sentiment"]

X_train, X_test, y_train, y_test = train_test_split(airline_feats, airline_labels, test_size = 0.2)

In [15]:
#tokenize the tweets
#remove the @ in the tweet
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()


def _remove_airline_tok(tokens):
    return tokens[1:] if tokens[0].startswith('@') else tokens    

tokenizer = TweetTokenizer()

#Tokenizing training airline tweets
train_tweets = X_train.values
train_tokenized = [tokenizer.tokenize(tweet) for tweet in train_tweets]
train_clean = [_remove_airline_tok(tokens) for tokens in train_tokenized]

#Tokenizing testing airline tweets
test_tweets = X_test.values
test_tokenized = [tokenizer.tokenize(tweet) for tweet in test_tweets]
test_clean = [_remove_airline_tok(tokens) for tokens in test_tokenized]

In [16]:
# Get TF-IDF vectors after tokenizing
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()

# Convert tokenized tweets into strings that tf-idf vectorizer can actually use
train_clean_usable = [' '.join(tokens) for tokens in train_clean]
test_clean_usable = [' '.join(tokens) for tokens in test_clean]

train_tweets_tfidf = vectorizer.fit_transform(train_clean_usable)
test_tweets_tfidf = vectorizer.transform(test_clean_usable)

In [19]:
# KNN classifier with cosine similarity
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(train_tweets_tfidf, y_train)

sentiment_preds = knn.predict(test_tweets_tfidf)

print('Test accuracy: ', accuracy_score(y_test, sentiment_preds))
print('Test micro-F1: ', f1_score(y_test, sentiment_preds, average='micro'))
print('Test macro-F1: ', f1_score(y_test, sentiment_preds, average='macro'))
print()

Test accuracy:  0.7240437158469946
Test micro-F1:  0.7240437158469946
Test macro-F1:  0.6383852668115161



In [21]:
#report of the results:
from sklearn.metrics import classification_report

print(classification_report(y_test, sentiment_preds))

              precision    recall  f1-score   support

    negative       0.79      0.86      0.82      1833
     neutral       0.52      0.45      0.48       608
    positive       0.66      0.56      0.61       487

    accuracy                           0.72      2928
   macro avg       0.66      0.62      0.64      2928
weighted avg       0.71      0.72      0.72      2928

