In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import gensim.downloader as gensim_api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from contractions import fix as expand_contractions

# Fetch the NLTK resources used below: tokenizer models, the stopword list,
# and the WordNet data required by the lemmatizer.
nltk.download("punkt")
# Newer NLTK releases load the tokenizer from "punkt_tab" instead of "punkt";
# requesting both keeps word_tokenize working across versions.
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Keep only the label and text columns, and drop rows missing either one.
data = pd.read_csv("Tweets.csv")[["airline_sentiment", "text"]].dropna()

# A set gives O(1) membership tests when filtering tokens.
stopword_set = set(stopwords.words("english"))
lemmatize = WordNetLemmatizer()

def clean_text(raw_text):
    """Normalize a tweet and return its lemmatized content tokens.

    The text is lowercased, URLs and @mentions are removed, contractions are
    expanded, punctuation is stripped, and the result is tokenized. Only
    purely alphabetic tokens that are not in the module-level
    ``stopword_set`` are kept, and each kept token is lemmatized.

    Args:
        raw_text: The tweet text as a string.

    Returns:
        A list of lemmatized tokens (possibly empty).
    """
    text = raw_text.lower()
    # The dot after "www" is escaped so only a literal "www." prefix is
    # treated as a URL (an unescaped "." would match any character).
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    # Expand contractions while the apostrophes are still present; stripping
    # punctuation first would turn "can't" into "cant" before expansion.
    text = expand_contractions(text)
    # Drops every non-word, non-space character, which also covers "#".
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    # Stopwords are filtered before lemmatizing, matching on the surface form.
    return [
        lemmatize.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stopword_set
    ]

# Load the pre-trained "word2vec-google-news-300" embeddings through gensim's
# downloader API. The message is printed first, presumably because the load
# can be slow (NOTE(review): confirm download/caching behavior).
print("Fetching pre-trained Google News Word2Vec model...")
word2vec = gensim_api.load("word2vec-google-news-300")

def tweet_to_vector(tweet, embedding_model):
    """Embed a tweet as the average of its known word vectors.

    The tweet is cleaned with ``clean_text``; every resulting token that is
    present in ``embedding_model`` contributes its vector to the average.

    Args:
        tweet: Raw tweet text.
        embedding_model: Object supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the matched vectors, or an all-zero vector
        of length ``embedding_model.vector_size`` when no token matches.
    """
    known_vectors = []
    for token in clean_text(tweet):
        if token in embedding_model:
            known_vectors.append(embedding_model[token])

    # No in-vocabulary tokens: fall back to a zero vector of the right size.
    if not known_vectors:
        return np.zeros(embedding_model.vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every tweet as one feature row; labels come straight from the
# "airline_sentiment" column.
X_features = np.array([tweet_to_vector(tweet, word2vec) for tweet in data["text"]])
y_labels = data["airline_sentiment"].values

# Hold out 20% of the rows, stratified on the label so both splits keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_labels, test_size=0.2, stratify=y_labels, random_state=42
)

# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and the lbfgs solver already fits a multinomial
# model when the target has more than two classes.
classifier = LogisticRegression(solver="lbfgs", max_iter=1000)
classifier.fit(X_train, y_train)

# Accuracy is measured on the held-out split only.
predictions = classifier.predict(X_val)
print("Test Set Accuracy:", accuracy_score(y_val, predictions))

def get_sentiment(trained_model, vector_model, tweet_text):
    """Predict the label of a single tweet.

    Args:
        trained_model: Fitted estimator exposing ``predict``.
        vector_model: Embedding model forwarded to ``tweet_to_vector``.
        tweet_text: Raw tweet text to classify.

    Returns:
        The first (and only) element of the estimator's prediction.
    """
    features = tweet_to_vector(tweet_text, vector_model)
    # predict() expects a 2-D input, so present the vector as a single row.
    single_row = features.reshape(1, -1)
    predicted = trained_model.predict(single_row)
    return predicted[0]

# Quick end-to-end check: classify one hand-written tweet with the fitted
# classifier and the loaded embeddings, then print the predicted label.
sample = "I love how quickly Delta rebooked my cancelled flight!"
print("Predicted Sentiment:", get_sentiment(classifier, word2vec, sample))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Fetching pre-trained Google News Word2Vec model...




Test Set Accuracy: 0.7687841530054644
Predicted Sentiment: positive
