# Step 1: Setup and Data Loading

In [2]:
!pip install pandas nltk gensim scikit-learn

import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

# Load the dataset (assuming it's named 'Tweets.csv')
df = pd.read_csv('Tweets.csv')
print(df[['airline_sentiment', 'text']].head())



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Aratt\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


  airline_sentiment                                               text
0           neutral                @VirginAmerica What @dhepburn said.
1          positive  @VirginAmerica plus you've added commercials t...
2           neutral  @VirginAmerica I didn't today... Must mean I n...
3          negative  @VirginAmerica it's really aggressive to blast...
4          negative  @VirginAmerica and it's a really big bad thing...


# Step 2: Text Preprocessing

In [4]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove mentions (@username) and hashtags
    text = re.sub(r'@\w+|\#\w+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Expand contractions
    contractions = {
        "don't": "do not",
        "can't": "cannot",
        "won't": "will not",
        "it's": "it is",
        "i'm": "i am",
        "you're": "you are",
        "they're": "they are",
        "we're": "we are",
        "that's": "that is",
        "what's": "what is",
        "where's": "where is",
        "there's": "there is",
        "who's": "who is",
        "how's": "how is",
        "let's": "let us",
        "he's": "he is",
        "she's": "she is",
        "ain't": "am not",
        "wouldn't": "would not",
        "shouldn't": "should not",
        "couldn't": "could not",
        "doesn't": "does not",
        "didn't": "did not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "mustn't": "must not",
        "mightn't": "might not",
        "needn't": "need not",
        "shan't": "shall not",
        "i've": "i have",
        "you've": "you have",
        "we've": "we have",
        "they've": "they have",
        "i'd": "i would",
        "you'd": "you would",
        "he'd": "he would",
        "she'd": "she would",
        "we'd": "we would",
        "they'd": "they would",
        "i'll": "i will",
        "you'll": "you will",
        "he'll": "he will",
        "she'll": "she will",
        "we'll": "we will",
        "they'll": "they will"
    }
    for cont, expanded in contractions.items():
        text = text.replace(cont, expanded)
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in lemmatized_tokens if word not in stop_words]
    
    # Join tokens back to string
    processed_text = ' '.join(filtered_tokens)
    
    return processed_text

# Apply preprocessing to all tweets
df['processed_text'] = df['text'].apply(preprocess_text)
print(df[['text', 'processed_text']].head())

                                                text  \
0                @VirginAmerica What @dhepburn said.   
1  @VirginAmerica plus you've added commercials t...   
2  @VirginAmerica I didn't today... Must mean I n...   
3  @VirginAmerica it's really aggressive to blast...   
4  @VirginAmerica and it's a really big bad thing...   

                                      processed_text  
0                                               said  
1       plus youve added commercial experience tacky  
2       didnt today must mean need take another trip  
3  really aggressive blast obnoxious entertainmen...  
4                               really big bad thing  


# Step 3: Load Word2Vec Model

In [6]:
# Load the pre-trained Word2Vec model
print("Loading Word2Vec model...")
w2v_model = api.load('word2vec-google-news-300')
print("Model loaded successfully!")

Loading Word2Vec model...
Model loaded successfully!


# Step 4: Vectorize Tweets

In [12]:
def tweet_to_vector(tweet, model, vector_size=300):
    words = tweet.split()
    vector = np.zeros(vector_size)
    count = 0
    
    for word in words:
        if word in model:
            vector += model[word]
            count += 1
    
    if count != 0:
        vector /= count
    
    return vector

# Vectorize all tweets
X = np.array([tweet_to_vector(tweet, w2v_model) for tweet in df['processed_text']])
y = df['airline_sentiment'].values

print(f"Shape of feature matrix: {X.shape}")

Shape of feature matrix: (14640, 300)


# Step 5: Train-Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 11712
Test set size: 2928


# Step 6: Train Logistic Regression Model

In [18]:
# Initialize and train the model
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy:.4f}")



Test accuracy: 0.7770


# Step 7: Prediction Function

In [21]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    # Preprocess the tweet
    processed_tweet = preprocess_text(tweet)
    
    # Convert to vector
    tweet_vector = tweet_to_vector(processed_tweet, w2v_model)
    
    # Reshape for single prediction
    tweet_vector = tweet_vector.reshape(1, -1)
    
    # Predict sentiment
    sentiment = model.predict(tweet_vector)[0]
    
    return sentiment

# Example usage
sample_tweet = "The flight was amazing and the crew was very friendly!"
predicted_sentiment = predict_tweet_sentiment(lr_model, w2v_model, sample_tweet)
print(f"Predicted sentiment: {predicted_sentiment}")

Predicted sentiment: positive
