In [1]:
# Sentiment Analysis for US Airlines tweets
# Step 1: Imports and dataset load
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

import joblib

In [2]:
import kagglehub

# Fetch the latest version of the CrowdFlower airline-tweets dataset.
# `path` receives the location of the downloaded dataset files and is
# reused when searching for the CSV file.
path = kagglehub.dataset_download(
    "crowdflower/twitter-airline-sentiment"
)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'twitter-airline-sentiment' dataset.
Path to dataset files: /kaggle/input/twitter-airline-sentiment


In [3]:
# Locate the tweets CSV: try a few filenames in the working directory first,
# then fall back to 'Tweets.csv' inside the kagglehub download folder (`path`).
candidates = [
    'Tweets.csv',
    'tweets.csv',
    'twitter-airline-sentiment.csv',
    os.path.join(path, 'Tweets.csv'),
]
# First candidate that exists on disk, or None when none of them do
fn = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
if fn is None:
    raise FileNotFoundError(
        "Dataset file not found. Please put 'Tweets.csv' in the same folder as this notebook or change the filename."
    )
df = pd.read_csv(fn)
print(f"Loaded dataset from '{fn}'")

print('Dataset shape:', df.shape)
print('Columns:', df.columns.tolist())

# Only the label column and the raw tweet text are needed from here on
required_columns = ['airline_sentiment', 'text']
if not all(column in df.columns for column in required_columns):
    raise KeyError("Expected columns 'airline_sentiment' and 'text' not found in the dataset.")
df = df[required_columns].copy()

# Number of tweets per sentiment label
print(df['airline_sentiment'].value_counts())

Loaded dataset from '/kaggle/input/twitter-airline-sentiment/Tweets.csv'
Dataset shape: (14640, 15)
Columns: ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64


In [5]:
# Step 2: Preprocess text - shared resources used by clean_text
import string

# NLTK data packages fetched up front: the stopword corpus plus the
# 'punkt' / 'punkt_tab' tokenizer packages.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stopwords as a set for fast membership tests, and one stemmer
# instance shared by every clean_text call.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

def clean_text(text):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Processing order: lowercase the text, delete URLs (anything starting with
    ``http`` or ``www.`` up to the next whitespace), tokenize with
    ``nltk.word_tokenize``, drop tokens found in ``stop_words`` and tokens
    consisting only of ``string.punctuation`` characters, then stem what is
    left with the module-level stemmer ``ps``.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase first so the stopword lookup and URL pattern see lowercase text
    without_urls = re.sub(r'http\S+|www\.\S+', '', text.lower())

    words = (raw.strip() for raw in nltk.word_tokenize(without_urls))
    kept = [
        ps.stem(word)
        for word in words
        # stopwords are filtered before stemming, on the unstemmed token
        if word not in stop_words
        # skip tokens made up purely of punctuation characters
        and not all(ch in string.punctuation for ch in word)
    ]
    return ' '.join(kept)

# Build the cleaned-text column by running clean_text on every tweet
# (this may take a short time)
print('Cleaning texts...')
df['text_cleaned'] = df['text'].map(clean_text)

# Show the first tweet before and after cleaning
print('Example cleaned text:')
first_row = df.iloc[0]
print(first_row['text'])
print(first_row['text_cleaned'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Cleaning texts...
Example cleaned text:
@VirginAmerica What @dhepburn said.
virginamerica dhepburn said


In [6]:
# Step 3: Feature extraction - TF-IDF
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed in Step 4, so its vocabulary/IDF statistics also
# cover tweets that later land in the test set.
vectorizer = TfidfVectorizer(max_features=3000)
tfidf_matrix = vectorizer.fit_transform(df['text_cleaned'])
X = tfidf_matrix.toarray()  # dense array version of the TF-IDF matrix

# Map the sentiment strings to integer class ids
le = LabelEncoder()
Y = le.fit_transform(df['airline_sentiment'])

print('Feature matrix shape:', X.shape)
# Pair each class name with the integer code the encoder assigns to it
class_codes = le.transform(le.classes_)
print('Encoded classes:', list(zip(le.classes_, class_codes)))


Feature matrix shape: (14640, 3000)
Encoded classes: [('negative', np.int64(0)), ('neutral', np.int64(1)), ('positive', np.int64(2))]


In [7]:
# Step 4: Hold out 20% of the rows (stratified by label) and fit Multinomial Naive Bayes
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

# Multinomial Naive Bayes with default hyper-parameters
nb = MultinomialNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
print('Naive Bayes Accuracy:', nb_accuracy)

print('\nClassification Report (NB):')
nb_report = classification_report(y_test, y_pred_nb, target_names=le.classes_)
print(nb_report)


Train shape: (11712, 3000) Test shape: (2928, 3000)
Naive Bayes Accuracy: 0.7346311475409836

Classification Report (NB):
              precision    recall  f1-score   support

    negative       0.73      0.98      0.84      1835
     neutral       0.70      0.27      0.39       620
    positive       0.84      0.40      0.54       473

    accuracy                           0.73      2928
   macro avg       0.76      0.55      0.59      2928
weighted avg       0.74      0.73      0.69      2928



In [8]:
# Step 5: Fit a Random Forest on the same train split and score it on the same test split
rf = RandomForestClassifier(random_state=2).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('Random Forest Accuracy:', rf_accuracy)

print('\nClassification Report (RF):')
rf_report = classification_report(y_test, y_pred_rf, target_names=le.classes_)
print(rf_report)


Random Forest Accuracy: 0.7575136612021858

Classification Report (RF):
              precision    recall  f1-score   support

    negative       0.78      0.94      0.85      1835
     neutral       0.66      0.38      0.48       620
    positive       0.74      0.55      0.64       473

    accuracy                           0.76      2928
   macro avg       0.73      0.62      0.65      2928
weighted avg       0.75      0.76      0.74      2928



In [9]:
# Step 6: Persist the fitted vectorizer, both classifiers and the label encoder
# Output filename -> fitted object, in the order they are written and reported
artifacts = {
    'tfidf_vectorizer.joblib': vectorizer,
    'naive_bayes_model.joblib': nb,
    'random_forest_model.joblib': rf,
    'label_encoder.joblib': le,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)
print('Saved: ' + ', '.join(artifacts))


Saved: tfidf_vectorizer.joblib, naive_bayes_model.joblib, random_forest_model.joblib, label_encoder.joblib
