<a href="https://colab.research.google.com/github/aniqamirul/aiml/blob/main/day30miniproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ✈️ Airline Tweet Sentiment Classifier
Mini project using the `airline_tweets_sample.csv` dataset.

Steps:
- Load dataset
- Preprocess text
- Perform basic EDA
- Build sentiment classifier
- Evaluate using precision, recall, and F1-score

## 1. Install and import libraries

In [1]:
# Install required libraries
!pip install nltk wordcloud scikit-learn matplotlib seaborn

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# NLP tools
import nltk

# Resources needed further down:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases load `punkt_tab`
#                        instead of the pickled `punkt` models; fetching both
#                        keeps the notebook working on old and new versions)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Machine learning tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


## 2. Upload and load dataset

In [None]:
from google.colab import files

# Upload `airline_tweets_sample.csv`
uploaded = files.upload()

csv_name = 'airline_tweets_sample.csv'

# Colab does not overwrite an existing file on re-upload; it stores the new
# copy under a de-duplicated name (e.g. "airline_tweets_sample (1).csv") and
# uses that name as the key in `uploaded`. Prefer the file that was actually
# uploaded in this run so we never silently read a stale copy. If nothing was
# uploaded, fall back to the expected name already on disk.
if uploaded and csv_name not in uploaded:
    csv_name = next(
        (name for name in uploaded if name.lower().endswith('.csv')),
        csv_name,
    )

df = pd.read_csv(csv_name)

# Inspect first few rows and columns
display(df.head())
print("Columns:", df.columns.tolist())

# Fail early, with a readable message, if the columns used by the later
# cells ('text' for preprocessing, 'airline_sentiment' as the label) are absent.
missing_columns = {'text', 'airline_sentiment'} - set(df.columns)
assert not missing_columns, f"{csv_name} is missing required columns: {sorted(missing_columns)}"

## 3. Text preprocessing
Preprocessing steps:
- Lowercase
- Tokenization
- Remove non-alphabetic tokens
- Remove stopwords
- Lemmatization

The result is stored in the `clean_text` column.

In [None]:
# Built once at cell level so they are not rebuilt for every tweet.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps: lowercase -> tokenize -> keep purely alphabetic tokens ->
    drop English stopwords -> lemmatize (WordNet, default part of speech).

    Parameters
    ----------
    text : str or any scalar
        Raw tweet text. Non-string values are converted with ``str``.
        Missing values (``None``, ``NaN``, ``pd.NA``) yield an empty string.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the filters or the input is missing.
    """
    # Without this guard, str(NaN) / str(None) would become the literal
    # tokens "nan" / "none" and leak into the features and word clouds.
    if text is None or (not isinstance(text, str) and pd.isna(text)):
        return ""

    tokens = word_tokenize(str(text).lower())
    # One pass: filter non-alphabetic tokens and stopwords, then lemmatize.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

# The raw tweet body is expected to live in the 'text' column.
# Each tweet is cleaned independently and written to 'clean_text'.
df['clean_text'] = df['text'].map(preprocess_text)

# Side-by-side preview of raw vs. cleaned tweets.
df.loc[:, ['text', 'clean_text']].head()

## 4. Exploratory data analysis: sentiment distribution

In [None]:
# Bar chart of tweet counts per sentiment, most frequent class first.
fig, ax = plt.subplots(figsize=(8, 5))
sentiment_order = df['airline_sentiment'].value_counts().index
sns.countplot(data=df, x='airline_sentiment', order=sentiment_order, ax=ax)
ax.set_title('Sentiment distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

## 5. Wordclouds for each sentiment

In [None]:
# Name of the label column; reused by the modelling cells below.
sentiment_col = 'airline_sentiment'
# Distinct sentiment labels, ignoring rows with a missing label.
sentiments = df[sentiment_col].dropna().unique()

for s in sentiments:
    # Merge every cleaned tweet of this class into one string for the cloud.
    text = " ".join(df.loc[df[sentiment_col] == s, 'clean_text'])
    if not text.strip():
        # Nothing to draw for this class; skip instead of building an empty cloud.
        print(f"No text for sentiment: {s}")
        continue

    # A fixed random_state makes the word placement and colours reproducible
    # across runs (the layout is randomised otherwise).
    wc = WordCloud(
        width=800,
        height=400,
        background_color='white',
        random_state=42,
    ).generate(text)

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Wordcloud — {s}', fontsize=18)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

## 6. TF-IDF vectorization

In [None]:
# Keep only the 5000 highest-frequency terms as TF-IDF features.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF weights are computed with the test tweets too.
# Consider fitting on the training split only (e.g. via a Pipeline).
tfidf = TfidfVectorizer(max_features=5000)

cleaned_corpus = df['clean_text']
X = tfidf.fit_transform(cleaned_corpus)  # sparse document-term matrix
y = df[sentiment_col]                    # target labels

print("Feature matrix shape:", X.shape)

## 7. Train–test split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print("Train size:", n_train_rows)
print("Test size:", n_test_rows)

## 8. Train sentiment classification model (Logistic Regression)

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training features
# (`fit` returns the estimator itself, so construction and fitting chain),
# with the iteration cap set to 200.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predicted sentiment labels for the held-out tweets.
y_pred = model.predict(X_test)

## 9. Evaluation: precision, recall, F1-score for each sentiment

In [None]:
# Per-class precision, recall, F1-score and support on the test split.
report_text = classification_report(y_test, y_pred)
print(report_text)