# Sentiment analysis

In [1]:
# Install required packages
!pip install nltk seaborn scikit-learn --quiet

# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bmoha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# Load dataset
df = pd.read_csv("https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv")
df = df[['label', 'tweet']]
df.columns = ['label', 'text']
df = df[df['text'].notnull()]
df.head()

Unnamed: 0,label,text
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [3]:
# Balance dataset
min_count = df['label'].value_counts().min()
df = df.groupby('label').sample(n=min_count, random_state=42).reset_index(drop=True)

In [4]:
# Text preprocessing
stop_words = set(stopwords.words('english'))
def clean_text(text):
    text = re.sub(r"http\\S+|@\\S+|#\\S+", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower().split()
    return " ".join([word for word in text if word not in stop_words])

df['clean_text'] = df['text'].apply(clean_text)

In [5]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

In [6]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Model training
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [8]:
# Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8573021181716833
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       443
           1       0.88      0.84      0.86       454

    accuracy                           0.86       897
   macro avg       0.86      0.86      0.86       897
weighted avg       0.86      0.86      0.86       897



In [9]:
# Inference function
def predict_sentiment(text):
    cleaned = clean_text(text)
    vector = vectorizer.transform([cleaned])
    prediction = model.predict(vector)[0]
    return "Positive" if prediction == 0 else "Negative"

In [17]:
# Sample predictions
samples = [
    "I love this product!",
    "Terrible experience.",
    "Amazing performance by the actor.",
    "I hate this so much.",
    "Wonderful service, thank you!",
]

for s in samples:
    print(f"'{s}' => {predict_sentiment(s)}")

'I love this product!' => Positive
'Terrible experience.' => Positive
'Amazing performance by the actor.' => Positive
'I hate this so much.' => Negative
'Wonderful service, thank you!' => Positive
