# Advanced Twitter Sentiment Analysis
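This notebook performs sentiment analysis on Twitter data using Python. It covers text preprocessing (removing URLs, mentions, and emojis, dropping stop words, and lemmatizing), TF-IDF feature extraction, training logistic-regression classifiers (with and without two hand-crafted features), and evaluation with a classification report and confusion matrix.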

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import emoji
import warnings
# NOTE(review): this blanket filter also hides scikit-learn ConvergenceWarning
# and pandas warnings; consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# NLTK resources used by the preprocessing cell: the English stop-word list and
# WordNet for lemmatization. 'omw-1.4' is fetched as well because some NLTK
# releases raise LookupError from WordNetLemmatizer when it is missing; the
# extra download is harmless on versions that do not need it.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)


In [None]:
# Sample dataset location; point DATA_URL at your own Twitter CSV to swap it out.
DATA_URL = 'https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv'

# Keep only the label and the tweet body, exposing the tweet under the name 'text'.
df = pd.read_csv(DATA_URL)[['label', 'tweet']].rename(columns={'tweet': 'text'})
df.head()


In [None]:
# Shared preprocessing resources: a WordNet lemmatizer and the NLTK English
# stop-word list, stored as a set so membership tests in clean_text are fast.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: strip URLs, @mentions, '#' symbols and emojis; drop every
    character that is not an ASCII letter or whitespace; lowercase; split on
    whitespace; discard NLTK English stop words; lemmatize what remains.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated as an empty tweet.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # Missing values arrive from pandas as float NaN; re.sub would raise TypeError.
        return ""
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # remove mentions
    text = text.replace('#', '')  # drop the hashtag symbol, keep the word
    text = emoji.replace_emoji(text, replace='')  # remove emojis
    # Remove punctuation/digits but keep ALL whitespace (not only ' '), so words
    # separated by a newline or tab are not fused into a single token.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()  # lowercase
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(tokens)

# Clean every tweet and store the result alongside the raw text.
df['cleaned_text'] = df['text'].map(clean_text)

# Preview raw vs. cleaned text for the first rows.
df.head()[['text', 'cleaned_text']]


In [None]:
# Vectorize the cleaned tweets with a vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tfidf.fit_transform(df['cleaned_text'])

# Dense feature matrix and the target labels used by the modelling cells.
X = tfidf_sparse.toarray()
y = df['label']


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: default logistic regression trained on the TF-IDF features only.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix drawn on an explicit Axes (rows = actual, columns = predicted).
conf_matrix = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Add length of the cleaned tweet (characters) and number of exclamation marks
# in the raw tweet as new features. The raw values stay on df for inspection.
df['text_length'] = df['cleaned_text'].str.len()
df['num_exclaims'] = df['text'].str.count('!')

EXTRA_FEATURES = ['text_length', 'num_exclaims']

# Standardize the extra features before mixing them with TF-IDF values (which
# lie in [0, 1]): raw counts on a much larger scale hurt lbfgs convergence and
# are penalized unevenly by the L2 term. Statistics come from the training rows
# only, so nothing about the test rows leaks into the features.
# train_test_split's shuffle depends only on the row count and random_state, so
# splitting row positions here selects the same rows as the split further down
# (checked by the assert below).
train_rows, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)
extra = df[EXTRA_FEATURES].astype(float)
train_mean = extra.iloc[train_rows].mean()
# A constant column has std 0; divide by 1 instead so it stays finite.
train_std = extra.iloc[train_rows].std(ddof=0).replace(0, 1.0)
extra_scaled = (extra - train_mean) / train_std

# Merge with TF-IDF features
features_df = pd.DataFrame(X, columns=tfidf.get_feature_names_out())
for col in EXTRA_FEATURES:
    features_df[col] = extra_scaled[col].to_numpy()

X_train2, X_test2, y_train2, y_test2 = train_test_split(features_df, y, test_size=0.2, random_state=42)
assert np.array_equal(X_train2.index.to_numpy(), train_rows), \
    "scaling statistics were computed on rows that differ from the training split"

model2 = LogisticRegression()
model2.fit(X_train2, y_train2)
y_pred2 = model2.predict(X_test2)

print("Classification Report with Additional Features:")
print(classification_report(y_test2, y_pred2))

# Same axis labelling as the baseline confusion matrix so the two plots compare directly.
sns.heatmap(confusion_matrix(y_test2, y_pred2), annot=True, fmt='d', cmap='Greens')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix with Additional Features")
plt.show()
