<a href="https://colab.research.google.com/github/abhishekTP623/Twitter-Sentiment-Analysis/blob/main/Copy_of_SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Twitter sentiment dataset through kagglehub and keep the value it
# returns (presumably the local dataset directory -- verify against kagglehub docs).
saurabhshahane_twitter_sentiment_dataset_path = kagglehub.dataset_download('saurabhshahane/twitter-sentiment-dataset')

# Simple progress marker so it is obvious the download step finished.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.svm import SVC
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the dataset

In [None]:
# Locate the CSV inside the directory returned by kagglehub so the notebook
# also works where the dataset is not mounted under /kaggle/input (e.g. Colab).
# If the download cell was deleted (as its header allows), fall back to the
# original Kaggle path.
try:
    dataset_dir = saurabhshahane_twitter_sentiment_dataset_path
except NameError:
    dataset_dir = '/kaggle/input/twitter-sentiment-dataset'

twitter = pd.read_csv(os.path.join(dataset_dir, 'Twitter_Data.csv'))
twitter.head(5)

In [None]:
# Translate the numeric sentiment codes into readable class names.
sentiment_names = {-1: 'negative', 0: 'neutral', 1: 'positive'}
twitter['category'] = twitter['category'].replace(sentiment_names)

In [None]:
# Preview the first rows again to confirm the category codes were replaced by names.
twitter.head()

In [None]:
# Summarise the columns: dtypes, non-null counts and memory usage.
twitter.info()

In [None]:
# Count the missing values in each column.
twitter.isna().sum()

In [None]:
# Drop rows where either the tweet text or its sentiment label is missing.
# The frame is modified in place, so the original row index labels are kept
# (the index is not reset).
twitter.dropna(subset=['clean_text', 'category'], inplace=True)

# Creating a word cloud for each sentiment class to view its most frequent words

In [None]:
# Build one corpus string from all positive tweets. str.join is linear in the
# total text size, unlike repeated `+=` concatenation inside a loop.
positive_tweets = twitter.loc[twitter['category'] == "positive", 'clean_text']
text = ' '.join(positive_tweets.astype(str))

# Render the word cloud, ignoring common English stop words.
wordcloud = WordCloud(
    width=3000, height=2000, background_color='black',
    stopwords=set(nltk.corpus.stopwords.words("english")),
).generate(text)

fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()  # must be called; the bare attribute `plt.show` does nothing

# Release the large corpus string before the next cell builds another one.
del text

In [None]:
# Build one corpus string from all neutral tweets. str.join is linear in the
# total text size, unlike repeated `+=` concatenation inside a loop.
neutral_tweets = twitter.loc[twitter['category'] == "neutral", 'clean_text']
text = ' '.join(neutral_tweets.astype(str))

# Render the word cloud, ignoring common English stop words.
wordcloud = WordCloud(
    width=3000, height=2000, background_color='black',
    stopwords=set(nltk.corpus.stopwords.words("english")),
).generate(text)

fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()  # must be called; the bare attribute `plt.show` does nothing

# Release the large corpus string before the next cell builds another one.
del text

In [None]:
# Build one corpus string from all negative tweets. str.join is linear in the
# total text size, unlike repeated `+=` concatenation inside a loop.
negative_tweets = twitter.loc[twitter['category'] == "negative", 'clean_text']
text = ' '.join(negative_tweets.astype(str))

# Render the word cloud, ignoring common English stop words.
wordcloud = WordCloud(
    width=3000, height=2000, background_color='black',
    stopwords=set(nltk.corpus.stopwords.words("english")),
).generate(text)

fig = plt.figure(figsize=(40, 30), facecolor='k', edgecolor='k')

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()  # must be called; the bare attribute `plt.show` does nothing

# Release the large corpus string once the figure has been drawn.
del text

In [None]:
# Display how many tweets fall into each sentiment category.
print(twitter['category'].value_counts())

In [None]:
def distribution_plot(x, y, name):
    """Draw a bar chart of the values ``y`` for the categories ``x``.

    Args:
        x: Category labels placed on the horizontal axis.
        y: Bar heights, one per label in ``x``.
        name: Title shown above the chart.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    sns.barplot(x=x, y=y, ax=axes)
    axes.set_title(name)
    plt.show()


# Number of tweets per sentiment class, most frequent first.
dist = twitter['category'].value_counts()
distribution_plot(x=dist.index, y=dist.values, name='Class Distribution train')

# Using polarity and subjectivity.
Polarity refers to the sentiment expressed in a text, such as whether a statement is positive, negative, or neutral. In sentiment analysis, the polarity of a text is typically determined by analyzing the tone and emotion expressed in the language used. TextBlob reports polarity as a score from -1.0 (most negative) to 1.0 (most positive).

Subjectivity, on the other hand, refers to the extent to which a statement reflects personal opinions, feelings, or beliefs rather than factual information. Subjective text often includes personal perspectives and emotions, while objective text presents information without personal bias. TextBlob reports subjectivity as a score from 0.0 (fully objective) to 1.0 (fully subjective).

In [None]:
def pol(x):
    """Return the TextBlob polarity score of the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return the TextBlob subjectivity score of the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Analyse every tweet once and reuse the result for both columns; calling
# pol() and sub() separately would run TextBlob twice per tweet.
sentiments = [TextBlob(tweet).sentiment for tweet in twitter['clean_text']]

# Lists are assigned positionally, so the non-contiguous index left behind by
# the earlier dropna does not matter here.
twitter['polarity'] = [s.polarity for s in sentiments]
twitter['subjectivity'] = [s.subjectivity for s in sentiments]
twitter

In [None]:
# One histogram per TextBlob score: (column, bar colour, title, x-axis label).
histogram_specs = [
    ('polarity', 'skyblue', 'Distribution of Polarity', 'Polarity'),
    ('subjectivity', 'lightgreen', 'Distribution of Subjectivity', 'Subjectivity'),
]

for column, bar_colour, title, x_label in histogram_specs:
    plt.figure(figsize=(10, 6))
    plt.hist(twitter[column], bins=20, color=bar_colour, edgecolor='black')
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

# Using TF-IDF for text vectorization and logistic regression for model training

In [None]:
# Encode target labels (category) directly into numeric values
y = twitter['category'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Split the RAW text before vectorizing. Fitting TF-IDF on the full corpus
# would let vocabulary and IDF statistics from the test tweets leak into
# training and inflate the reported scores.
text_train, text_test, y_train, y_test = train_test_split(
    twitter['clean_text'], y, test_size=0.2, random_state=42
)

# Text Vectorization using TF-IDF: learn the vocabulary/IDF weights from the
# training tweets only, then apply the same transformation to the test tweets.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train a Logistic Regression classifier
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict on test set
y_pred = lr.predict(X_test)

# Evaluate the classifier
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))



# Training the model with Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes model on the same training features.
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Score the held-out tweets.
y_pred_nb = nb.predict(X_test)

# Collect each metric next to its label, then report them in order.
nb_report = [
    ("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb)),
    ("Naive Bayes F1 Score:", f1_score(y_test, y_pred_nb, average='weighted')),
    ("Naive Bayes Classification Report:\n", classification_report(y_test, y_pred_nb)),
    ("Naive Bayes Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb)),
]
for metric_label, metric_value in nb_report:
    print(metric_label, metric_value)

# We can see that Logistic Regression achieves higher accuracy than Naive Bayes on this dataset.