In [None]:
import re
import spacy
from nltk.tokenize import word_tokenize,sent_tokenize
import nltk
from nltk.corpus import wordnet,stopwords
import pandas as pd
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span as sp
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix



# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
nlp=spacy.load('en_core_web_sm')

# Download the NLTK resources used by this notebook. 'stopwords' backs the
# stop-word filter in the preprocessing cells.
# NOTE(review): 'punkt_tab' and 'wordnet' are fetched, but the tokenizer and
# WordNet imports are never called in the visible cells — confirm they are needed.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
import kagglehub

# Fetch the latest published version of the dataset; `path` is the location
# kagglehub reports for the downloaded files.
path = kagglehub.dataset_download(
    "jp797498e/twitter-entity-sentiment-analysis"
)

print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/twitter-entity-sentiment-analysis


In [None]:
import os

# Build the CSV location from the directory returned by kagglehub (`path`)
# rather than a hard-coded cache path, so this cell works wherever the dataset
# was actually downloaded (e.g. /kaggle/input/... or ~/.cache/kagglehub/...).
training_csv = os.path.join(path, "twitter_training.csv")

# Read without treating the first row as a header (header=None), then replace
# the positional column labels 0..3 with descriptive names.
data = pd.read_csv(training_csv, header=None)
data = data.rename(columns={0: 'id', 1: 'Company', 2: 'Sentiment', 3: 'Tweet'})

In [None]:
# Number of missing values in each column.
data.isna().sum()

Unnamed: 0,0
id,0
Company,0
Sentiment,0
Tweet,686


In [None]:
# Drop every row that has a missing value. Reassigning (instead of mutating
# with inplace=True) keeps the step explicit and avoids modifying a frame that
# other references may still point at.
data = data.dropna()

In [None]:


# Distinct sentiment labels present after dropping incomplete rows.
data.Sentiment.unique()


array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [None]:
# Keep only rows whose label is neither 'Irrelevant' nor 'Neutral', leaving a
# binary Positive / Negative problem.
excluded_labels = ['Irrelevant', 'Neutral']
data = data[~data['Sentiment'].isin(excluded_labels)]

# Confirm which labels remain.
data['Sentiment'].unique()

array(['Positive', 'Negative'], dtype=object)

In [None]:
# Raw tweet texts as a NumPy array, in the same row order as `data`.
tweets = data['Tweet'].to_numpy()


TEXT PREPROCESSING

In [None]:
# Accumulator for the cleaned tweets produced by the preprocessing loop.
processed_tweets = list()

# Porter stemmer and English stop-word set shared by every tweet.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [None]:
# Start from an empty list every time this cell runs, so re-executing it does
# not append a second copy of every tweet to `processed_tweets`.
processed_tweets = []

# Matches any character that is neither a word character nor whitespace.
punctuation_pattern = re.compile(r'[^\w\s]')

for tweet in tweets:
  # Strip punctuation, then lowercase.
  # NOTE(review): punctuation is removed before the stop-word check, so any
  # stop words that contain an apostrophe can no longer match — confirm this
  # ordering is intended.
  tweet = punctuation_pattern.sub('', str(tweet)).lower()

  # str.split() never yields empty strings, so no extra length check is needed.
  words = [word for word in tweet.split() if word not in stop_words]
  stemmed_words = [stemmer.stem(word) for word in words]

  processed_tweet = ' '.join(stemmed_words)
  processed_tweets.append(processed_tweet)


In [None]:
# Spot-check one cleaned tweet (tenth from the end) to eyeball the preprocessing.
processed_tweets[-10]

'let elit go unnot nvidia highlight automat record best moment fennitegam gfn'

TF-IDF

In [None]:
# Fit the TF-IDF vocabulary on the cleaned tweets and encode them. The result
# is a SciPy sparse matrix: one row per tweet, one column per vocabulary term.
tfidf = TfidfVectorizer()
enc_tweets = tfidf.fit_transform(processed_tweets)

# Wrap the matrix in a sparse-backed DataFrame for inspection. Calling
# .toarray() here would materialise the full (tweets x vocabulary) dense array
# just to preview five rows.
df = pd.DataFrame.sparse.from_spmatrix(
    enc_tweets,
    columns=tfidf.get_feature_names_out(),
)
df.head()

Unnamed: 0,00,000,00011,00014,00015,00015cant,00016,00054,00105,00107,...,это,юууу,ясс,اunk,اللعبه,حبيت,خلاص,خلاصunk,٥υ,ℐℓ٥
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Dense feature matrix for the classifier.
x = enc_tweets.toarray()

# One-hot encode the labels as integers and keep every indicator column except
# the first, which leaves a single (n, 1) column as the binary target.
sentiment_dummies = pd.get_dummies(data['Sentiment']).astype(int)
y = sentiment_dummies.to_numpy()[:, 1:]

In [None]:
# Display the dense TF-IDF feature matrix (NumPy truncates the repr).
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Display the target; it is a 2-D column array with one label per row.
y

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)

In [None]:
# Number of training rows.
x_train.shape[0]

34410

NAIVE BAYES MODEL

In [None]:
# Multinomial Naive Bayes classifier trained on the TF-IDF features.
model = MultinomialNB()

# `y_train` is an (n, 1) column vector. Passing it as-is makes scikit-learn
# emit a DataConversionWarning (via column_or_1d), so flatten it to 1-D here.
model.fit(x_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


In [None]:
# Predicted labels for the held-out test rows.
y_pred=model.predict(x_test)

In [None]:
# Display the true test labels (2-D column array, same layout as `y`).
y_test

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

EVALUATING

In [None]:
# Compare true and predicted labels element-wise instead of looping in Python.
# np.ravel flattens both sides, so this works whether the targets are stored
# as (n,) or as an (n, 1) column (the original loop required the latter).
matches = np.ravel(y_test) == np.ravel(y_pred)

correct = int(matches.sum())
wrong = int(matches.size) - correct

# Fraction of test rows predicted correctly.
accuracy = correct / (correct + wrong)
print(f" Testing Accuracy: {accuracy*100}")

 Testing Accuracy: 88.07392769963967


In [None]:
# Accuracy on the training split, for comparison with the test accuracy above
# (a large gap would indicate overfitting).
y_pred_train = model.predict(x_train)

# Flatten the (n, 1) target so both arguments are 1-D label arrays.
train_accuracy = accuracy_score(y_train.ravel(), y_pred_train)

# The original label had a stray second colon ("Training Accuracy: :").
print(f"Training Accuracy: {train_accuracy*100}")

Training Accuracy: :90.10462074978204


In [None]:
# Per-class precision / recall / F1 on the held-out test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4482
           1       0.90      0.85      0.87      4121

    accuracy                           0.88      8603
   macro avg       0.88      0.88      0.88      8603
weighted avg       0.88      0.88      0.88      8603

