In [2]:
import pandas as pd
import pymongo
import torch
import tqdm as notebook_tqdm
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create a client for the MongoDB server at localhost:27017.
client = pymongo.MongoClient('localhost', 27017)

In [4]:
client.list_database_names()

['admin', 'config', 'db', 'local', 'project', 'test', 'tweets']

In [5]:
# Work against the `project` database on that server.
db = client.project

In [6]:
# Query the `palestine` collection with an empty filter (every document) and
# materialise the cursor into an in-memory list.
cursor = db.palestine.find({})
data = [document for document in cursor]

In [7]:
# Peek at the first two documents only; echoing the whole list floods the
# notebook output without adding information.
data[:2]

[{'_id': ObjectId('6620ef9632d7e0ed64baf959'),
  'url': 'https://x.com/palestine/status/1780686755894858024',
  'twitterUrl': 'https://twitter.com/palestine/status/1780686755894858024',
  'id': '1780686755894858024',
  'text': "RT @QudsNen: BREAKING| Israel's national security minister, Itmar Ben Gvir, said he believes that death penalty is the right solution to th…",
  'retweetCount': 2952,
  'replyCount': 610,
  'likeCount': 3735,
  'quoteCount': 576,
  'createdAt': 'Wed Apr 17 19:56:20 +0000 2024',
  'bookmarkCount': 383,
  'isRetweet': True,
  'isQuote': False},
 {'_id': ObjectId('6620ef9632d7e0ed64baf95a'),
  'url': 'https://x.com/palestine/status/1780151942012101103',
  'twitterUrl': 'https://twitter.com/palestine/status/1780151942012101103',
  'id': '1780151942012101103',
  'text': 'RT @ShaykhSulaiman: Thoughts and prayers go out to Bishop Mar Mari Emmanuel and the Church attendees. \n\nHorrific attack again San Anti-Zion…',
  'retweetCount': 871,
  'replyCount': 344,
  'likeCou

In [8]:
# Build a DataFrame from the fetched documents: one row per document, one
# column per document key.
df = pd.DataFrame(data)

In [9]:
df

Unnamed: 0,_id,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote
0,6620ef9632d7e0ed64baf959,https://x.com/palestine/status/178068675589485...,https://twitter.com/palestine/status/178068675...,1780686755894858024,RT @QudsNen: BREAKING| Israel's national secur...,2952,610,3735,576,Wed Apr 17 19:56:20 +0000 2024,383,True,False
1,6620ef9632d7e0ed64baf95a,https://x.com/palestine/status/178015194201210...,https://twitter.com/palestine/status/178015194...,1780151942012101103,RT @ShaykhSulaiman: Thoughts and prayers go ou...,871,344,4435,45,Tue Apr 16 08:31:11 +0000 2024,310,True,False
2,6620ef9632d7e0ed64baf95b,https://x.com/palestine/status/178008524801093...,https://twitter.com/palestine/status/178008524...,1780085248010932544,RT @DrLoupis: Bishop Mar Mari Emmanuel stabbed...,10331,1230,50401,476,Tue Apr 16 04:06:10 +0000 2024,6343,True,False
3,6620ef9632d7e0ed64baf95c,https://x.com/palestine/status/177954983184711...,https://twitter.com/palestine/status/177954983...,1779549831847113042,RT @QudsNen: An activist recorded the moment w...,13207,481,15187,363,Sun Apr 14 16:38:37 +0000 2024,689,True,False
4,6620ef9632d7e0ed64baf95d,https://x.com/palestine/status/177897815899972...,https://twitter.com/palestine/status/177897815...,1778978158999728431,RT @MiddleEastEye: In an exclusive interview w...,6309,386,9863,628,Sat Apr 13 02:46:59 +0000 2024,1003,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,6620f00932d7e0ed64bafa26,https://x.com/palestine/status/176223505300087...,https://twitter.com/palestine/status/176223505...,1762235053000872081,RT @Lowkey0nline: As people rushed to try and ...,40783,5700,141424,3147,Mon Feb 26 21:55:52 +0000 2024,7802,True,False
206,6620f00932d7e0ed64bafa27,https://x.com/palestine/status/176223479814647...,https://twitter.com/palestine/status/176223479...,1762234798146474487,"RT @Neolibtears: ""I'm lsraeli, Basel is Palest...",11039,91,37363,185,Mon Feb 26 21:54:51 +0000 2024,2388,True,False
207,6620f00932d7e0ed64bafa28,https://x.com/palestine/status/176223432212653...,https://twitter.com/palestine/status/176223432...,1762234322126536831,RT @MaxBlumenthal: “Many of us like to ask our...,2551,89,5376,46,Mon Feb 26 21:52:57 +0000 2024,154,True,False
208,6620f00932d7e0ed64bafa29,https://x.com/palestine/status/176223413632728...,https://twitter.com/palestine/status/176223413...,1762234136327283151,"RT @DrLoupis: RIP Aaron Bushnell, you will alw...",18427,1769,77239,352,Mon Feb 26 21:52:13 +0000 2024,1041,True,False


## RoBERTa

In [10]:
# Keep only the tweet text, as a one-column DataFrame, for the sentiment models.
df_text = df[['text']]

In [11]:
def process_tweet(tweet):
    """Replace user mentions and links in a tweet with fixed placeholders.

    The tweet is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; a token that starts with
    ``http`` becomes ``http``. This mirrors the preprocessing recipe published
    with the Cardiff NLP Twitter-RoBERTa sentiment checkpoints, which
    normalises both mentions and links.

    Args:
        tweet: Raw tweet text.

    Returns:
        The tweet with the placeholders substituted. All other tokens,
        including the empty ones produced by consecutive spaces, are kept, so
        the original spacing is preserved.
    """
    tweet_words = []
    # Split on ' ' (not on arbitrary whitespace) so that re-joining with ' '
    # reproduces the original spacing exactly.
    for word in tweet.split(' '):
        if word.startswith('@') and len(word) > 1:
            word = '@user'
        elif word.startswith('http'):
            word = 'http'
        tweet_words.append(word)
    return ' '.join(tweet_words)

In [12]:
# Run the placeholder substitution over every tweet's text.
processed_tweets = df_text['text'].apply(process_tweet)

In [13]:
# Name of the pretrained checkpoint; the model and its tokenizer are both
# loaded from it.
roberta = 'cardiffnlp/twitter-roberta-base-sentiment'

model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)
# Class names, matched by position against the model's output scores.
# NOTE(review): presumably output index 0/1/2 = negative/neutral/positive for
# this checkpoint — confirm against the model card.
labels = ['Negative', 'Neutral', 'Positive']

In [14]:
# Tokenise each pre-processed tweet on its own, asking the tokenizer for
# PyTorch tensors; the result is one encoding per tweet, in the same order.
processed_tweets_encoded = [
    tokenizer(text, return_tensors='pt') for text in processed_tweets
]

In [15]:
# Inspect the first encoding: it carries the token ids (`input_ids`) together
# with the matching `attention_mask`.
processed_tweets_encoded[0]

{'input_ids': tensor([[    0, 13963,   787, 12105, 13530, 16371, 15483,  1870,    18,   632,
           573,  1269,     6,    85,  3916,  1664,   272, 28641,     6,    26,
            37,  2046,    14,   744,  2861,    16,     5,   235,  2472,     7,
          3553,  1174,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
all_results = []

# Inference only: with autograd disabled no gradient graph is built for the
# forward passes, which saves memory and time and lets the output be converted
# to NumPy directly.
with torch.no_grad():
    for encoded_tweet in processed_tweets_encoded:
        # output[0] holds the raw class scores; every encoding contains a
        # single tweet, so take the first (only) row.
        output = model(**encoded_tweet)
        scores = output[0][0].numpy()

        # Convert the raw scores into probabilities that sum to 1.
        scores = softmax(scores)

        # One dict per tweet, mapping each label to its probability.
        all_results.append({labels[i]: score for i, score in enumerate(scores)})

In [17]:
# One row per tweet, one probability column per label.
probas = pd.DataFrame(all_results)

In [18]:
probas

Unnamed: 0,Negative,Neutral,Positive
0,0.281749,0.689909,0.028342
1,0.676176,0.292864,0.030959
2,0.792587,0.199977,0.007436
3,0.703724,0.290431,0.005845
4,0.028906,0.944209,0.026885
...,...,...,...
205,0.464716,0.517699,0.017585
206,0.390905,0.577153,0.031942
207,0.469815,0.504795,0.025389
208,0.140054,0.590846,0.269100


In [19]:
# Label every tweet with its most probable class.
# `idxmax(axis=1)` yields, per row, the name of the column holding the largest
# value; on ties it picks the first such column, which matches checking
# Negative, then Neutral, then Positive. Selecting the columns by name avoids
# the positional `row[0]` lookup on a string-labelled Series, which pandas has
# deprecated.
new_column = (
    probas[['Negative', 'Neutral', 'Positive']].idxmax(axis=1) + ' Sentiment'
).tolist()

  max_proba = max(row[0], row[1], row[2])
  if row[0] == max_proba:
  elif row[1] == max_proba:


In [20]:
# Debug check: print the Negative probability of every tweet.
# The value is looked up by column label; the positional form `row[0]` on a
# string-labelled Series is deprecated in pandas.
for index, row in probas.iterrows():
    print(row['Negative'])

0.28174874
0.6761762
0.7925874
0.70372444
0.02890588
0.87962794
0.95216686
0.91702217
0.558537
0.72267044
0.43038285
0.14086792
0.7475516
0.6115139
0.78069746
0.91677624
0.84053195
0.5030741
0.82492095
0.7111283
0.8134545
0.15640137
0.9477199
0.6871733
0.3426723
0.20835847
0.10470027
0.064826295
0.12719856
0.18788388
0.46471626
0.39090532
0.4698155
0.14005445
0.94773453
0.28174874
0.6761762
0.7925874
0.70372444
0.02890588
0.87962794
0.95216686
0.91702217
0.558537
0.72267044
0.43038285
0.14086792
0.7475516
0.6115139
0.78069746
0.91677624
0.84053195
0.5030741
0.82492095
0.7111283
0.8134545
0.15640137
0.9477199
0.6871733
0.3426723
0.20835847
0.10470027
0.064826295
0.12719856
0.18788388
0.46471626
0.39090532
0.4698155
0.14005445
0.94773453
0.28174874
0.6761762
0.7925874
0.70372444
0.02890588
0.87962794
0.95216686
0.91702217
0.558537
0.72267044
0.43038285
0.14086792
0.7475516
0.6115139
0.78069746
0.91677624
0.84053195
0.5030741
0.82492095
0.7111283
0.8134545
0.15640137
0.9477199
0.6871733
0

  print(row[0])


In [21]:
new_column

['Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neutral Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Negative Sentiment',
 'Neg

In [22]:
# Attach the RoBERTa-derived labels to the tweets. `new_column` is a plain
# list, so it must contain exactly one entry per row of `df`.
df['sentiment'] = new_column

In [23]:
df.sentiment

0       Neutral Sentiment
1      Negative Sentiment
2      Negative Sentiment
3      Negative Sentiment
4       Neutral Sentiment
              ...        
205     Neutral Sentiment
206     Neutral Sentiment
207     Neutral Sentiment
208     Neutral Sentiment
209    Negative Sentiment
Name: sentiment, Length: 210, dtype: object

### NLTK

In [25]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Fetch the VADER lexicon that the analyzer below relies on (NLTK reports
# "already up-to-date" when it is present locally).
nltk.download('vader_lexicon')

# Single analyzer instance, reused for every tweet by get_sentiment_label.
sid = SentimentIntensityAnalyzer()

# Function to get sentiment label
def get_sentiment_label(text):
    """Classify ``text`` as 'Positive', 'Negative' or 'Neutral' using ``sid``.

    The decision is based on the ``compound`` entry of
    ``sid.polarity_scores(text)``: 0.05 or higher is 'Positive', -0.05 or lower
    is 'Negative', and anything strictly in between is 'Neutral'.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

# Label the raw text of every tweet and store the result as the `sentiment`
# column of a copy of `df`; `df` itself is left untouched.
df_test = df.assign(sentiment=df_text['text'].apply(get_sentiment_label))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [27]:
df_test.sentiment

0      Negative
1      Negative
2      Negative
3       Neutral
4      Positive
         ...   
205    Positive
206     Neutral
207    Negative
208     Neutral
209    Negative
Name: sentiment, Length: 210, dtype: object

### TextBlob

In [28]:
from textblob import TextBlob

# Load your DataFrame
# df = pd.read_csv('your_dataset.csv')

# Function to get sentiment label
def get_sentiment_label(text):
    """Classify ``text`` by the sign of its TextBlob polarity.

    Returns 'Positive' for a polarity above zero, 'Negative' for a polarity
    below zero, and 'Neutral' otherwise.

    Note: this shares its name with the VADER-based helper defined in the NLTK
    section and replaces it in the kernel once this cell has run.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity < 0:
        return 'Negative'
    return 'Positive' if polarity > 0 else 'Neutral'

# Score the raw text of every tweet with the labeller defined in this cell and
# store the labels as the `sentiment` column of a fresh copy of `df`.
df_test2 = df.assign(sentiment=df_text['text'].apply(get_sentiment_label))

In [29]:
df_test2.sentiment

0      Positive
1      Negative
2       Neutral
3       Neutral
4       Neutral
         ...   
205    Positive
206    Positive
207    Positive
208     Neutral
209     Neutral
Name: sentiment, Length: 210, dtype: object

0       Neutral Sentiment
1      Negative Sentiment
2      Negative Sentiment
3      Negative Sentiment
4       Neutral Sentiment
              ...        
205     Neutral Sentiment
206     Neutral Sentiment
207     Neutral Sentiment
208     Neutral Sentiment
209    Negative Sentiment
Name: sentiment, Length: 210, dtype: object

In [32]:
# Write `df` (the tweets plus the `sentiment` column assigned from the RoBERTa
# labels) to res.csv in the working directory, without the row index.
df.to_csv("res.csv", index=False)