## **Data Gathering**

In [None]:
pip install snscrape

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import numpy as np


# Search expression: everything posted by @elonmusk between 2005 and the end of 2022.
query = "(from:elonmusk) until:2023-01-01 since:2005-01-01"
# Upper bound on the number of tweets collected.
limit = 100000
tweets = []

scraper = sntwitter.TwitterSearchScraper(query)
for tweet in scraper.get_items():
    # Stop as soon as the requested number of rows has been gathered.
    if len(tweets) == limit:
        break
    row = [
        tweet.date,
        tweet.username,
        tweet.content,
        tweet.likeCount,
        tweet.retweetCount,
    ]
    tweets.append(row)

# One row per tweet, columns in the same order as the fields collected above.
column_names = ['Date', 'User', 'Tweet', 'Like_Count', 'Retweet_Count']
df = pd.DataFrame(tweets, columns=column_names)
print(df)

# to save to csv
# df.to_csv('tweets.csv')

## **Hashtags extraction**

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# extract hashtags
def hashtags(text):
  '''Return the hashtag words found in ``text``, without the leading '#'.

  The text itself is not modified; this only collects the tags.

  Args:
    text: tweet text. Anything that is not a string (for example ``None`` or
      NaN for a missing tweet body) is treated as having no hashtags.

  Returns:
    A list of tag names in order of appearance; empty when there are none.
  '''
  if not isinstance(text, str):
    return []
  return re.findall(r"#(\w+)", text)




## **Emojis translation**

In [None]:
pip install emot


In [None]:
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
# translate emoji
def emoji(text, mapping=None):
  '''Replace every emoji in ``text`` with an underscore-joined description.

  Each description taken from the mapping has its commas and colons removed
  and its whitespace-separated words joined with '_' before it is substituted
  for the emoji.

  Args:
    text: tweet text. Non-string values (such as ``None``) are returned
      unchanged.
    mapping: optional dict of emoji -> description. Defaults to the
      module-level ``UNICODE_EMOJI`` table.

  Returns:
    The text with all known emoji translated.
  '''
  if not isinstance(text, str):
    return text
  if mapping is None:
    mapping = UNICODE_EMOJI
  for symbol, description in mapping.items():
    # Skip the string rebuild for the (vast majority of) emoji not present.
    if symbol in text:
      words = description.replace(",", "").replace(":", "").split()
      text = text.replace(symbol, "_".join(words))
  # Return only after the whole table has been applied, not after the first entry.
  return text

## **Usernames removal**

In [None]:
# remove @username mentions (including the handle that follows an RT marker)
def remove_users(tweet):
  '''Return ``tweet`` with every @mention removed.

  A mention is '@' followed by one or more letters, digits or underscores, so
  handles that are a single character or that start with a digit or an
  underscore are removed too. An '@' directly preceded by a letter, digit or
  underscore is left alone so that e-mail addresses stay intact for the
  dedicated e-mail cleaning step. The literal "RT" marker is not removed.
  '''
  return re.sub(r'(?<![A-Za-z0-9_])@[A-Za-z0-9_]+', '', tweet)

## **Links removal**

In [None]:
# remove links
def remove_links(tweet):
  '''Return ``tweet`` with web links removed.

  Two kinds of link are stripped, each up to the next whitespace:
  anything starting with "http" (covers http:// and https://), and
  scheme-less "bit.ly/..." short links.
  '''
  tweet = re.sub(r'http\S+', '', tweet)
  # The dot is escaped so only a literal "bit.ly/" matches, not e.g. "bitfly/".
  tweet = re.sub(r'bit\.ly/\S+', '', tweet)
  return tweet
def clean_html(text):
  '''Return ``text`` with every ``<...>`` tag-like span deleted.'''
  # Non-greedy, so each tag is matched on its own and the text between tags is kept.
  tag_pattern = '<.*?>'
  return re.sub(tag_pattern, '', text)

## **Non-ASCII characters removal**

In [None]:
# drop every character that is not plain ASCII
def non_ascii(s):
  '''Return ``s`` keeping only characters whose code point is below 128.'''
  ascii_chars = [ch for ch in s if ord(ch) <= 127]
  return "".join(ascii_chars)

def lower(text):
  '''Return a lowercase copy of ``text``.'''
  lowered = text.lower()
  return lowered
 

## **Email address and punctuation removal**

In [None]:
def email_address(text):
  '''Return ``text`` with e-mail-like tokens deleted.

  A token counts as an e-mail when an '@' has at least one word character,
  dot or hyphen on each side of it.
  '''
  return re.sub(r'[\w\.-]+@[\w\.-]+', '', text)

def punct(text):
  '''Return ``text`` reduced to its word tokens, separated by single spaces.

  Tokens are runs of word characters (``\\w+``); everything between them,
  punctuation included, is discarded.
  '''
  words = RegexpTokenizer(r'\w+').tokenize(text)
  return " ".join(words)

## **Stopwords removal**

In [None]:
# remove stopwords
# Extra words dropped in addition to NLTK's English stop-word list.
_CUSTOM_STOP_WORDS = ('and','I','A','http','And','So','arnt','This','When','It','many','Many','so','cant','Yes','yes','No','no','These','these','mailto','regards','ayanna','like','email')
# Filled on first use so the NLTK corpus is read once, not once per tweet.
_stop_word_cache = None


def _get_stop_words():
  '''Return the combined stop-word set, building it on the first call only.'''
  global _stop_word_cache
  if _stop_word_cache is None:
    words = set(stopwords.words("english"))
    words.update(_CUSTOM_STOP_WORDS)
    _stop_word_cache = frozenset(words)
  return _stop_word_cache


def removeStopWords(str):
  '''Return ``str`` with stop words removed.

  The text is split on whitespace; every token that exactly matches (case
  sensitively) an English NLTK stop word or one of the custom words is
  dropped, and the remaining tokens are re-joined with single spaces.

  Args:
    str: text to filter. The parameter name is kept for compatibility with
      existing callers even though it shadows the builtin.

  Returns:
    The filtered text.
  '''
  stop_words = _get_stop_words()
  return ' '.join(word for word in str.split() if word not in stop_words)

## **Special characters removal**

In [None]:
def remove_(tweet):
  '''Return ``tweet`` with every underscore deleted.'''
  underscore_runs = re.compile('([_]+)')
  return underscore_runs.sub("", tweet)

## **Data Preprocessing**

In [None]:
# Run the cleaning functions defined above over the raw tweets.
# Hashtags are collected from the untouched text into their own column.
df['hashtag'] = df['Tweet'].apply(hashtags)

# The cleaned text starts from the emoji-translated tweet; every later step
# rewrites the 'new_tweet' column in place, in exactly this order.
df['new_tweet'] = df['Tweet'].apply(emoji)
for cleaning_step in (
    remove_users,
    clean_html,
    remove_links,
    non_ascii,
    lower,
    email_address,
    removeStopWords,
    clean_html,
    punct,
    remove_,
):
    df['new_tweet'] = df['new_tweet'].apply(cleaning_step)

In [None]:
df

## **Sentiment Classification**

In [None]:
!pip install modelzoo-client[transformers]

In [None]:
pip install scipy


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the classifier.
roberta = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(roberta)
model = AutoModelForSequenceClassification.from_pretrained(roberta)

# Class names, looked up by the index of the highest model score.
labels = ['Negative', 'Neutral', 'Positive']

In [None]:
def get_sentiment_label(tweet):
  '''Return the predicted sentiment label for ``tweet``.

  Relies on the module-level ``tokenizer``, ``model`` and ``labels``.

  Args:
    tweet: text to classify.

  Returns:
    The entry of ``labels`` whose score is highest.
  '''
  encoded_tweet = tokenizer(tweet, return_tensors='pt')
  output = model(**encoded_tweet)

  # First element of the model output, first (only) item of the batch;
  # detached and converted to NumPy so SciPy can normalise it.
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)

  return labels[np.argmax(scores)]

In [None]:
# Keep only the tweets that mention "dogecoin" (case-insensitive).
# na=False counts a missing tweet text as "no match" so the mask stays purely
# boolean; .copy() makes df an independent frame so that adding columns to it
# afterwards does not raise SettingWithCopyWarning.
mask = df['Tweet'].str.contains("dogecoin", case=False, na=False)
df = df[mask].copy()
df

In [None]:
# Classify each cleaned tweet and store the predicted sentiment next to it.
sentiment_labels = df['new_tweet'].apply(get_sentiment_label)
df['label'] = sentiment_labels

In [None]:
# Write the labelled tweets to disk, then hand the file to the browser
# through Colab's download helper.
csv_path = 'tweets.csv'
df.to_csv(csv_path)

from google.colab import files
files.download(csv_path)