In [42]:
import dataclasses
import html
import json
import os
import re
import warnings
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

import emoji
import mysql.connector
import stanza
import tweepy

In [43]:
# The bearer token is a secret: read it from the TWITTER_BEARER_TOKEN environment
# variable so it is never stored in (or committed with) the notebook.
# NOTE(review): the token that used to be hard-coded on this line should be
# treated as leaked and revoked.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')
if not bearer_token:
    # Warn here, next to the configuration, because the first API request is
    # only issued several cells further down.
    warnings.warn('TWITTER_BEARER_TOKEN is not set; requests to the Twitter API will fail authentication.')

In [44]:
# Build the tweepy client; the bearer token is the only credential passed in.
client = tweepy.Client(
    bearer_token=bearer_token,
)

In [45]:
# Search expression: one of the AI / data-science keywords, combined with one of
# two context annotations, restricted to English tweets that contain a link and
# are not retweets.
query = '("data science" OR "artificial intelligence" OR "machine learning" OR "Big Data" OR "Deep Learning" OR ml OR ai) (context:131.1303989823011606528 OR context:131.1495104058285125642) lang:en has:links -is:retweet'

# Paginated recent-search request. The author_id expansion asks for the author
# objects alongside the tweets; limit / max_results are passed through to tweepy
# (see the Paginator docs for their exact meaning).
pages = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    expansions=['author_id'],
    tweet_fields=[
        'context_annotations',
        'created_at',
        'author_id',
        'entities',
        'public_metrics',
        'text',
    ],
    user_fields=['username', 'name', 'location', 'created_at'],
    limit=400,
    max_results=100,
)

In [46]:
@dataclass
class User:
    """Author of a tweet, reduced to the attributes this notebook stores."""

    # Copied from the API user object's `id`.
    id: str = ''
    # Copied from the API user object's `name`.
    name: str = ''
    # Copied from the API user object's `username`.
    username: str = ''
    # Copied from the API user object's `location`.
    location: str = ''
    # Account creation date, formatted as YYYY-MM-DD by the harvesting loop.
    dateJoined: str = ''

@dataclass
class EventTweetData():
    """One harvested tweet together with its author, links and tags.

    ``title`` is declared as a field because the NER cell assigns
    ``model.title``; an attribute that is not a dataclass field is ignored by
    ``dataclasses.asdict`` and would therefore be missing from the JSON export.
    It is appended last so positional construction keeps working.
    """

    # Copied from the API tweet object's `id`.
    id: str = ''
    # Author of the tweet; None until the harvesting loop attaches one.
    user: Optional['User'] = None
    # Tweet text as returned by the API.
    description: str = ''
    # Creation date, formatted as YYYY-MM-DD by the harvesting loop.
    datePosted: str = ''
    # `like_count` taken from the tweet's public metrics.
    likeCount: int = 0
    # Expanded URLs from the tweet that do not contain 'twitter'.
    links: List[str] = field(default_factory=list)
    # Hashtag texts taken from the tweet entities.
    tags: List[str] = field(default_factory=list)
    # Texts of ORG entities found by the NER cell.
    derivedTags: List[str] = field(default_factory=list)
    # Text of a TITLE entity, when the NER cell finds one.
    title: str = ''


In [47]:
def get_emoji_free_text(text):
    """Return ``text`` with emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji``. ``version=-1`` is passed explicitly;
    presumably this means "no version cut-off" — confirm against the emoji
    package documentation.
    """
    cleaned = emoji.replace_emoji(text, replace='', version=-1)
    return cleaned

def remove_special_char(text):
    """Decode HTML entities in ``text`` and strip simple HTML tags.

    Args:
        text: String that may contain (possibly double-escaped) HTML entities
            and lowercase tags such as ``<b>`` / ``</b>``.

    Returns:
        The decoded string with those tags removed.
    """
    # Two passes so that double-escaped input such as "&amp;amp;" is fully decoded.
    decoded = html.unescape(html.unescape(text))
    # After decoding, tags are literal "<b>" / "</b>", which the previous pattern
    # (escaped form only) could never match. The escaped form is still accepted
    # so triple-escaped input keeps being stripped exactly as before.
    return re.sub(r'(?:<|&lt;)/?[a-z]+(?:>|&gt;)', '', decoded)

In [48]:
# Collect one EventTweetData per tweet that links to at least one URL not
# containing 'twitter'.
# NOTE(review): get_emoji_free_text / remove_special_char are defined in this
# notebook but are not applied to the tweet text here — confirm whether the
# description is meant to be cleaned before it is stored.
eventDataModels = []

for page in pages:
    # A page may come back without results (data is None) or without the
    # 'users' include; treat both as empty instead of raising.
    page_users = (page.includes or {}).get('users', [])
    page_tweets = page.data or []

    # Index this page's authors by id so each tweet can be joined to its user.
    userList = dict()
    for user in page_users:
        userModel = User()
        userModel.id = user['id']
        userModel.name = user['name']
        userModel.username = user['username']
        userModel.location = user['location']
        userModel.dateJoined = user['created_at'].strftime('%Y-%m-%d')
        userList[userModel.id] = userModel

    for tweet in page_tweets:
        author = userList.get(tweet['author_id'])
        if author is None:
            # The author object was not delivered with this page; later cells
            # dereference model.user, so such a tweet cannot be stored.
            continue

        # entities can be absent on a tweet; fall back to an empty mapping.
        entities = tweet['entities'] or {}

        # Keep only expanded URLs that do not mention 'twitter'.
        urlList = [
            url['expanded_url']
            for url in entities.get('urls', [])
            if url.get('expanded_url') and not re.search('twitter', url['expanded_url'])
        ]
        if not urlList:
            # Nothing left to link to: the tweet is not recorded.
            continue

        eventTweetModel = EventTweetData()
        eventTweetModel.id = tweet['id']
        eventTweetModel.user = author
        eventTweetModel.datePosted = tweet['created_at'].strftime('%Y-%m-%d')
        eventTweetModel.description = tweet['text']
        eventTweetModel.likeCount = tweet['public_metrics']['like_count']
        eventTweetModel.links = urlList
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # repeated runs produce the same tag order (list(set(...)) did not).
        eventTweetModel.tags = list(
            dict.fromkeys(hashtag['tag'] for hashtag in entities.get('hashtags', []))
        )
        eventDataModels.append(eventTweetModel)

In [49]:
# English Stanza pipeline restricted to the tokenize and ner processors.
# download_method=None presumably skips the model download step, so the models
# must already be available locally — confirm against the Stanza docs.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
    download_method=None,
)

2022-11-13 17:47:48 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-11-13 17:47:48 INFO: Use device: cpu
2022-11-13 17:47:48 INFO: Loading: tokenize
2022-11-13 17:47:48 INFO: Loading: ner
2022-11-13 17:47:50 INFO: Done loading processors!


In [50]:
def remove_hashtags(text):
    """Return ``text`` with every hyphen and every ``#hashtag`` token removed.

    Hyphens are deleted first; then each ``#`` followed by letters, digits or
    underscores is dropped. Surrounding whitespace is left untouched.
    """
    without_hyphens = text.replace('-', '')
    stripped = re.sub("#[A-Za-z0-9_]+", "", without_hyphens)
    return stripped

# Run NER over each tweet text (hashtags and hyphens removed first) and store
# the organisation names that were found as derived tags.
for model in eventDataModels:
    processed_data = nlp(remove_hashtags(model.description))
    for sent in processed_data.sentences:
        for ent in sent.ents:
            if ent.type == 'ORG':
                # Entity text is split on newlines, one tag per line; lines that
                # are blank after stripping are skipped so no empty tag is stored.
                for tag in ent.text.split('\n'):
                    tag = tag.strip()
                    if tag:
                        model.derivedTags.append(tag)
            elif ent.type == 'TITLE':
                # NOTE(review): 'TITLE' does not appear among the entity types
                # documented for the OntoNotes NER model loaded above — confirm
                # that this branch can actually fire.
                model.title = ent.text
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # exported tag order is stable across runs (list(set(...)) was not).
    model.derivedTags = list(dict.fromkeys(model.derivedTags))

In [51]:
# Convert every collected tweet (including its nested user) into plain dicts and
# wrap the list under a single top-level key.
jsonEventDictList = [dataclasses.asdict(model) for model in eventDataModels]
jsonMappedData = {'twitterEventData': jsonEventDictList}

# The file name carries the current date as MM_DD_YYYY.
output_name = 'twitterEventData'+datetime.now().strftime('%m_%d_%Y')+'.json'
with open(output_name, 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(jsonMappedData, file, ensure_ascii=False, indent=4)

In [52]:
# Connection settings are read from the environment so that no database
# credentials live in the notebook. The non-secret settings fall back to the
# values previously used here; the password has no fallback on purpose.
conn = mysql.connector.connect(
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ.get('MYSQL_PASSWORD', ''),
    host=os.environ.get('MYSQL_HOST', '127.0.0.1'),
    database=os.environ.get('MYSQL_DATABASE', 'ai_jobs_database'),
)

cursor = conn.cursor()

In [53]:
# Parameterised INSERT statements, one per target table. The %s placeholders are
# filled from the row tuples passed to cursor.executemany in the insert cell.
# All statements use INSERT IGNORE (per MySQL semantics, rows that would violate
# e.g. a unique key are skipped rather than aborting the batch).

# One row per tweet author.
userInsertQuery = """ INSERT IGNORE INTO twitter_user(user_id, name, user_name, location, date_joined)
                      VALUES (%s, %s, %s, %s, %s);
                  """
# One row per tweet, referencing its author by user_id.
eventTweetInsertQuery = """INSERT IGNORE INTO event_tweets(tweet_id, user_id, description, date_posted, like_count)
                           VALUES (%s, %s, %s, %s, %s);
                        """
# One row per (hashtag, tweet) pair.
twitterTagInsertQuery = """INSERT IGNORE INTO twitter_tag(tag_name, tweet_id)
                           VALUES (%s, %s)
                        """
# One row per (NER-derived tag, tweet) pair.
derivedTagInsertQuery = """INSERT IGNORE INTO derived_tag(der_tag_name, tweet_id)
                           VALUES (%s, %s)
                        """
# One row per (link, tweet) pair.
urlInsertQuery = """INSERT IGNORE INTO tweet_url(url, tweet_id)
                    VALUES (%s, %s)
                 """

In [54]:
# Flatten the collected models into row tuples, one list per target table.
userData = []
eventTweetData = []
twitterTagData = []
derivedTagData = []
urlData = []
for model in eventDataModels:
    userData.append((model.user.id, model.user.name, model.user.username, model.user.location, model.user.dateJoined))
    eventTweetData.append((model.id, model.user.id, model.description, model.datePosted, model.likeCount))
    for tag in model.tags:
        twitterTagData.append((tag, model.id))
    for tag in model.derivedTags:
        derivedTagData.append((tag, model.id))
    for link in model.links:
        urlData.append((link, model.id))

# (statement, rows) pairs in the order they are executed: users, then tweets,
# then the per-tweet tables.
# NOTE(review): presumably this order satisfies foreign keys — confirm against the schema.
insertBatches = [
    (userInsertQuery, userData),
    (eventTweetInsertQuery, eventTweetData),
    (twitterTagInsertQuery, twitterTagData),
    (derivedTagInsertQuery, derivedTagData),
    (urlInsertQuery, urlData),
]

try:
    # All batches are committed together, so a failure stores nothing.
    for statement, rows in insertBatches:
        cursor.executemany(statement, rows)
    conn.commit()
except Exception:
    # Undo the partial work, then let the error surface with its traceback
    # instead of printing it and letting the cell look successful.
    conn.rollback()
    raise
finally:
    # Release the cursor and the connection whether or not the insert succeeded.
    cursor.close()
    conn.close()