In [46]:
import dataclasses
import html
import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional

import emoji
import stanza
import tweepy

In [47]:
# Read the API credential from the environment so the secret is never stored
# in (or committed with) the notebook. Export TWITTER_BEARER_TOKEN before
# starting the kernel; if it is unset the token is an empty string and the
# Twitter API will reject the requests made with it.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')

In [48]:
# App-only (bearer token) client for the Twitter API v2.
# wait_on_rate_limit=True makes tweepy sleep until the rate-limit window
# resets instead of raising part-way through the long paginated search below.
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

In [49]:
# Search expression, assembled from adjacent literals purely for readability:
# topic keywords, two context-annotation filters, then English-only tweets
# that carry links and are not retweets.
query = (
    '("data science" OR "artificial intelligence" OR "machine learning" '
    'OR "Big Data" OR "Deep Learning" OR ml OR ai) '
    '(context:131.1303989823011606528 OR context:131.1495104058285125642) '
    'lang:en has:links -is:retweet'
)

# Paginated recent-search request with the author expansion, so each page
# carries the user objects for its tweets. Nothing is fetched until `pages`
# is iterated.
pages = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    expansions=['author_id'],
    tweet_fields=[
        'context_annotations',
        'created_at',
        'author_id',
        'entities',
        'public_metrics',
        'text',
    ],
    user_fields=['username', 'name', 'location', 'created_at'],
    limit=400,
    max_results=100,
)

In [50]:
@dataclass
class User:
    """Author of a collected tweet, reduced to the profile fields that are exported.

    Every field defaults to an empty string, so an instance can be created
    empty and populated attribute by attribute.
    """

    id: str = ""
    name: str = ""
    username: str = ""
    location: str = ""
    dateJoined: str = ""  # the collection loop stores this as MM/DD/YYYY

@dataclass
class EventTweetData():
    """One collected tweet in the shape that is written to the JSON export.

    Attributes:
        id: Tweet identifier.
        user: Author record, or None until one has been attached.
        description: Tweet text.
        datePosted: Creation date; the collection loop stores MM/DD/YYYY.
        likeCount: Like count taken from the tweet's public metrics.
        links: Expanded URLs kept for the tweet.
        tags: Hashtags attached to the tweet.
        derivedTags: Organisation names extracted from the text by NER.
        title: Text of a TITLE entity found by NER, if any.
    """
    id: str = ''
    # Quoted forward reference: the annotation is not evaluated when the class
    # is created, so the definition does not depend on cell execution order.
    user: Optional["User"] = None
    description: str = ''
    datePosted: str = ''
    likeCount: int = 0
    links: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)
    derivedTags: List[str] = field(default_factory=list)
    # Declared because the NER cell assigns model.title: dataclasses.asdict()
    # serialises declared fields only, so an undeclared attribute was silently
    # missing from the JSON. Placed last so positional construction is unchanged.
    title: str = ''


In [51]:
def get_emoji_free_text(text):
    """Return ``text`` with every emoji replaced by the empty string.

    Thin wrapper around ``emoji.replace_emoji``.
    """
    # NOTE(review): version=-1 is passed through unchanged; presumably it
    # disables emoji-version filtering — confirm against the emoji docs.
    without_emoji = emoji.replace_emoji(text, replace='', version=-1)
    return without_emoji

def remove_special_char(text):
    """Decode HTML entities in ``text`` and strip simple HTML tags.

    Args:
        text: Raw string that may contain (possibly doubly) escaped HTML
            entities such as ``&amp;`` or ``&amp;lt;``.

    Returns:
        The decoded string with lowercase tags like ``<b>`` / ``</b>`` removed.
    """
    # Two passes so doubly-escaped entities ("&amp;lt;" -> "&lt;" -> "<") are
    # fully decoded.
    decoded = html.unescape(html.unescape(text))
    # After decoding, tag delimiters are real "<" / ">" characters, so a
    # pattern looking only for "&lt;"/"&gt;" would almost never match. Accept
    # both the literal and the still-escaped form (the latter keeps the old
    # behaviour for triple-escaped input).
    return re.sub(r'(?:<|&lt;)/?[a-z]+(?:>|&gt;)', '', decoded)

In [52]:
# Collect one EventTweetData per tweet that carries at least one external
# link. The list is rebuilt from scratch on every run of this cell.
eventDataModels = []

for page in pages:
    # A page without matches has data=None; skip it rather than fail when
    # iterating over None.
    if not page.data:
        continue

    # Index this page's expanded author objects by id so each tweet can be
    # joined to its author. The 'users' key may be absent from includes.
    usersById = {}
    for user in page.includes.get('users', []):
        usersById[user['id']] = User(
            id=user['id'],
            name=user['name'],
            username=user['username'],
            location=user['location'],
            dateJoined=user['created_at'].strftime('%m/%d/%Y'),
        )

    for tweet in page.data:
        # entities can be missing (None) on a tweet; treat that as "no
        # urls / no hashtags" instead of raising.
        entities = tweet['entities'] or {}

        # Keep expanded URLs, dropping any that contain the substring
        # 'twitter' (links back to Twitter itself).
        links = [
            url['expanded_url']
            for url in entities.get('urls', [])
            if not re.search('twitter', url['expanded_url'])
        ]
        if not links:
            # Nothing external to point at: the tweet is not exported.
            continue

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # exported tag order is stable between runs (set order is not).
        tags = list(dict.fromkeys(
            hashtag['tag'] for hashtag in entities.get('hashtags', [])
        ))

        eventDataModels.append(EventTweetData(
            id=tweet['id'],
            user=usersById[tweet['author_id']],
            description=tweet['text'],
            datePosted=tweet['created_at'].strftime('%m/%d/%Y'),
            likeCount=tweet['public_metrics']['like_count'],
            links=links,
            tags=tags,
        ))

In [None]:
# English Stanza pipeline restricted to the tokenize and ner processors.
# NOTE(review): download_method=None presumably means models are not fetched
# automatically and must already be installed locally — confirm.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
    download_method=None,
)

In [54]:
def remove_hashtags(text):
    """Return ``text`` with all hyphens deleted and ``#hashtag`` tokens removed.

    Hyphens are dropped first, so a hyphenated hashtag such as ``#deep-learning``
    collapses into one token and is removed as a whole.
    """
    hashtag_pattern = re.compile("#[A-Za-z0-9_]+")
    dehyphenated = "".join(text.split('-'))
    return hashtag_pattern.sub("", dehyphenated)

# Run NER over each tweet's text (hashtags and hyphens stripped first) and
# record organisation names as derived tags.
for model in eventDataModels:
    processed_data = nlp(remove_hashtags(model.description))

    # Start from the tags already present so re-running the cell does not
    # lose anything.
    candidateTags = list(model.derivedTags)
    for sent in processed_data.sentences:
        for ent in sent.ents:
            if ent.type == 'ORG':
                # Entity text may span several lines; each line becomes a tag.
                candidateTags.extend(part.strip() for part in ent.text.split('\n'))
            elif ent.type == 'TITLE':
                # NOTE(review): confirm the loaded NER model emits a TITLE
                # type. Also note dataclasses.asdict() only serialises
                # declared fields, so title reaches the JSON export only if
                # EventTweetData declares it.
                model.title = ent.text

    # Drop blank entries (produced by empty lines inside an entity) and
    # de-duplicate in first-seen order so the export is stable between runs;
    # a plain set() would give an arbitrary order.
    model.derivedTags = [tag for tag in dict.fromkeys(candidateTags) if tag]

In [55]:
# Turn every collected model into a plain dict (nested dataclasses included)
# and wrap the list under a single top-level key.
jsonEventDictList = [dataclasses.asdict(model) for model in eventDataModels]
jsonMappedData = {'twitterEventData': jsonEventDictList}

# The output file is named after today's date (MM_DD_YYYY) and written to the
# current working directory, so a second run on the same day overwrites it.
dateStamp = datetime.now().strftime('%m_%d_%Y')
outputPath = f'twitterEventData{dateStamp}.json'
with open(outputPath, 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps non-ASCII tweet text readable in the file.
    json.dump(jsonMappedData, file, ensure_ascii=False, indent=4)