In [1]:
import dataclasses
import html
import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import List

import emoji
import mysql.connector
import stanza
import tweepy

In [2]:
# The bearer token is a credential, so it is read from the environment instead of
# being stored in the notebook. Export TWITTER_BEARER_TOKEN before starting the
# kernel. Any token that was previously committed with this notebook should be
# treated as leaked and revoked.
bearerToken = os.environ.get('TWITTER_BEARER_TOKEN')
if not bearerToken:
    raise RuntimeError('Set the TWITTER_BEARER_TOKEN environment variable before running this notebook')
client = tweepy.Client(bearer_token=bearerToken)

In [16]:
# Search query: at least one data/ML hashtag AND at least one hiring hashtag,
# English only, must contain a link, retweets excluded.
query = (
    "(#machinelearning OR #AI OR #BigData OR #DataScience OR #Analytics OR #Python"
    " OR #ArtificialIntelligence OR #ML OR #DeepLearning OR #TensorFlow OR #PyTorch OR #rstats)"
    " (#hiring OR #recruitment OR #recruiting OR #jobs)"
    " lang:en has:links -is:retweet"
)
# Fail early on an over-long query. 512 is the cap this notebook assumes for the
# search endpoint -- confirm against the access tier in use.
if len(query) > 512:
    # A bare string cannot be raised in Python 3; raise a real exception type.
    raise ValueError("Query longer than 512 characters")
# Paginated recent-search request. `expansions=['author_id']` asks for the author
# objects alongside the tweets; `max_results` is the page size and `limit` bounds
# how many pages the paginator will fetch.
pages = tweepy.Paginator(
    client.search_recent_tweets,
    query=query,
    expansions=['author_id'],
    tweet_fields=['context_annotations', 'created_at', 'author_id', 'entities', 'text', 'public_metrics'],
    user_fields=['username', 'name', 'location', 'created_at'],
    limit=400,
    max_results=100,
)

In [17]:
@dataclass
class User:
    """Tweet author, reduced to the profile fields this notebook keeps.

    Every field defaults to an empty string, so ``User()`` yields a blank
    record that can be filled in attribute by attribute.
    """

    # Account identifier.
    id: str = ''
    # Display name.
    name: str = ''
    # Account handle.
    username: str = ''
    # Account creation date, stored as text.
    dateJoined: str = ''
    # Profile location.
    location: str = ''

@dataclass
class JobTweetData:
    """One job-related tweet together with its author and extracted metadata.

    All fields have defaults, so ``JobTweetData()`` builds an empty record. The
    list fields use ``default_factory`` so every instance owns its own lists.
    """

    # Tweet identifier.
    id: str = ''
    # Author record; stays None until one is attached. Written as a string
    # annotation so the declared type matches the None default.
    user: "User | None" = None
    # Tweet text.
    description: str = ''
    # Posting date, stored as text.
    datePosted: str = ''
    # Number of likes.
    likeCount: int = 0
    # Tags derived from the tweet text by later processing.
    derivedTags: List[str] = field(default_factory=list)
    # Links attached to the tweet.
    links: List[str] = field(default_factory=list)
    # Hashtags attached to the tweet.
    tags: List[str] = field(default_factory=list)
    # Job title. Declared as a real field so that a value assigned to
    # ``title`` is included by ``dataclasses.asdict`` instead of being dropped.
    # Added last, with a default, so existing positional construction still works.
    title: str = ''

In [18]:
def get_emoji_free_text(text):
    """Return ``text`` with emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji`` with ``replace=''`` and ``version=-1``.
    """
    without_emoji = emoji.replace_emoji(text, replace='', version=-1)
    return without_emoji

In [19]:
def remove_special_char(text):
    """Decode HTML entities in ``text`` and strip simple HTML tags.

    The text is unescaped twice so that doubly-encoded entities such as
    ``&amp;amp;`` also collapse to the plain character. Tags of the form
    ``<name>`` or ``</name>`` (lowercase letters only) are then removed.

    Args:
        text: Raw text, possibly containing HTML entities.

    Returns:
        The decoded text with simple tags removed.
    """
    string = html.unescape(html.unescape(text))
    # After unescaping, tag delimiters are literal '<' and '>', so a pattern that
    # only looks for '&lt;' / '&gt;' would never see an ordinary tag. Match the
    # literal form, and keep matching the entity form in case it survives decoding.
    return re.sub(r'(?:<|&lt;)/?[a-z]+(?:>|&gt;)', '', string)


# Flatten every result page into JobTweetData records. A tweet is kept only if
# it carries at least one link that does not point back at Twitter.
jobDataModels = []
for page in pages:
    # A page without matches has no tweets to iterate (its `data` may be None)
    # and no 'users' expansion, so skip it instead of crashing.
    if not page.data:
        continue

    # Index this page's expanded author objects by id so each tweet can be
    # joined to its author through `author_id`.
    userList = dict()
    for user in (page.includes or {}).get('users', []):
        userList[user['id']] = User(
            id=user['id'],
            name=get_emoji_free_text(user['name']),
            username=user['username'],
            dateJoined=user['created_at'].strftime('%m/%d/%Y'),
            location=user['location'],
        )

    for tweet in page.data:
        # `entities` can be empty/None when the tweet has none attached.
        entities = tweet['entities'] or {}

        # Keep outbound links only; anything mentioning 'twitter' is dropped.
        urlList = [
            url['expanded_url']
            for url in entities.get('urls', [])
            if 'twitter' not in url['expanded_url']
        ]
        if not urlList:
            continue

        jobDataModels.append(JobTweetData(
            id=tweet['id'],
            # A missing author is still a KeyError: that would be a data problem
            # worth surfacing, not something to skip silently.
            user=userList[tweet['author_id']],
            description=remove_special_char(get_emoji_free_text(tweet['text'])),
            datePosted=tweet['created_at'].strftime('%m/%d/%Y'),
            likeCount=tweet['public_metrics']['like_count'],
            links=urlList,
            tags=[hashtag['tag'] for hashtag in entities.get('hashtags', [])],
        ))

In [None]:
# English stanza pipeline limited to tokenization and named-entity recognition.
# NOTE(review): download_method=None presumably means no model download is
# attempted, so the models must already be available locally -- confirm.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,ner',
    download_method=None,
)

2022-11-12 21:44:13 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-11-12 21:44:13 INFO: Use device: cpu
2022-11-12 21:44:13 INFO: Loading: tokenize
2022-11-12 21:44:13 INFO: Loading: ner


In [14]:
def remove_hashtags(text):
    """Return ``text`` with every hyphen and every ``#hashtag`` token removed.

    Hyphens are deleted first, so a tag such as ``#data-science`` is removed
    as a whole. A hashtag is ``#`` followed by one or more letters, digits or
    underscores; surrounding whitespace is left as it is.
    """
    dehyphenated = text.replace('-', '')
    return re.sub("#[A-Za-z0-9_]+", "", dehyphenated)

# Run NER over each tweet's text (hashtags removed) and derive tags from the
# entities that were recognised.
for model in jobDataModels:
    processed_data = nlp(remove_hashtags(model.description))

    # Flatten once so the same entity list feeds both the tagging and the report.
    entities = [ent for sent in processed_data.sentences for ent in sent.ents]

    for ent in entities:
        if ent.type == 'ORG':
            # An organisation span may run across line breaks; store each
            # non-empty line as its own tag.
            for tag in ent.text.split('\n'):
                tag = tag.strip()
                if tag:
                    model.derivedTags.append(tag)
        elif ent.type == 'TITLE':
            # NOTE(review): confirm the loaded NER model emits a 'TITLE' type; the
            # sample output of this cell only shows ORG and GPE. The value reaches
            # dataclasses.asdict only if JobTweetData declares a `title` field.
            model.title = ent.text

    # De-duplicate while keeping first-seen order, so the result is stable
    # between runs (a plain set() has no guaranteed order).
    model.tags = list(dict.fromkeys(model.tags))
    model.derivedTags = list(dict.fromkeys(model.derivedTags))

    # Report every entity found in *this* tweet. The comprehension iterates the
    # current entities instead of reusing a loop variable left over from an
    # earlier iteration; tweets without entities print nothing.
    if entities:
        print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in entities], sep='\n')

entity: Snowflake	type: ORG
entity: Snowflake	type: ORG
entity: Python	type: GPE
entity: Python	type: GPE
entity: Python	type: GPE


In [15]:
# Serialise every collected record to plain dicts and wrap them under one
# top-level key.
jsonJobDictList = [dataclasses.asdict(model) for model in jobDataModels]
jsonMappedData = {'twitterJobData': jsonJobDictList}

# The output file name carries today's date (month_day_year).
outputFileName = 'twitterData' + datetime.now().strftime('%m_%d_%Y') + '.json'
with open(outputFileName, 'w', encoding='utf-8') as file:
    # ensure_ascii=False writes non-ASCII characters as they are instead of \u escapes.
    json.dump(jsonMappedData, file, ensure_ascii=False, indent=4)