In [1]:
import tweepy
from dataclasses import dataclass, field
import dataclasses
from typing import List
import re
import json
from datetime import datetime
import stanza
import emoji
import html
import mysql.connector

In [2]:
bearerToken = 'AAAAAAAAAAAAAAAAAAAAAHpjjAEAAAAAn%2BCYkmL02wnLmmY1BEn0%2FhZDcAw%3D1yCPthd5oyDAmQdEGowVHmOWmunSflrAK4nWrxneNwYEZ3VDZT'
client = tweepy.Client(bearer_token=bearerToken)

In [3]:
query = "(#machinelearning OR #AI OR #BigData OR #DataScience OR #Analytics OR #Python OR #ArtificialIntelligence OR #ML OR #DeepLearning OR #TensorFlow OR #PyTorch OR #rstats) (#hiring OR #recruitment OR #recruiting OR #jobs) lang:en has:links -is:retweet"
if len(query) > 512:
    raise "Query longer than 512 characters"
pages = tweepy.Paginator(client.search_recent_tweets, query=query, expansions=['author_id'], tweet_fields=['context_annotations','created_at','author_id','entities','text', 'public_metrics'], user_fields=['username', 'name','location', 'created_at'],limit=400, max_results=100)

In [4]:
@dataclass
class User():
    id: str = ''
    name: str = ''
    username: str = ''
    dateJoined: str = ''
    location: str = ''

@dataclass
class JobTweetData():
    id: str = ''
    user: User = None
    description: str = ''
    datePosted: str = '' 
    likeCount: int = 0
    derivedTags: List[str] = field(default_factory=list)
    links: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)

In [5]:
def get_emoji_free_text(text):
    return emoji.replace_emoji(text, replace='', version=-1)

In [6]:
def remove_special_char(text):
    string = html.unescape(text)
    string = html.unescape(string)
    return re.sub('&lt;/?[a-z]+&gt;', '', string)


jobDataModels = []
for page in pages:
    userList = dict()
    tweets = page
    for user in tweets.includes['users']:
        userModel = User()
        userModel.id = user['id']
        userModel.name = get_emoji_free_text(user['name'])
        userModel.username = user['username']
        userModel.location = user['location']
        userModel.dateJoined = user['created_at'].strftime('%Y-%m-%d')
        userList[userModel.id] = userModel
    for tweet in tweets.data:
        jobTweetModel = JobTweetData()
        jobTweetModel.id = tweet['id']
        currentUser = userList[tweet['author_id']]
        jobTweetModel.user = currentUser
        jobTweetModel.datePosted = tweet['created_at'].strftime('%Y-%m-%d')
        jobTweetModel.description = remove_special_char(get_emoji_free_text(tweet['text']))
        jobTweetModel.likeCount = tweet['public_metrics']['like_count']
        urlList = list()
        for url in tweet['entities']['urls']:
            if re.search('twitter', url['expanded_url']):
                continue
            urlList.append(url['expanded_url'])
        if len(urlList) == 0: 
            continue
        for hashtag in tweet['entities']['hashtags']:
            jobTweetModel.tags.append(hashtag['tag'])
        jobTweetModel.links = urlList
        jobDataModels.append(jobTweetModel)

In [7]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner', download_method=None)

2022-11-23 15:27:45 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-11-23 15:27:45 INFO: Use device: cpu
2022-11-23 15:27:45 INFO: Loading: tokenize
2022-11-23 15:27:45 INFO: Loading: ner
2022-11-23 15:27:46 INFO: Done loading processors!


In [8]:
def remove_hashtags(text):
    return re.sub("#[A-Za-z0-9_]+","", text.replace('-', ''))

for model in jobDataModels:
    processed_data = nlp(remove_hashtags(model.description))
    for sent in processed_data.sentences:
        for ent in sent.ents:
            if(ent.type == 'ORG'):
                for tag in ent.text.split('\n'):
                    model.derivedTags.append(tag.strip())
            elif(ent.type == 'TITLE'):
                model.title = ent.text
    model.tags = list(set(model.tags))
    model.derivedTags = list(set(model.derivedTags))
    print(*[f'entity: {ent.text}\ttype: {ent.type}' ], sep='\n')

entity: 2	type: CARDINAL
entity: 2	type: CARDINAL
entity: 2	type: CARDINAL
entity: EntryLevel	type: ORG
entity: EntryLevel	type: ORG
entity: Coforge	type: GPE
entity: United States	type: GPE
entity: FullStack Cloud Software	type: ORG
entity: Singapore	type: GPE
entity: the Chicago Connectory	type: ORG
entity: Pandas	type: ORG
entity: only ̖$5	type: MONEY
entity: AI	type: ORG
entity: AI	type: ORG
entity: Frisco TX.	type: GPE
entity: Python	type: GPE
entity: United States	type: GPE
entity: United States	type: GPE
entity: United States	type: GPE
entity: Dec. 16, 2022	type: DATE
entity: Dec. 16, 2022	type: DATE
entity: MPH Careers	type: ORG
entity: 3	type: CARDINAL
entity: 3	type: CARDINAL
entity: 2,230	type: MONEY
entity: 1,981	type: MONEY
entity: 1,981	type: MONEY
entity: 1,981	type: MONEY
entity: Remote /Gurgaon, Remote/ IndiaHashtrust Technologies	type: ORG
entity: Remote /Gurgaon, Remote/ IndiaHashtrust Technologies	type: ORG
entity: Remote /Gurgaon, Remote/ IndiaHashtrust Technologie

entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 120000	type: CARDINAL
entity: 120000	type: CARDINAL
entity: Full Stack Engineer	type: ORG
entity: more than 5	type: CARDINAL
entity: 48	type: CARDINAL
entity: 48	type: CARDINAL
entity: Fortune 500	type: ORG
entity: Simulation  Python	type: GPE
entity: Remote  Latam	type: ORG
entity: Remote  Latam	type: ORG
entity: Python	type: GPE
entity: Python	type: GPE
entity: Python	type: GPE
entity: English	type: LANGUAGE
entity: Synergy	type: ORG
entity: Synergy	type: ORG
entity: 2022	type: DATE
entity: 2022	typ

entity: Google Reviews	type: ORG
entity: Google Reviews	type: ORG
entity: Python	type: GPE
entity: Python	type: GPE
entity: CoW Protocol CoW Protocol	type: LAW
entity: Python 
 NannyML company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile company	type: ORG
entity: Python Full Stack Software Developer 
 Optimile c

entity: Simulation  Python	type: ORG
entity: Simulation  Python	type: ORG
entity: Simulation  Python	type: ORG
entity: Simulation  Python	type: ORG
entity: Cryptocurrency Trading  London 
 GSR company	type: ORG
entity: Lead Application Security Engineer 
 Galaxy	type: ORG
entity: Lead Application Security Engineer 
 Galaxy	type: ORG
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 2022	type: DATE
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	type: CARDINAL
entity: 433	ty

entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: API Developer 
 Zilliqa company	type: ORG
entity: Jenkins / Java	type: ORG
entity: Jenkins / Java	type: ORG
entity: Jenkins / Java	type: ORG
entity: Jenkins / Java	type: ORG
entity: Jenkins / Java	type: ORG
entity: Python	type: GPE
entity: Python	type: GPE
entity: Python	type: GPE
entity: Python	type: GPE
entity: Karnataka	type: GPE
entity: Karnataka	type: GPE
entity: Jenkins / Java	type: ORG
entity: Jenkins / Java	type: ORG
entity: Jenki

entity: 3.000	type: CARDINAL
entity: Azure	type: ORG
entity: Azure	type: ORG
entity: Montreal	type: GPE
entity: Cloud Architect	type: PERSON
entity: Daily	type: DATE
entity: Daily	type: DATE


In [9]:
jsonMappedData = dict()
jsonJobDictList = []
for model in jobDataModels:
    jsonJobDictList.append(dataclasses.asdict(model))
jsonMappedData['twitterJobData'] = jsonJobDictList

with open('twitterData'+datetime.now().strftime('%m_%d_%Y')+'.json', 'w', encoding='utf-8') as file:
    json.dump(jsonMappedData, file, ensure_ascii=False, indent=4)

In [10]:
conn = mysql.connector.connect(
    user='root', password='test@123', host='127.0.0.1', database='ai_jobs_database'
)

cursor = conn.cursor()

In [11]:
userInsertQuery = """ INSERT IGNORE INTO twitter_user(user_id, name, user_name, location, date_joined)
                      VALUES (%s, %s, %s, %s, %s);
                  """
jobTweetInsertQuery = """INSERT IGNORE INTO job_tweets(tweet_id, user_id, description, date_posted, like_count)
                         VALUES (%s, %s, %s, %s, %s);
                      """
twitterTagInsertQuery = """INSERT IGNORE INTO twitter_tag(tag_name, tweet_id)
                           VALUES (%s, %s)
                        """
derivedTagInsertQuery = """INSERT IGNORE INTO derived_tag(der_tag_name, tweet_id)
                           VALUES (%s, %s)
                        """
urlInsertQuery = """INSERT IGNORE INTO tweet_url(url, tweet_id)
                    VALUES (%s, %s)
                 """

In [12]:
userData = []
jobTweetData = []
twitterTagData = []
derivedTagData = []
urlData = []
for model in jobDataModels:
    userData.append((model.user.id, model.user.name, model.user.username, model.user.location, model.user.dateJoined))
    jobTweetData.append((model.id, model.user.id, model.description, model.datePosted, model.likeCount))
    for tag in model.tags:
        twitterTagData.append((tag, model.id))
    for tag in model.derivedTags:
        derivedTagData.append((tag, model.id))
    for link in model.links:
        urlData.append((link, model.id))

try:
    cursor.executemany(userInsertQuery, userData)
    cursor.executemany(jobTweetInsertQuery, jobTweetData)
    cursor.executemany(twitterTagInsertQuery, twitterTagData)
    cursor.executemany(derivedTagInsertQuery, derivedTagData)
    cursor.executemany(urlInsertQuery, urlData)
    conn.commit()
except Exception as e:
    conn.rollback()
    print(e)
    
conn.close()