# 🏁 Initialize

In [1]:
!pip install tqdm

You should consider upgrading via the 'c:\users\long\sourcecode\personal\data sciencing\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
from dotenv import load_dotenv
load_dotenv()
import os
import datetime

# Absolute path of the current working directory (where the notebook runs).
PATH = os.path.abspath(os.curdir)

## Twitter tweepy client

In [3]:
# Consumer key/secret handed to tweepy.OAuthHandler; read from the environment
# (each value is None when its variable is not set).
API_KEY, API_SECRET_KEY = (
    os.environ.get('TWITTER_BOT_KEY'),
    os.environ.get('TWITTER_BOT_SECRET_KEY'),
)

# Access token pair handed to auth.set_access_token; also read from the environment.
ACCESS_TOKEN, ACCESS_TOKEN_SECRET = (
    os.environ.get('lree9_ACCESS_TOKEN'),
    os.environ.get('lree9_ACCESS_TOKEN_SECRET'),
)

In [16]:
import tweepy

# Build an OAuth 1a handler from the consumer pair, then attach the access token pair.
auth = tweepy.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Fail fast on bad credentials.
# - Catch Exception (not a bare `except:`) so Ctrl-C / SystemExit still propagate,
#   and chain the original error so the real cause stays in the traceback.
# - In tweepy 3.x verify_credentials() returns False on HTTP 401 instead of
#   raising, so the return value has to be checked as well.
try:
    credentials = api.verify_credentials()
except Exception as err:
    raise Exception("Authentication not OK") from err
if not credentials:
    raise Exception("Authentication not OK")

me = api.me()
# me.screen_name

## MongoDB client

In [5]:
# Name of the MongoDB database that receives the dumped profiles and tweets.
DATABASE = "tweetDumpingGround"

# Connection settings read from the environment (None when a variable is unset).
MONG_USER, MONG_PWD = os.environ.get('MONG_USER'), os.environ.get('MONG_PWD')
MONG_HOST, MONG_PORT = os.environ.get('MONG_HOST'), os.environ.get('MONG_PORT')

In [6]:
from pymongo import MongoClient
from bson.objectid import ObjectId

from urllib.parse import quote_plus

# Credentials must be percent-escaped before being embedded in a MongoDB URI;
# otherwise a password containing '@', ':', '/' or '%' yields an invalid or
# mis-parsed connection string.
# NOTE(review): assumes the values in the environment are stored un-escaped — confirm.
uri = (
    f"mongodb://{quote_plus(MONG_USER)}:{quote_plus(MONG_PWD)}"
    f"@{MONG_HOST}:{MONG_PORT}/{DATABASE}"
)
client = MongoClient(uri)
db = client[DATABASE]
db

Database(MongoClient(host=['171.244.50.232:27017'], document_class=dict, tz_aware=False, connect=True), 'tweetDumpingGround')

## Helper functions

In [7]:
import json

def compress_object(origin_obj, attrs_to_save):
    """Copy the selected keys of ``origin_obj`` into a new dict.

    Args:
        origin_obj: Mapping to read values from.
        attrs_to_save: Iterable of keys to copy, in the order they should appear.

    Returns:
        A new dict whose first entry, ``'Not found'``, lists every requested key
        that was absent from ``origin_obj``; the keys that were present follow
        with their original values.
    """
    compact = {'Not found': []}

    for attr in attrs_to_save:
        if attr not in origin_obj:
            # Record the miss instead of failing so the caller can see what was absent.
            compact['Not found'].append(attr)
            continue
        compact[attr] = origin_obj[attr]

    return compact

def pprint(dict_):
    """Print ``dict_`` as JSON with sorted keys and a 4-space indent."""
    rendered = json.dumps(dict_, sort_keys=True, indent=4)
    print(rendered)

In [8]:
from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that drops all markup and accumulates only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset() is needed.
        # convert_charrefs=True makes the parser decode entities such as &amp;
        # before the text reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append one run of character data found outside of tags."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with all tags removed and character references decoded.

    Args:
        html: Markup string, e.g. the ``source`` field of a tweet
            (``'<a href="...">Twitter Web App</a>'``).

    Returns:
        The plain text contained in ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # With convert_charrefs=True the parser holds back trailing text that contains
    # an unterminated '&' (it might be a split character reference) until close()
    # is called. Without this, strip_tags("AT&T") would return "".
    s.close()
    return s.get_data()

# 🎬 Action

## Params

In [9]:
from datetime import datetime, timezone, timedelta
from sys import maxsize as inf

# Account whose profile and timeline are fetched below.
USER_ID = 'billwurtz'
# Maximum number of tweets pulled from the timeline cursor; switch to `inf` for no cap.
LIMIT = 1  # inf

## Get user info

In [10]:
# Look up the account's profile; USER_ID is passed as get_user's first positional argument.
user = api.get_user(USER_ID)
# Keep a handle on the raw payload exposed through the model's underscore-prefixed `_json` attribute.
user_detail = user._json

In [11]:
# Profile fields to persist; names follow the v1.1 user object reference:
# https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/user
attrs_to_save = [
    'id',
    'name',
    'screen_name',
    'location',
    'url',
    'description',
    'protected',
    'verified',
    'followers_count',
    'friends_count',
    'listed_count',
    'favourites_count',
    'statuses_count',
    'created_at',
    'profile_banner_url',
    'profile_image_url_https',
    'default_profile',
    'default_profile_image',
    'withheld_in_countries',
    'withheld_scope',
]

# Keep only the whitelisted fields, then stamp the record with the current time in UTC+7.
my_user_obj = compress_object(user_detail, attrs_to_save)
my_user_obj['Last updated'] = datetime.now(tz=timezone(timedelta(hours=7)))
my_user_obj

{'Not found': ['withheld_in_countries', 'withheld_scope'],
 'id': 289853473,
 'name': 'bill wurtz',
 'screen_name': 'billwurtz',
 'location': '',
 'url': 'https://t.co/WGqYlp4ab6',
 'description': 'is self',
 'protected': False,
 'verified': True,
 'followers_count': 307230,
 'friends_count': 82,
 'listed_count': 344,
 'favourites_count': 22860,
 'statuses_count': 3094,
 'created_at': 'Fri Apr 29 09:57:13 +0000 2011',
 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/289853473/1435339266',
 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1016862391823360000/BeQ1lMU7_normal.jpg',
 'default_profile': False,
 'default_profile_image': False,
 'Last updated': datetime.datetime(2021, 1, 16, 4, 4, 8, 854399, tzinfo=datetime.timezone(datetime.timedelta(seconds=25200)))}

Save to collection "profile"

In [12]:
profile_col = db["profile"]
key = {'id': my_user_obj['id']}
# Collection.update() is deprecated in PyMongo 3 and removed in PyMongo 4.
# replace_one() performs the same whole-document upsert: it replaces the stored
# profile matching `key`, or inserts it when none exists.
# .raw_result shows the server's response document as the cell output.
profile_col.replace_one(key, my_user_obj, upsert=True).raw_result

{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

## Get tweets

In [13]:
# NOTE: disabled cell. The triple-quoted string below is a bare expression, so the
# code inside it never runs; it is an earlier draft that printed each tweet's JSON
# and wrote the tweet text to '<USER_ID>_tweet_dump.txt'. The trailing `pass` is a no-op.
'''
with open(os.path.join(PATH, f'{USER_ID}_tweet_dump.txt'), 'w+', encoding='utf8') as file:
    for tweet in tweepy.Cursor(api.user_timeline, id=USER_ID).items(LIMIT):
        # parsed = json.loads(tweet._json)
        print(json.dumps(tweet._json, indent=4, sort_keys=True))
        # print(tweet)
        # print(dir(tweet))
        file.write(tweet.text +'\n')
'''
pass

In [14]:
# Tweet fields from the newer tweet object model, kept for reference only.
# Previously this list was also assigned to `attrs` and immediately overwritten
# by the list below, so it never had any effect; it now has its own name.
# https://developer.twitter.com/en/docs/twitter-api/data-dictionary/object-model/tweet
attrs_v2 = [
    'id', 'text', 'attachments', 'author_id', 'conversation_id', 'created_at',
    'geo', 'in_reply_to_user_id', 'lang', 'possibly_sensitive', 'reply_settings',
    'source', 'withheld',
    'entities', 'non_public_metrics', 'organic_metrics', 'promoted_metrics', 'public_metrics',
    'referenced_tweets', 'context_annotations',
]

# Tweet fields from the v1.1 tweet object model; this is the list the
# tweet-saving loop passes to compress_object().
# https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet
attrs = [
    'id', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_user_id',
    'in_reply_to_screen_name', 'coordinates', 'place',
    'quoted_status_id', 'is_quote_status', 'quoted_status',
    'retweeted_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count',
    'entities', 'extended_entities', 'possibly_sensitive', 'filter_level', 'lang', 'matching_rules'
]

In [15]:
from tqdm import tqdm

tweet_col = db["tweet"]
# Loop-invariant: every record is stamped in the same UTC+7 zone.
local_tz = timezone(timedelta(hours=+7))

for tweet in tqdm(tweepy.Cursor(api.user_timeline, id=USER_ID).items(LIMIT)):
    raw_tweet = tweet._json
    # pprint(raw_tweet)  # uncomment to inspect the full payload

    tweet_obj = compress_object(raw_tweet, attrs)
    tweet_obj['Last updated'] = datetime.now(tz=local_tz)
    tweet_obj['author_id'] = raw_tweet['user']['id']
    # `source` arrives as an HTML anchor; keep only its visible text.
    tweet_obj['source'] = strip_tags(raw_tweet['source'])

    # Collection.update() is deprecated in PyMongo 3 and removed in PyMongo 4;
    # replace_one() does the same whole-document upsert keyed on the tweet id.
    key = {'id': tweet_obj['id']}
    tweet_col.replace_one(key, tweet_obj, upsert=True)

1it [00:01,  1.01s/it]
