In [92]:
# Standard library
import sys
import logging
import datetime, time
import sqlite3
import pickle
import os
import calendar


import pandas as pd

# bioinfobot
sys.path.append("../")
from bioinfobot.utils.paths import TweetAnalysisPaths

# External library
import tweepy

In [2]:
# creating a tweeter listener
class StreamListener(tweepy.Stream):
    def on_status(self, status):
        print(status.text)
        if status.lang == "en" and "RT".upper() not in status.text:
            stat = status.text
            stat = stat.replace("\n", "")
            stat = stat.replace("\t", "")
            user_id = status.user.id_str
            stat_id = status.id_str
            create = str(status.created_at)
            name = status.user.screen_name
            data = (create, name, user_id, stat_id, stat)

            # TODO: ---------------------
            # NOTE: This needs to be switch (pandas dataframe)
            # Connecting to SQLite3 database
            try:
                db_file = "../../db/bioinfotweet.db"
                conn = sqlite3.connect(db_file, isolation_level=None)
                conn.execute(
                    "PRAGMA journal_mode=wal"
                )  # This will let concurrent read and write to the database.
                c = conn.cursor()
                c.execute(
                    "INSERT INTO tweetscapture (Date, ScreenName, UserID, TweetID, Text) values (?, ?, ?, ?, ?)",
                    data,
                )
                conn.commit()
                cdate = "Tweet inserted at: " + str(datetime.datetime.now())
                logging.info(cdate)
                conn.close()
            except Exception as ex:
                exname = str(ex)
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                logging.info("Sqlite3 database exception occurred.")
                logging.info(message)
            # ---------------------------
            
    def on_error(self, status_code):
        if status_code == 420:
            cdate = "Error code 420 at:" + str(datetime.datetime.now())
            logging.info(cdate)
            logging.info("Sleeping for 15 mins")
            time.sleep(900)
        return False   

In [3]:
# executing code
# Logging configuration
ta_paths = TweetAnalysisPaths()
logging.basicConfig(
    filename=ta_paths.analysis_log,
    level=logging.DEBUG,
    filemode="a",
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)
logging.getLogger(
    "matplotlib.font_manager"
).disabled = True  # removes matplotlib debugg
logging.info("Started Tweet-analysis")

In [4]:
# opening API key 

# TODO: This requires a safer and simpler approach (using dotenv)
def _read_cred():
    # this needs to be set as a dot env file
    with open("../../credentials/bioinfobotmain.txt") as f:
        creds = f.readlines()
        consumer_key = creds[0].rstrip()
        consumer_secret = creds[1].rstrip()
        access_token = creds[2].rstrip()
        access_token_secret = creds[3].rstrip()
        return (consumer_key, consumer_secret, access_token, access_token_secret)

def load_twitter_api():
        consumer_key, consumer_secret, access_token, access_token_secret = _read_cred()
        auth = tweepy.OAuth1UserHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        return auth

def load_listener():
        consumer_key, consumer_secret, access_token, access_token_secret = _read_cred()
        listener = StreamListener(consumer_key, consumer_secret, access_token, access_token_secret)
        return listener
    
def load_followers(path: str) -> list[str]: 
    """loads followers"""
    if os.path.isfile(path):
        try:
            with open(path, "rb") as followers:
                following = []
                following_dict = pickle.load(followers)
                for key, value in following_dict.items():
                    following.append(value)
        except Exception as ex:
            print(ex)
        return following
    else:
        raise FileNotFoundError("Unable to load followers from file path")




In [5]:
api_keys = load_twitter_api()
api = tweepy.API(api_keys)
stream = tweepy.Stream(*_read_cred())
following = load_followers("./following.pickle")

In [115]:
def _parse_tweet_obj(tweet_obj: tweepy.models) -> list[str, float]:
    """
    parses tweeter status reponses and extract the necessary data 
    """
    
    # extracting relevant data
    tweet_text = tweet_obj.full_text.replace("\n", "").replace("\t", "").split()
    tweet_text = " ".join([word for word in tweet_text if not word.startswith("https://t.co/")])
    user_id = tweet_obj.user.id
    status_id = tweet_obj.id_str
    created = str(tweet_obj.created_at.replace(tzinfo=None))
    name = tweet_obj.user.screen_name
    year = tweet_obj.created_at.year
    month = tweet_obj.created_at.month

    # compile into tuple 
    data = (created, name, user_id, status_id, tweet_text, year, month)

    return data

def create_date_range():
    # getting current month and year
    current_month = int(datetime.datetime.today().strftime("%m"))
    current_year = int(datetime.datetime.today().strftime("%Y"))
    last_day =  calendar.monthrange(current_year, current_month)[1]


    # setting a range of dates
    start_date = datetime.datetime(current_year, current_month, 1, 0, 0, 0)
    end_date = datetime.datetime(current_year, current_month, last_day, 0, 0, 0)

    return (start_date, end_date)

def get_tweets():

    # loading keys
    consumer_key, consumer_secret, access_token, access_token_secret = _read_cred()

    # loading followers
    following = load_followers("./following.pickle")

    # setting authentication
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # generating timeline
    start_date, end_date = create_date_range()

    # opening API
    api = tweepy.API(auth)

    tweets = []
    for follower in following:
        
        try:
            tmp_tweets = api.user_timeline(screen_name=follower, count=10, include_rts=False, tweet_mode="extended")
        except:
            print(f"User {follower} does not exists")
            continue

        for tweet in tmp_tweets:
            tweet_created = tweet.created_at.replace(tzinfo=None)
            if tweet_created < end_date and tweet_created > start_date:
                tweets.append(tweet)

    col_names = ["Date", "ScreenName", "UserID", "TweetID", "Text", "year", "month"]
    tweet_data = list(map(_parse_tweet_obj, tweets))

    tweet_data_df = pd.DataFrame(data=tweet_data, columns=col_names)
    tweet_data_df. tweet_data_df
    return tweet_data_df



In [116]:
data = get_tweets()

User gwaygenomics does not exists
User Cbastien_76 does not exists
User HorizonGenomics does not exists
User c4vansite does not exists
User RNAGenomics does not exists
User Fungal_Genome does not exists
User ProjectShivom does not exists
User conesagroup does not exists
User PataPecnerova does not exists
User meyersonlab does not exists
User IndoorEcology does not exists
User 91Mattmoore does not exists
User jordaanblok does not exists
User Thomas_F_Hahn does not exists
User dasersoft does not exists
User SSABiostats does not exists
User jckuga does not exists
User stephen__doyle does not exists
User trashystats does not exists
User murphy2537 does not exists
User judiehowrylak does not exists
User jonxhill does not exists
User BISC_Global_USA does not exists
User jny5alv does not exists
User fourmodern does not exists
User babasaraki01 does not exists
User SaarunyaGeet does not exists
User Brookesloci does not exists
User WGCengage does not exists
User rachomics does not exists
User j


KeyboardInterrupt



In [112]:
data.sort_values(by="Date", ascending=True)

Unnamed: 0,Date,ScreenName,UserID,TweetID,Text,year,month
67,2022-07-01 01:38:41,aeharkess,1029867727,1542684080835137538,@MadLoftin You're the people who inspire me ! ...,2022,7
66,2022-07-01 01:44:21,aeharkess,1029867727,1542685507351715840,@mmjulkowska @BraybrookSA Definitely. Seems ha...,2022,7
65,2022-07-01 01:47:58,aeharkess,1029867727,1542686419113709568,@BraybrookSA Definitely. We're fortunate that ...,2022,7
64,2022-07-01 01:51:28,aeharkess,1029867727,1542687298139160576,@BraybrookSA @mmjulkowska Yeah great point. I'...,2022,7
63,2022-07-01 02:08:39,aeharkess,1029867727,1542691624093995009,"@may_gun @BraybrookSA Yeah, that's definitely ...",2022,7
...,...,...,...,...,...,...,...
3,2022-07-14 16:22:00,AleMedinaRivera,627852903,1547617418222047235,@XiuweiZhang presenting her work on methods to...,2022,7
2,2022-07-14 16:38:24,AleMedinaRivera,627852903,1547621546084093959,@sroyyors,2022,7
44,2022-07-14 16:57:16,aeharkess,1029867727,1547626291314053126,The worst kind of spam email.,2022,7
1,2022-07-14 19:00:21,AleMedinaRivera,627852903,1547657268056838145,Awesome keynote talk by @FurlanLab at @ISCB_Re...,2022,7


In [99]:
df

Unnamed: 0,0,1,2,3,4,5,6
0,2022-07-14 21:59:37,AleMedinaRivera,627852903,1547702380552019968,@ISCB_RegSys @mahonylab @AMathelier Is my plea...,2022,7
1,2022-07-14 19:00:21,AleMedinaRivera,627852903,1547657268056838145,Awesome keynote talk by @FurlanLab at @ISCB_Re...,2022,7
2,2022-07-14 16:38:24,AleMedinaRivera,627852903,1547621546084093959,@sroyyors,2022,7
3,2022-07-14 16:22:00,AleMedinaRivera,627852903,1547617418222047235,@XiuweiZhang presenting her work on methods to...,2022,7
4,2022-07-14 15:57:33,AleMedinaRivera,627852903,1547611266050449410,@sroyyors 's @ISCB_RegSys Keynote on Cell type...,2022,7
...,...,...,...,...,...,...,...
63,2022-07-01 02:08:39,aeharkess,1029867727,1542691624093995009,"@may_gun @BraybrookSA Yeah, that's definitely ...",2022,7
64,2022-07-01 01:51:28,aeharkess,1029867727,1542687298139160576,@BraybrookSA @mmjulkowska Yeah great point. I'...,2022,7
65,2022-07-01 01:47:58,aeharkess,1029867727,1542686419113709568,@BraybrookSA Definitely. We're fortunate that ...,2022,7
66,2022-07-01 01:44:21,aeharkess,1029867727,1542685507351715840,@mmjulkowska @BraybrookSA Definitely. Seems ha...,2022,7
