In [5]:
import logging
import boto3
from botocore.exceptions import ClientError
import tweepy
import time
import csv
import os
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from settings import consumer_key, consumer_secret, access_token, access_token_secret

In [43]:
def glimpse(df, maxvals=10, maxlen=110):
    """Print a compact, one-line-per-column overview of a DataFrame.

    The first printed line is the frame's shape. Every following line
    describes one column: its (padded) name, dtype, count and percentage of
    missing values, and the first few values, for example
    ``a  float64 1 (33%) NAs : 1.0, nan, 3.0``.

    :param df: DataFrame to summarise
    :param maxvals: Maximum number of leading values shown per column
    :param maxlen: Maximum number of characters printed per column line
    :return: None; the summary is written to stdout
    """
    print('Shape: ', df.shape)

    n_rows, n_cols = df.shape
    if n_cols == 0:
        # Nothing to describe; max() over an empty list below would raise.
        return

    def pad(strings):
        """Left-justify every string to the width of the longest one."""
        width = max(len(s) for s in strings)
        return [s.ljust(width) for s in strings]

    # Positional access keeps this correct when column labels are duplicated.
    columns = [df.iloc[:, i] for i in range(n_cols)]

    # Column Name -- str() so non-string labels (e.g. default integer labels) work
    names = pad([str(label) for label in df.columns])

    # Column Type
    described = pad([f"{name} {col.dtype}" for name, col in zip(names, columns)])

    # Num NAs -- the percentage is reported as 0 for a frame without rows
    na_counts = [int(col.isnull().sum()) for col in columns]
    na_labels = [
        f"{count} ({int(round(count * 100 / n_rows)) if n_rows else 0}%)"
        for count in na_counts
    ]
    na_width = max(len(label) for label in na_labels)

    for text, na_label, col in zip(described, na_labels, columns):
        # Values
        values = ', '.join(str(v) for v in col.iloc[:maxvals])
        line = f"{text} {na_label.rjust(na_width)} NAs : {values}"
        # Trim to maxlen
        print(line[:maxlen])


In [6]:
# Work from the project folder so the relative ./results and ./logs paths used
# further down resolve there. The location can be overridden through the
# TWITTER_API_TEST_DIR environment variable; the default stays the original
# machine-specific path.
os.chdir(os.environ.get('TWITTER_API_TEST_DIR', '/mnt/c/Users/arnop/Documents/self_dev/twitter_api_test'))

In [7]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # boto3's managed transfer re-raises client errors that occur during the
    # upload as S3UploadFailedError, so both types must be caught for a failed
    # upload to be logged and reported as False instead of propagating.
    from boto3.exceptions import S3UploadFailedError

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client("s3")
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, S3UploadFailedError) as e:
        logging.error(e)
        return False
    return True

In [8]:
    # Build the OAuth handler from the four credentials imported from settings
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # API client; the wait_on_rate_limit* flags are passed so tweepy handles
    # rate limiting itself (presumably by pausing and printing a notice --
    # confirm against the installed tweepy version)
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    # Search terms: the search loop runs one query per entry
    companies = ["blendle", "cafeyn", "milibris", "readly"]

In [23]:
    # Start of the search window: 30 days back, as a YYYY-MM-DD string.
    # The original note states that 30 days is the furthest Twitter lets you
    # look back -- TODO confirm for the search endpoint in use.
    date_since = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    # Today's date with underscores as separators
    date_now = datetime.now().strftime("%Y_%m_%d")

    # Rolling file with the previously seen tweets, and the append-only file
    # that receives the new rows
    last_31days_results_path: Path = Path("./results/twitter_searches_last_31_days.tsv")
    new_results_path = Path("./results/twitter_searches_incremental.tsv")

    # Make sure the results folder is there before either file is touched
    Path("./results").mkdir(parents=True, exist_ok=True)

In [118]:
# Show whether a results file from a previous run is present
last_31days_results_path.is_file()

True

In [124]:
# Load the rolling result set written by a previous run.
# On a first run the file does not exist yet: fall back to an empty frame and
# skip the age filter, which would otherwise raise KeyError because the
# fallback frame has no 'created_at' column.
if last_31days_results_path.is_file():
    last_31days_results = pd.read_csv(
        last_31days_results_path, sep="\t", parse_dates=['queried_at', 'created_at']
    )
    # Keep only rows created within the last 31 days. Timestamps are made
    # timezone-naive first so they can be compared with the naive "today".
    cutoff = pd.to_datetime('today') - pd.to_timedelta("31day")
    created_naive = last_31days_results['created_at'].apply(lambda t: t.tz_localize(None))
    last_31days_results = last_31days_results[created_naive > cutoff]
else:
    last_31days_results = pd.DataFrame(columns=["id"])

print(f"the length of it is {last_31days_results.shape[0]}")

the length of it is 375


In [110]:
    # Make sure the logs folder is there before the log file is touched
    Path("./logs").mkdir(parents=True, exist_ok=True)

    logs_path = Path("./logs/logs.csv")

    # Reuse the log history of earlier runs when there is one; otherwise start
    # from an empty frame that already has the log columns
    if logs_path.is_file():
        logs = pd.read_csv(logs_path)
    else:
        logs = pd.DataFrame(columns=["imported_at", "company", "total_rows"])

In [111]:
    # Column layout shared by every per-company search result frame
    col_names = [
        "id",
        "iso_language_code",
        "created_at",
        "screen_name",
        "text",
        "location",
        "favorite_count",
        "retweet_count",
        "queried_at",
        "company",
    ]

    # Seed the rolling container with the rows already known to fall within the
    # last 31 days; the search loop only appends rows that are new. Starting
    # from an empty frame would make the rolling file written from this
    # container forget every previously seen tweet, so the next run would treat
    # them as new and append them to the incremental file a second time.
    # reindex() returns a new frame restricted to col_names; columns missing in
    # the loaded frame (e.g. on a first run, when it only has "id") come out empty.
    last_31days_container = last_31days_results.reindex(columns=col_names)

In [126]:
    # For each company: search recent tweets, keep only the ones not already in
    # the last-31-days result set, append those to the incremental file, upload
    # that file to S3 and append one line to the run log.
    for company in companies:
        print("Starting with {}...".format(company))
        # Search for the company name, leaving retweets out
        query = company + " -filter:retweets"

        tweets = tweepy.Cursor(
            api.search,
            q=query,
            # geocode="51.969685,4.051642,1000km",
            count=100,
            result_type="recent",
            include_entities=True,
            since=date_since,
            tweet_mode="extended",
        ).items(1000)
        # One row per tweet, values in col_names order
        locs = [
            [
                tweet.id,
                tweet.metadata["iso_language_code"],
                tweet.created_at,
                tweet.user.screen_name,
                tweet.full_text,
                tweet.user.location,
                tweet.favorite_count,
                tweet.retweet_count,
                datetime.now(),
                company,
            ]
            for tweet in tweets
        ]

        # latest data
        df = pd.DataFrame(
            data=locs,
            columns=col_names,
        )
        print(f"original size of new df for {company}: {len(df)}")

        # Ids that are present both in the previous results and in this search
        existing_ids = list(set(last_31days_results.id).intersection(df.id))
        print(f"existing id's of {company} in last 31 days: {len(existing_ids)}")
        # Exclude rows that contain id's that we already have from a previous iteration
        new_ids = df[~df.id.isin(existing_ids)]

        # Append new rows to existing result set; the header is only written
        # when the file is being created
        new_ids.to_csv(
            new_results_path,
            mode="a",
            header=not new_results_path.is_file(),
            index=False,
            sep="\t",
        )

        # Print logs
        print(f"Done! Wrote a total of {len(new_ids)} new row(s) for {company}")

        # Upload to s3 -- the local path comes from new_results_path so the
        # uploaded file is always the one that was just written
        uploaded = upload_file(
            str(new_results_path),
            "arno12-tweets",
            "all-tweets/twitter_searches_incremental.tsv",
        )
        if not uploaded:
            # upload_file reports failure through its return value; surface it
            # instead of silently continuing
            logging.warning("Upload of %s to S3 failed for %s", new_results_path, company)

        last_31days_container = pd.concat([last_31days_container, new_ids])
        print(f"The new length of the last 31 days file is {len(last_31days_container)}")

        # Generate logs: one line per company and run, with the number of
        # tweets returned by the search
        logs = pd.DataFrame(
            data=[[datetime.now().timestamp(), company, len(df.index)]],
            columns=["imported_at", "company", "total_rows"],
        )

        logs.to_csv(
            logs_path, mode="a", header=not Path(logs_path).is_file(), index=False
        )


Starting with blendle...
original size of new df for blendle: 99
existing id's of blendle in last 31 days: 99
Done! Wrote a total of 0 new row(s) for blendle
The new length of the last 31 days file is 375
Starting with cafeyn...
original size of new df for cafeyn: 107
existing id's of cafeyn in last 31 days: 107
Done! Wrote a total of 0 new row(s) for cafeyn
The new length of the last 31 days file is 375
Starting with milibris...
original size of new df for milibris: 5
existing id's of milibris in last 31 days: 5
Done! Wrote a total of 0 new row(s) for milibris
The new length of the last 31 days file is 375
Starting with readly...
original size of new df for readly: 164
existing id's of readly in last 31 days: 164
Done! Wrote a total of 0 new row(s) for readly
The new length of the last 31 days file is 375


In [123]:
# Persist the rolling container to the same path the loader reads on the next
# run (last_31days_results_path instead of a second copy of the literal).
# Rows are de-duplicated on (id, company) first, so re-running the search loop
# in the same kernel cannot write the same tweet twice.
last_31days_container.drop_duplicates(subset=["id", "company"]).to_csv(
    last_31days_results_path,
    index=False,
    sep="\t",
)

In [114]:
# Display the container (pandas truncates long frames in the rendered output).
# NOTE(review): the rendered rows include user handles, locations and tweet
# text -- consider clearing the output before sharing the notebook.
last_31days_container

Unnamed: 0,id,iso_language_code,created_at,screen_name,text,location,favorite_count,retweet_count,queried_at,company
0,1416329991810387969,nl,2021-07-17 09:32:38,walrusit,Lees dit artikel uit De Standaard: ‘Er zou zov...,"Katwijk aan Zee, NL",0,0,2021-07-17 12:59:53.747461,blendle
1,1416314559946690560,nl,2021-07-17 08:31:19,nieuwsselectie,Onbeperkt tijdschriften en het nieuws van alle...,,0,4,2021-07-17 12:59:53.747477,blendle
2,1416305245924298754,nl,2021-07-17 07:54:18,MartijnRoyFFP,Lees dit artikel uit de Volkskrant: Als het ma...,The Netherlands,0,0,2021-07-17 12:59:53.747482,blendle
3,1416302676913106945,nl,2021-07-17 07:44:06,JensOldeKalter,"COCAÏNE,de motor van de criminele schaduwecono...",Amsterdam,1,0,2021-07-17 12:59:53.747487,blendle
4,1416134386853072902,nl,2021-07-16 20:35:22,m_wiersma,Lees dit artikel uit Quote: DE MACHT VAN DE WE...,NL,0,0,2021-07-17 12:59:53.747491,blendle
...,...,...,...,...,...,...,...,...,...,...
159,1413422682385887236,en,2021-07-09 09:00:02,PrincessMagUK,Discover the new @Dior Summer collection withi...,United Kingdom,0,0,2021-07-17 13:00:00.591093,readly
160,1413417884156583936,en,2021-07-09 08:40:58,LukeB_MTB,@LUDENClassics Been following the build in a c...,"Bramber, probably.",0,0,2021-07-17 13:00:00.591098,readly
161,1413411548526227457,en,2021-07-09 08:15:47,PrincessMagUK,Issue 14 of Princess Magazine is now out featu...,United Kingdom,0,0,2021-07-17 13:00:00.591103,readly
162,1413410287840026624,en,2021-07-09 08:10:47,RuralLifeUK,The Summer Edition of #RuralLife is Now Out! F...,"South West, England",0,0,2021-07-17 13:00:00.591108,readly
