In [17]:
import logging
import boto3
from botocore.exceptions import ClientError
import tweepy
import time
import csv
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from settings import consumer_key, consumer_secret, access_token, access_token_secret

In [2]:
pwd

'/mnt/c/Users/arnop/Documents/self_dev/twitter_api_test/scripts'

In [3]:
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client("s3")
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [4]:
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

    companies = ["blendle", "cafeyn", "milibris", "readly"]

In [5]:
    # 30 days ago - which is the maximum time Twitter allows you to go in the past.
    date_since = datetime.now() - timedelta(days=30)
    date_since = date_since.strftime("%Y-%m-%d")
    date_now = datetime.now().strftime("%Y_%m_%d")

    # create results folder if it doesn't exist yet
    Path("../results").mkdir(parents=True, exist_ok=True)

    # Load previous data if it exists
    last_31days_results_path: Path = Path("../results/twitter_searches_last_31_days.tsv")
    new_results_path = Path("../results/twitter_searches_incremental.tsv")

In [18]:
last_31days_results_path

PosixPath('../results/twitter_searches_last_31_days.tsv')

In [19]:
last_31days_results_path.is_file()

True

In [20]:
    last_31days_results = (
        pd.read_csv(last_31days_results_path, sep="\t")
        if last_31days_results_path.is_file()
        else pd.DataFrame(columns=["id"])
    )

    print(f"the length of it is {len(last_31days_results)}")

the length of it is 228


In [21]:
    # Create logs folder if it doesn't exist yet
    Path("../logs").mkdir(parents=True, exist_ok=True)

    logs_path = Path("../logs/logs.csv")

    logs = (
        pd.read_csv(logs_path)
        if logs_path.is_file()
        else pd.DataFrame(columns=["imported_at", "company", "total_rows"])
    )

In [22]:
    # initialize empty df for last 31 days - so we can add each company tweet id's to it in the loop
    col_names = [
        "id",
        "iso_language_code",
        "created_at",
        "screen_name",
        "text",
        "location",
        "favorite_count",
        "retweet_count",
        "queried_at",
        "company",
    ]

    last_31days_container = pd.DataFrame(columns=col_names)

In [23]:
    for company in companies:
        print("Starting with {}...".format(company))
        query = company + " -filter:retweets"

        tweets = tweepy.Cursor(
            api.search,
            q=query,
            # geocode="51.969685,4.051642,1000km",
            count=100,
            result_type="recent",
            include_entities=True,
            since=date_since,
            tweet_mode="extended",
        ).items(1000)
        locs = [
            [
                tweet.id,
                tweet.metadata["iso_language_code"],
                tweet.created_at,
                tweet.user.screen_name,
                tweet.full_text,
                tweet.user.location,
                tweet.favorite_count,
                tweet.retweet_count,
                datetime.now(),
                company,
            ]
            for tweet in tweets
        ]

        # latest data
        df = pd.DataFrame(
            data=locs,
            columns=col_names,
        )
        print(f"original size of df: {len(df)}")

        # Identify what values are in last_results and not in df
        existing_ids = list(set(last_31days_results.id).intersection(df.id))
        print(f"existing id's: {len(existing_ids)}")
        # Exclude rows that contain id's that we already have from a previous iteration
        df = df[~df.id.isin(existing_ids)]
        print(f"new size of df: {len(df)}")

        # Append new rows to existing result set
        df.to_csv(
            new_results_path,
            mode="a",
            header=not Path(new_results_path).is_file(),
            index=False,
            sep="\t",
        )

        # Upload to s3
        upload_file(
            "../results/twitter_searches_incremental.tsv",
            "arno12-tweets",
            "all-tweets/twitter_searches_incremental.tsv",
        )

        # Save a version with the last 31 days only
        df_last_31_days = df[df.created_at > datetime.now() - pd.to_timedelta("31day")]

        last_31days_container = pd.concat([last_31days_container, df_last_31_days])
        print(f"new length of last 31 days file is {len(last_31days_container)}")

        # Print logs
        print(
            "Done! Wrote a total of {} new row(s) for {}.".format(
                len(df.index), company
            )
        )

        # Generate logs
        logs = pd.DataFrame(
            data=[[datetime.now().timestamp(), company, len(df.index)]],
            columns=["imported_at", "company", "total_rows"],
        )

        logs.to_csv(
            logs_path, mode="a", header=not Path(logs_path).is_file(), index=False
        )


Starting with blendle...
original size of df: 113
existing id's: 59
new size of df: 54
new length of last 31 days file is 54
Done! Wrote a total of 54 new row(s) for blendle.
Starting with cafeyn...
original size of df: 130
existing id's: 57
new size of df: 73
new length of last 31 days file is 127
Done! Wrote a total of 73 new row(s) for cafeyn.
Starting with milibris...
original size of df: 6
existing id's: 5
new size of df: 1
new length of last 31 days file is 128
Done! Wrote a total of 1 new row(s) for milibris.
Starting with readly...
original size of df: 190
existing id's: 95
new size of df: 95
new length of last 31 days file is 223
Done! Wrote a total of 95 new row(s) for readly.


In [36]:
last_31days_container.to_csv(
    '../results/twitter_searches_last_31_days.tsv',
    index=False,
    sep="\t",
)

In [32]:
last_31days_results_path

PosixPath('../results/twitter_searches_last_31_days.tsv')

In [31]:
last_31days_container

Unnamed: 0,id,iso_language_code,created_at,screen_name,text,location,favorite_count,retweet_count,queried_at,company
0,1414200410856898561,nl,2021-07-11 12:30:27,nieuwsselectie,Onbeperkt tijdschriften en het nieuws van alle...,,0,0,2021-07-11 14:52:09.977597,blendle
1,1414181907529613314,nl,2021-07-11 11:16:55,NadineBoke,@ikheetIngeborg @aliettejonkers @RVWetjo Ik he...,Amsterdam,0,0,2021-07-11 14:52:09.977639,blendle
2,1414181243395231745,nl,2021-07-11 11:14:17,NadineBoke,@ikheetIngeborg @aliettejonkers @RVWetjo Pffff...,Amsterdam,0,0,2021-07-11 14:52:09.977659,blendle
3,1414179991928123399,nl,2021-07-11 11:09:18,ikheetIngeborg,@NadineBoke @aliettejonkers @RVWetjo Via deze ...,The Hague,2,0,2021-07-11 14:52:09.977689,blendle
4,1414004388167884804,en,2021-07-10 23:31:31,holofiche,Read this article from The New York Times: D.C...,,0,0,2021-07-11 14:52:09.977706,blendle
...,...,...,...,...,...,...,...,...,...,...
162,1411641333358399493,de,2021-07-04 11:01:35,StatementMedien,"Die Juli/August-Ausgabe 2021 von [Statement], ...","Blutgasse 3, A-1010 Wien",0,0,2021-07-11 14:52:19.322046,readly
163,1411640745799286790,en,2021-07-04 10:59:15,SpyDeals_NL,Probeer 2 maanden gratis Readly https://t.co/C...,Nederland,0,0,2021-07-11 14:52:19.322057,readly
164,1411638625520046080,de,2021-07-04 10:50:49,OeJC,"Die Juli/August-Ausgabe 2021 von [Statement], ...","1010 Wien, Blutgasse 3",1,1,2021-07-11 14:52:19.322068,readly
165,1411625870616412166,sv,2021-07-04 10:00:08,TommyPalomaki,Registrera dig för att få 1 månad helt gratis ...,Stockholm,0,0,2021-07-11 14:52:19.322078,readly
