# Capturando tweets direcionados a candidatos

[Voltar ao Índice](00_indice.ipynb)

**FAZER**
* Lidar com o caso de mais de um twitter por candidato (tanto de mais de uma linha na tabela **Feito** quanto de mais de um usuário listado na mesma linha).
* Selecionar apenas tweets direcionados a uma única pessoa; **Feito**
* Identificar violência direcionada a outras pessoas que não o mencionado. **Feito**
* Ignorar tweets que não contêm texto. **Feito**

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import os
import time
import matplotlib.pyplot as pl
import json
import traceback
from pathlib import Path


import xavy.twitter as tw
import xavy.explore as xe
import xavy.dataframes as xd
#import speechwrapper as sw

## Funções

In [15]:
def extract_twitter_username(series, lower=True):
    """
    Extract Twitter usernames from a Series of profile URLs or handles.

    A username is the run of word characters that follows either a
    'twitter.com/' (or 'twitter.com.br/') prefix, optionally followed
    by '@', or a bare '@'.

    Parameters
    ----------
    series : Series
        Strings containing the URL or the username of Twitter
        accounts.
    lower : bool
        If truthy, return lower case usernames (Twitter usernames
        are not case-sensitive).

    Returns
    -------
    username : Series
        Series with the same index as the input. NaN is returned
        when no username is found.
    """

    # Raw string: '\.' and '\w' are invalid escape sequences in a
    # regular string literal and trigger a warning on current Python.
    pattern = r'(?:[Tt]wit+er\.com(?:\.br)?/@?|@)(\w+)'

    username = series.str.extract(pattern)[0]
    if lower:
        username = username.str.lower()

    return username

def request_twitter_user_info(df, username_col='twitter_username', cand_id_col='SQ_CANDIDATO', requests_per_window=900):
    """
    Look up Twitter accounts with `tw.lookup` and attach the
    candidate identifier to each account found.

    Parameters
    ----------
    df : DataFrame
        Table containing the twitter username and a candidate
        identifier (e.g. 'SQ_CANDIDATO').
    username_col : str
        Name of the column containing the twitter usernames.
        The join below matches it against the lower-cased API
        username, so it is presumably already lower case.
    cand_id_col : str
        Column identifying the candidate.
    requests_per_window : int
        Forwarded to `tw.lookup`; presumably the maximum number
        of requests allowed by the API in a 15-minute window.

    Returns
    -------
    response_df : DataFrame
        One row per account in the API response 'data', with the
        candidate ID (as str) joined in and every dict-valued
        column expanded into one column per key.
    """

    # Query the API once per distinct, non-missing username:
    unique_usernames = df[username_col].dropna().drop_duplicates()
    api_answer = tw.lookup(unique_usernames, requests_per_window=requests_per_window)

    # One row per account returned:
    users_df = pd.DataFrame(api_answer['data'])

    # Attach the candidate ID, matching on the lower-cased username:
    users_df['lower_name'] = users_df['username'].str.lower()
    id_by_username = df.set_index(username_col)[cand_id_col].astype(str)
    users_df = users_df.join(id_by_username, on='lower_name')

    # Flatten nested data. Only the columns present at this point are
    # inspected, and a column counts as nested when its first value is a dict:
    for col in list(users_df.columns):
        first_value = users_df[col].iloc[0]
        if type(first_value) is not dict:
            continue
        expanded = pd.DataFrame(list(users_df[col]), index=users_df.index)
        users_df = users_df.join(expanded).drop(col, axis=1)

    return users_df

def append_mentions_page(mentions, url, parameters):
    """
    Request one page from the twitter API /2/users/:id/mentions
    endpoint, merge it into `mentions` and prepare `parameters`
    for the following page.

    Parameters
    ----------
    mentions : dict
        Concatenated data from previous API calls.
    url : str
        Twitter API URL
        ('https://api.twitter.com/2/users/:id/mentions')
    parameters : dict
        API call parameters, which may include a
        'pagination_token' key. That key is set in place when
        the response 'meta' carries a 'next_token'.

    Returns
    -------
    appended_mentions : dict
        Result of `tw.concat_json_pages(mentions, response)`.
    is_new_page : bool
        True if there is another page after the current one,
        and False otherwise.
    """

    page = tw.request_twitter_api(url, parameters)
    combined = tw.concat_json_pages(mentions, page)

    # A 'next_token' in the page metadata signals that more results exist:
    page_meta = page['meta']
    has_next = 'next_token' in page_meta
    if has_next:
        parameters['pagination_token'] = page_meta['next_token']

    return combined, has_next

def mentions_to_df(mentions, user_id):
    """
    Parse JSON structure containing twitter mentions to a user
    into a DataFrame.

    Parameters
    ----------
    mentions : dict
        Twitter API response to /2/users/:id/mentions endpoint.
        Must contain 'data' (list of tweets) and
        'includes' -> 'users' (with 'id', 'name', 'username').
    user_id : str or int
        Twitter ID of the mentioned user.

    Returns
    -------
    mentions_df : DataFrame
        One row per tweet in `mentions['data']`, plus the columns
        'author_name', 'author_username', 'tweet_url',
        'direct_reply' (1 if the tweet replies to `user_id`,
        0 otherwise) and, when 'entities' is available,
        'n_mentions' (number of users mentioned in the tweet).

    Raises
    ------
    AssertionError
        If joining the author information changes the number
        of rows.
    """

    def count_mentioned_users(entities):
        # Rows without entities come out of the DataFrame as NaN, and
        # entities may lack the 'mentions' key; both count as zero.
        if isinstance(entities, dict):
            return len(entities.get('mentions', []))
        return 0

    # Build DataFrame of tweets mentioning the user:
    mentions_df = pd.DataFrame(mentions['data'])
    n_mentions = len(mentions_df)

    # Add the 'in reply...' column if missing (this seems to happen when
    # none of the mentions is a reply). `np.nan` is used because the
    # `np.NaN` alias was removed in NumPy 2.0:
    if 'in_reply_to_user_id' not in mentions_df.columns:
        mentions_df['in_reply_to_user_id'] = np.nan

    # Join information about the author of each mention:
    participants_df = pd.DataFrame(mentions['includes']['users'])[['id', 'name', 'username']].drop_duplicates()
    author_fields = {'name': 'author_name', 'username': 'author_username'}
    mentions_df = mentions_df.join(participants_df.rename(author_fields, axis=1).set_index('id'), on='author_id')
    # Explicit raise so the sanity check survives `python -O`:
    if len(mentions_df) != n_mentions:
        raise AssertionError('Author info join increased number of mentions. This is wrong.')

    # Add link to the tweet:
    mentions_df['tweet_url'] = 'https://www.twitter.com/' + mentions_df['author_username'] + '/status/' + mentions_df['id'].astype(str)

    # Parse date (if present):
    if 'created_at' in mentions_df.columns:
        mentions_df['created_at'] = pd.to_datetime(mentions_df['created_at'])

    # Count the number of users mentioned:
    if 'entities' in mentions_df.columns:
        mentions_df['n_mentions'] = mentions_df['entities'].apply(count_mentioned_users)

    # Flag direct replies to the target user:
    mentions_df['direct_reply'] = (mentions_df['in_reply_to_user_id'] == str(user_id)).astype(int)

    return mentions_df

def parse_utc_time(time_in, time_fmt='%Y-%m-%dT%H:%M:%S', bsb2utc=True):
    """
    Parse a (possibly) local time into UTC time.

    Parameters
    ----------
    time_in : str or datetime
        Time to parse to UTC datetime.
    time_fmt : str
        If `time_in` is str, this is used to parse it to datetime.
    bsb2utc : bool
        Whether to assume `time_in` is Brasilia local time and
        convert it to UTC by adding 3 hours.

    Returns
    -------
    time_utc : datetime
        Time in UTC (assuming `time_in` is UTC and `bsb2utc` is
        falsy; or `time_in` is UTC-3 and `bsb2utc` is truthy).
        No timezone information is attached by this function.
    """
    # Parse str to datetime. `isinstance` also accepts str subclasses,
    # which `type(...) is str` would wrongly treat as datetimes:
    if isinstance(time_in, str):
        time_dt = dt.datetime.strptime(time_in, time_fmt)
    else:
        time_dt = time_in

    # Convert Brasilia (UTC-3) to UTC. Truthiness is used so that
    # flags such as numpy.bool_ are honoured (`x is True` is False
    # for them and the shift would be silently skipped):
    if bsb2utc:
        time_dt = time_dt + dt.timedelta(hours=3)

    return time_dt

def get_mentions_in_period(user_id, start_time, end_time, max_pages=20, max_results=100, requests_per_window=450, verbose=True, bsb_time=True):
    """
    Get mentions to specified user within a period of time.

    Parameters
    ----------
    user_id : int
        Twitter user ID to check mentions for.
    start_time : str or datetime
        Beginning of the period in which to look for mentions.
        If str, must be in the format '%Y-%m-%dT%H:%M:%S'.
    end_time : str or datetime
        End of the period in which to look for mentions.
        If str, must be in the format '%Y-%m-%dT%H:%M:%S'.
    max_pages : int
        Maximum number of pages to go through when a paginated
        result is returned. At least one page is always
        requested. Note that the API only checks and returns
        the 800 most recent tweets.
    max_results : int
        Maximum number of results to return in each API call,
        that is, in each page.
    requests_per_window : int
        Maximum number of calls in a 15-minute window allowed
        by the API. Each call for a page is delayed by the
        time given by `tw.compute_sleep_time` to avoid reaching
        this limit.
    verbose : bool
        Whether to print page numbers as going through the
        pagination.
    bsb_time : bool
        Whether `start_time` and `end_time` are given at
        Brasilia local time (UTC-3).

    Returns
    -------
    mentions : dict
        The API response, containing the tweets mentioning the
        user `user_id`, after concatenating the pages.
    """

    stamp_fmt = '%Y-%m-%dT%H:%M:%SZ'

    # Prepare the request (endpoint and fields are hard-coded):
    pause = tw.compute_sleep_time(requests_per_window)
    url = 'https://api.twitter.com/2/users/{}/mentions'.format(user_id)
    start_utc = parse_utc_time(start_time, bsb2utc=bsb_time).strftime(stamp_fmt)
    end_utc = parse_utc_time(end_time, bsb2utc=bsb_time).strftime(stamp_fmt)
    params = {
        'tweet.fields': ['created_at'],
        'expansions': ['author_id', 'in_reply_to_user_id', 'entities.mentions.username'],
        'max_results': max_results,
        'start_time': start_utc,
        'end_time': end_utc,
    }

    # Request the first page unconditionally, then follow the
    # pagination until it ends or `max_pages` is reached:
    mentions = {}
    page_num = 0
    while True:
        time.sleep(pause)
        mentions, more_pages = append_mentions_page(mentions, url, params)
        page_num += 1
        if verbose is True:
            print(page_num, end=' ')
        if more_pages is not True or not (page_num < max_pages):
            break

    return mentions

def compute_time_period(start_time, end_time, time_fmt='%Y-%m-%dT%H:%M:%S'):
    """
    Compute the number of hours (float) elapsed from `start_time`
    to `end_time`.

    Both arguments may be str or datetime; str values are parsed
    with `time_fmt`.
    """
    end_dt = parse_utc_time(end_time, time_fmt)
    start_dt = parse_utc_time(start_time, time_fmt)
    elapsed = end_dt - start_dt
    return elapsed.total_seconds() / 3600

def capture_stats(mentions, start_time, end_time, time_fmt='%Y-%m-%dT%H:%M:%S', max_mentions=800):
    """
    Summarize the response of an API mentions request.

    Parameters
    ----------
    mentions : dict
        Response from an API call, as returned by the
        `get_mentions_in_period` function. It must contain at
        least one tweet in 'data'.
    start_time : str or datetime
        Start of the time period requested for the capture
        with `get_mentions_in_period`.
    end_time : str or datetime
        End of the time period requested for the capture
        with `get_mentions_in_period`.
    time_fmt : str
        Format of `start_time` and `end_time`.
    max_mentions : int
        Maximum number of recent tweets returned by the API.
        NOTE THAT THE CAPTURE END TIME SHOULD BE THE CURRENT
        TIME FOR THE STATISTICS TO BE RIGHT.

    Returns
    -------
    n_mentions : int
        Sum of the 'result_count' in the response metadata.
    n_errors : int
        Number of entries under 'errors' (0 if absent).
    time_window : float
        Requested period, in hours.
    collected_time : float
        Hours between the last and the first tweet in 'data'
        (presumably oldest and newest, respectively).
    t_win_weight : float
        `time_window / collected_time` when the capture hit
        `max_mentions`, else 1.0.
    """

    # Request stats:
    n_mentions = np.sum(mentions['meta']['result_count'])
    n_errors = len(mentions['errors']) if 'errors' in mentions.keys() else 0

    # Period that the request was expected to cover:
    time_window = compute_time_period(start_time, end_time, time_fmt)

    # Period actually covered by the returned tweets:
    tweets = mentions['data']
    collected_time = compute_time_period(tweets[-1]['created_at'], tweets[0]['created_at'], time_fmt='%Y-%m-%dT%H:%M:%S.000Z')

    # When the capture is truncated at the API limit, each tweet stands
    # for proportionally more of the requested period:
    t_win_weight = 1.0
    if n_mentions >= max_mentions:
        t_win_weight = time_window / collected_time

    return n_mentions, n_errors, time_window, collected_time, t_win_weight

def get_last_mentions(user_id, last_hours=6, verbose=True):
    """
    Capture mentions to a twitter user in the last couple of hours.

    Parameters
    ----------
    user_id : int
        Twitter user ID to look mentions for.
    last_hours : float
        Number of hours in the past, from current time, to look
        for mentions. Remember that the API returns at most
        800 most recent tweets.
    verbose : bool
        Forwarded to `get_mentions_in_period`.

    Returns
    -------
    mentions_df : DataFrame or None
        Table containing the tweets mentioning `user_id` in the
        `last_hours`, along with the author info and capture
        process stats. None when the response reports a
        'result_count' of 0.
    stats : dict
        Capture summary with keys 'batch_start', 'batch_end',
        'batch_tweets' and 'batch_errors'.
    """

    # The capture window ends now (local clock) and spans `last_hours`:
    batch_end = dt.datetime.now()
    batch_start = batch_end - dt.timedelta(hours=last_hours)

    mentions = get_mentions_in_period(user_id, batch_start, batch_end, verbose=verbose)

    # Nothing captured: report an empty batch.
    if mentions['meta']['result_count'] == 0:
        return None, {'batch_start': batch_start, 'batch_end': batch_end, 'batch_tweets': 0, 'batch_errors': 0}

    # Tweets table, tagged with the capture that produced it:
    m_df = mentions_to_df(mentions, user_id)
    m_df['batch_user'] = user_id
    m_df['batch_start'] = batch_start
    m_df['batch_end'] = batch_end

    # Capture statistics, repeated on every row:
    stat_cols = ['batch_tweets', 'batch_errors', 'target_t_win', 'actual_t_win', 't_win_weight']
    for col, value in zip(stat_cols, capture_stats(mentions, batch_start, batch_end)):
        m_df[col] = value

    stats = {
        'batch_start': batch_start,
        'batch_end': batch_end,
        'batch_tweets': m_df.iloc[0]['batch_tweets'],
        'batch_errors': m_df.iloc[0]['batch_errors'],
    }
    return m_df, stats

def todays_tweet_limit(curr_level, cap_renew_date, tweet_cap=2000000, safety_buffer=100):
    """
    Compute how many tweets should be captured today so that the
    remaining quota lasts, at a constant rate, until the cap
    renew date.

    Parameters
    ----------
    curr_level : int
        Number of tweets already captured.
    cap_renew_date : str
        Date when the usage cap resets, in format '%Y-%m-%d'.
        Check https://developer.twitter.com/en/portal/dashboard.
    tweet_cap : int
        Monthly tweet cap.
        Check https://developer.twitter.com/en/portal/dashboard.
    safety_buffer : int
        Decrement in the number of tweets to be captured per day,
        to avoid errors.

    Returns
    -------
    todays_lim : int
        Maximum number of tweets that should be captured today.

    Raises
    ------
    Exception
        If today is on or after `cap_renew_date`.
    """
    today = dt.date.today()
    renew = dt.date(*[int(part) for part in cap_renew_date.split('-')])

    days_to_renew = (renew - today).days
    if days_to_renew <= 0:
        raise Exception('{} reached cap renew date {}: reset `cap_renew_date` to new date.'.format(today, renew))

    # Spread what is left of the quota evenly over the remaining days:
    remaining_quota = tweet_cap - curr_level
    return int(remaining_quota / days_to_renew - safety_buffer)

def todays_n_cands(curr_level, cap_renew_date, avg_tweets, tweet_cap=2000000, tweets_buffer=100):
    """
    Compute how many candidates can have their mentions captured
    in one day: today's tweet limit divided by the average number
    of tweets per candidate.

    Parameters
    ----------
    curr_level : int
        Number of tweets already captured.
    cap_renew_date : str
        Date when the usage cap resets, in format '%Y-%m-%d'.
        Check https://developer.twitter.com/en/portal/dashboard.
    avg_tweets : float
        Average number of tweets mentioning a candidate in the
        capture period.
    tweet_cap : int
        Monthly tweet cap.
        Check https://developer.twitter.com/en/portal/dashboard.
    tweets_buffer : int
        Decrement in the number of tweets to be captured per day,
        to avoid errors.

    Returns
    -------
    n_cands : int
        Number of candidates that can have their mentions captured.
    """

    daily_tweets = todays_tweet_limit(curr_level, cap_renew_date, tweet_cap, tweets_buffer)
    return int(daily_tweets / avg_tweets)

def batch_n_cands(curr_level, cap_renew_date, avg_tweets, capture_period, tweet_cap=2000000, tweets_buffer=100):
    """
    Compute how many candidates can have their mentions captured
    in a single batch: the daily number of candidates divided by
    the number of batches that fit in 24 hours.

    Parameters
    ----------
    curr_level : int
        Number of tweets already captured.
    cap_renew_date : str
        Date when the usage cap resets, in format '%Y-%m-%d'.
        Check https://developer.twitter.com/en/portal/dashboard.
    avg_tweets : float
        Average number of tweets mentioning a candidate in the
        capture period.
    capture_period : int
        Capture window size for each user, in hours.
    tweet_cap : int
        Monthly tweet cap.
        Check https://developer.twitter.com/en/portal/dashboard.
    tweets_buffer : int
        Decrement in the number of tweets to be captured per day,
        to avoid errors.

    Returns
    -------
    n_cands : int
        Number of candidates that can have their mentions captured.
    """

    cands_per_day = todays_n_cands(curr_level, cap_renew_date, avg_tweets, tweet_cap, tweets_buffer)
    batches_per_day = 24 / capture_period

    return int(cands_per_day / batches_per_day)

def read_config(filename='tweet_capture_config.json'):
    """
    Load and return the JSON content of the file `filename` (str).
    """
    with open(filename, 'r') as config_file:
        return json.load(config_file)

def write_config(config, filename='tweet_capture_config.json'):
    """
    Serialize `config` (dict) as JSON, indented by one space per
    level, into the file `filename` (str), overwriting it.
    """
    with open(filename, 'w') as config_file:
        json.dump(config, config_file, indent=1)

def program_batch_capture(twitter_df, n_cands, previous_df=None, start_time=None, time_fmt='%Y-%m-%dT%H:%M:%S', random_state=None):
    """
    Generate DataFrame with a schedule for capturing data from
    randomly sampled candidates in one batch.

    Parameters
    ----------
    twitter_df : DataFrame
        Table with column 'id' containing all the candidates'
        Twitter IDs.
    n_cands : int
        Number of candidates to randomly select for this
        capture. Sampling is without replacement, so it must not
        exceed the number of IDs left after removing
        `previous_df`.
    previous_df : DataFrame
        Capture list from the previous batch. These user IDs
        are removed from the set before sampling, to avoid
        data overlap.
    start_time : str, datetime or None
        Datetime to schedule the capture to. If str, in format
        given by `time_fmt`. If None, get the current date.
    time_fmt : str
        Format of `start_time`, if provided as str.
    random_state : int or None
        Seed for randomly selecting candidates. Use None for
        random seed.

    Returns
    -------
    df : DataFrame
        One row per sampled candidate, with columns
        'cand_id_pos' (index label of the candidate in
        `twitter_df`), 'id', 'batch_size', 'batch_time',
        'status' (initially 'queue') and the still-empty
        'batch_start', 'batch_end', 'batch_tweets' and
        'batch_errors'.
    """

    # Get current date if needed:
    if start_time is None:
        start_time = dt.datetime.now()
    else:
        start_time = parse_utc_time(start_time, time_fmt, bsb2utc=False)

    # Exclude the IDs captured in the previous batch:
    if previous_df is None:
        no_repeat = twitter_df
    else:
        no_repeat = twitter_df.loc[~twitter_df['id'].isin(previous_df['id'])]

    # Randomly select candidates, keeping their original index label:
    daily_capture_df = no_repeat['id'].sample(n_cands, random_state=random_state).reset_index()
    daily_capture_df.rename({'index': 'cand_id_pos'}, axis=1, inplace=True)

    # Prepare batch information:
    daily_capture_df['batch_size'] = n_cands
    daily_capture_df['batch_time'] = start_time
    daily_capture_df['status'] = 'queue'
    # Placeholders filled in during the capture. `np.nan` replaces the
    # `np.NaN` alias, which was removed in NumPy 2.0.
    # NOTE(review): 'batch_start'/'batch_end' later receive datetimes;
    # newer pandas may reject that on a float column - confirm.
    for pending_col in ['batch_start', 'batch_end', 'batch_tweets', 'batch_errors']:
        daily_capture_df[pending_col] = np.nan

    return daily_capture_df

def print_to_file(error_log, filename):
    """
    Write the text `error_log` (str) to the file `filename` (str),
    replacing any previous content.
    """
    with open(filename, 'w') as log_file:
        log_file.write(error_log)


def gen_mentions_path(data_dir, batch_time, user_id):
    """
    Build the path of the CSV file where the mentions captured
    for a user in a batch are stored:
    '<data_dir><batch_time>/mentions_<batch_time>_<user_id>.csv'.

    Parameters
    ----------
    data_dir : str
        Path to the root data dir, including the trailing
        separator (e.g. 'data/').
    batch_time : str or datetime
        Batch time for identification purposes. If str, in
        format '%Y-%m-%dT%H:%M:%S'.
    user_id : int
        ID of the user mentioned.

    Returns
    -------
    file_path : str
        Filename, including path, where to save captured
        mentions.
    """
    # Normalize the batch time (no timezone shift) to its canonical text form:
    batch_stamp = parse_utc_time(batch_time, bsb2utc=False).strftime('%Y-%m-%dT%H:%M:%S')
    return '{0:}{1:}/mentions_{1:}_{2:}.csv'.format(data_dir, batch_stamp, user_id)

def make_necessary_dirs(filename):
    """
    Create directories in the path to `filename` (str), if necessary.

    Nothing is done when `filename` has no directory component.
    Directories that already exist are left untouched, even if
    they are created by another process in the meantime.
    """
    parent_dir = os.path.dirname(filename)
    # A bare filename has an empty dirname, which `os.makedirs` rejects:
    if parent_dir:
        # `exist_ok=True` covers the race between checking and creating,
        # without relying on the `errno` module (never imported here).
        os.makedirs(parent_dir, exist_ok=True)

def run_batch_capture(batch_time, twitter_df, config, previous_batch=None, verbose=True, no_protected=False):
    """
    Randomly select candidates and capture twitter mentions
    to them.

    Parameters
    ----------
    batch_time : str or datetime
        Approximately the current time, to serve only as
        an identifier of the batch.
    twitter_df : DataFrame
        Table of all twitter IDs, to sample from.
    config : dict
        Capture process configuration. The keys read are
        'curr_level', 'cap_renew_date', 'avg_tweets_per_cand',
        'capture_period', 'log_dir', 'data_dir', 'error_dir'
        and, when present, 'tweet_cap' and 'tweets_buffer'.
    previous_batch : DataFrame or None
        If provided, do not sample IDs present in
        `previous_batch`, to avoid data overlap.
    verbose : bool
        Whether to print capture counts and status.
    no_protected : bool
        Whether to remove protected accounts from sampling.
        I think protected accounts do not return replies,
        but they still return mentions.

    Returns
    -------
    batch_df : DataFrame
        List of sampled IDs, along with information about
        their capture. The same table is saved to
        '<log_dir>capture_<batch_time>.csv' after each ID.
    """

    # Parse datetime to str:
    if isinstance(batch_time, dt.datetime):
        batch_time = batch_time.strftime('%Y-%m-%dT%H:%M:%S')

    # Filter out protected twitter accounts if requested:
    if no_protected is True:
        ids_df = twitter_df.loc[twitter_df['protected'] == False]
    else:
        ids_df = twitter_df

    # Create batch of IDs to capture. The cap and buffer come from the
    # config when available, falling back to the defaults of `batch_n_cands`:
    n_cands = batch_n_cands(
        config['curr_level'],
        config['cap_renew_date'],
        config['avg_tweets_per_cand'],
        config['capture_period'],
        config.get('tweet_cap', 2000000),
        config.get('tweets_buffer', 100),
    )
    batch_df = program_batch_capture(ids_df, n_cands, previous_batch, start_time=batch_time)

    # Log batch data:
    log_path = '{}capture_{}.csv'.format(config['log_dir'], batch_time)
    batch_df.to_csv(log_path, index=False)

    if verbose is True:
        print('  ')

    # Loop over IDs to capture:
    for i in batch_df.index.values:

        user_id = batch_df.loc[i, 'id']
        # Assume failure until the capture completes, so that the status
        # is defined even if the attempt is interrupted:
        status = 'error'
        try:
            # Capture data:
            mentions_df, stats = get_last_mentions(user_id, config['capture_period'], verbose=False)
            # Log capture statistics:
            for name, stat in stats.items():
                batch_df.loc[i, name] = stat
            # Save captured mentions:
            if mentions_df is not None:
                filename = gen_mentions_path(config['data_dir'], batch_time, user_id)
                make_necessary_dirs(filename)
                mentions_df.to_csv(filename, index=False)
            status = 'ok'
        except Exception:
            # Record the error and move on to the next ID. KeyboardInterrupt
            # and SystemExit are not caught, so the batch can be stopped.
            tb = traceback.format_exc()
            print_to_file(tb, '{}{}_i-{:05d}_id-{}.log'.format(config['error_dir'], batch_time, i, user_id))

        finally:
            # Log batch data:
            batch_df.loc[i, 'status'] = status
            batch_df.to_csv(log_path, index=False)
            if verbose is True:
                print('{}: {}'.format(i, status), end=', ')

    print('')
    return batch_df

def sum_batch_tweets(batch_df):
    """
    Return the total (int) of 'batch_tweets' over the rows of
    `batch_df` (DataFrame) whose 'status' is 'ok'.
    """
    succeeded = batch_df['status'] == 'ok'
    ok_counts = batch_df.loc[succeeded, 'batch_tweets']
    return int(ok_counts.sum())

def compute_avg_tweets(batch_df):
    """
    Compute the average number of tweets per candidate in a
    capture batch.

    Only rows of `batch_df` (DataFrame) with 'status' equal to
    'ok' are considered. The result is rounded to 3 decimal
    places and is NaN when there is no such row.

    This function only computes the value; storing it in the
    config (e.g. under 'avg_tweets_per_cand') is up to the
    caller.
    """
    ok_tweets = batch_df.loc[batch_df['status'] == 'ok', 'batch_tweets']
    return np.round(ok_tweets.mean(), 3)

def next_batch_time(capture_period, ini_date='2022-08-12T00:00:00', date_fmt='%Y-%m-%dT%H:%M:%S'):
    """
    Compute the datetime of the next batch from now.

    Batches are scheduled at `ini_date` plus whole multiples of
    `capture_period`; the first of those that is not earlier
    than the current local time is returned.

    Parameters
    ----------
    capture_period : float
        Number of hours between each batch.
    ini_date : str or datetime
        Initial date (if str, in format `date_fmt`), from which
        the following batches are scheduled.
    date_fmt : str
        Format of `ini_date`.

    Returns
    -------
    next_date : datetime
        When to run the next batch capture.

    Raises
    ------
    ValueError
        If `ini_date` is in the past and `capture_period` is
        not positive (the schedule would never reach the
        present).
    """
    # `date_fmt` is forwarded so that custom formats are actually honoured:
    next_date = parse_utc_time(ini_date, date_fmt, bsb2utc=False)
    now_date = dt.datetime.now()

    if next_date < now_date:
        if capture_period <= 0:
            raise ValueError('`capture_period` must be positive, got {}.'.format(capture_period))
        step = dt.timedelta(hours=capture_period)
        while next_date < now_date:
            next_date = next_date + step

    return next_date

def load_saved_mentions(data_dir):
    """
    Load every CSV file found under `data_dir` (searched
    recursively) and stack them into a single DataFrame with a
    fresh integer index. The 'in_reply_to_user_id' column is
    read as str.
    """
    frames = []
    for csv_path in Path(data_dir).rglob('*.csv'):
        frames.append(pd.read_csv(csv_path, dtype={'in_reply_to_user_id': str}))

    return pd.concat(frames, ignore_index=True)

In [3]:
def log_print(string, start=False):
    """
    Print `string` preceded by the current local time
    ('%Y-%m-%d %H:%M:%S'). The line begins with '*' when
    `start` is truthy and with a blank otherwise.
    """
    marker = '*' if start else ' '
    timestamp = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print('{} {}: {}'.format(marker, timestamp, string))

def update_current_level(new_tweets, config):
    """
    Add `new_tweets` (int) to the 'curr_level' entry of `config`
    (dict, changed in place) and save the config with
    `write_config`.
    """
    config['curr_level'] = config['curr_level'] + new_tweets
    write_config(config)

def update_cap_renew_date(config, ini_date):
    """
    Move the cap renew date forward when the next batch falls on
    or after it, and save the config.

    The renew date is advanced by whole months, keeping the day
    of the month of the current renew date (clamped to the last
    day of shorter months), until it is later than the next
    batch time.

    Parameters
    ----------
    config : dict
        Capture configuration. 'cap_renew_date' ('%Y-%m-%d') and
        'capture_period' are read; 'cap_renew_date' is updated
        in place and the config is saved with `write_config`.
    ini_date : str or datetime
        Initial date of the batch schedule, forwarded to
        `next_batch_time`.
    """

    def shift_months(anchor, n_months):
        # Same day of the month `n_months` ahead, clamped to the
        # length of the target month.
        month_index = anchor.month - 1 + n_months
        year = anchor.year + month_index // 12
        month = month_index % 12 + 1
        first_of_following = dt.datetime(year + month // 12, month % 12 + 1, 1)
        last_day = (first_of_following - dt.timedelta(days=1)).day
        return anchor.replace(year=year, month=month, day=min(anchor.day, last_day))

    # Parse cap renew date:
    utc_renew_time = parse_utc_time(config['cap_renew_date'] + 'T00:00:00', bsb2utc=False)
    # Get time of the next batch:
    next_time = next_batch_time(config['capture_period'], ini_date=ini_date)

    # Keep the current renew date unless the next batch reaches it; then
    # advance month by month, always counting from the original date so
    # that the day of the month is not lost after a short month:
    new_renew_time = utc_renew_time
    months_ahead = 0
    while next_time >= new_renew_time:
        months_ahead += 1
        new_renew_time = shift_months(utc_renew_time, months_ahead)

    # Save new cap renew date:
    config['cap_renew_date'] = new_renew_time.strftime('%Y-%m-%d')
    write_config(config)

## Testes

### Arquivos CSV capturados e mal formados

In [22]:
# Load the capture log of the 2022-09-14T18:30:00 batch:
batch_df = pd.read_csv('../tweets/logs/capture/capture_2022-09-14T18:30:00.csv')

In [25]:
# Show the IDs whose capture ended with status "error":
batch_df.query('status == "error"')

Unnamed: 0,cand_id_pos,id,batch_size,batch_time,status,batch_start,batch_end,batch_tweets,batch_errors
600,55,243326392,1158,2022-09-14 18:30:00,error,,,,
893,1553,1541768269668966403,1158,2022-09-14 18:30:00,error,,,,
934,2243,234888981,1158,2022-09-14 18:30:00,error,,,,
951,1548,1557763774492409857,1158,2022-09-14 18:30:00,error,,,,
997,632,1551612003097186305,1158,2022-09-14 18:30:00,error,,,,


In [4]:
# Saved mentions files to be inspected by the reading test that follows
# (presumably files containing carriage returns, given the name - confirm):
carriage_files = ['../tweets/data/2022-08-18T09:30:00/mentions_2022-08-18T09:30:00_762402774260875265.csv',
'../tweets/data/2022-08-18T09:30:00/mentions_2022-08-18T09:30:00_69373037.csv',
'../tweets/data/2022-08-28T12:30:00/mentions_2022-08-28T12:30:00_975127727501185025.csv',
'../tweets/data/2022-08-19T00:30:00/mentions_2022-08-19T00:30:00_74756085.csv',
'../tweets/data/2022-09-01T06:30:00/mentions_2022-09-01T06:30:00_1198004545.csv',
'../tweets/data/2022-09-14T18:30:00/mentions_2022-09-14T18:30:00_52045368.csv',
'../tweets/data/2022-09-18T12:30:00/mentions_2022-09-18T12:30:00_31139434.csv',
'../tweets/data/2022-09-18T12:30:00/mentions_2022-09-18T12:30:00_2319196454.csv',
'../tweets/data/2022-08-29T09:30:00/mentions_2022-08-29T09:30:00_3096479489.csv']

In [6]:
# Try to read each listed file and report its missing values:
for input_file in carriage_files:
    print('**', input_file)
    try:
        # Default parsing first:
        df = pd.read_csv(input_file)
    except pd.errors.ParserError:
        # Fall back to treating only '\n' as the line terminator:
        print('use line terminator')
        df = pd.read_csv(input_file, lineterminator='\n')

    # Report columns with missing values and the dtype of the user ID column:
    xe.checkMissing(df)
    print(df['batch_user'].dtype)

** ../tweets/data/2022-08-18T09:30:00/mentions_2022-08-18T09:30:00_762402774260875265.csv
[1mColunas com valores faltantes:[0m
                coluna    N     %
5  in_reply_to_user_id  9.0  1.13
int64
** ../tweets/data/2022-08-18T09:30:00/mentions_2022-08-18T09:30:00_69373037.csv
[1mColunas com valores faltantes:[0m
                coluna    N     %
2  in_reply_to_user_id  1.0  1.56
int64
** ../tweets/data/2022-08-28T12:30:00/mentions_2022-08-28T12:30:00_975127727501185025.csv
[1mColunas com valores faltantes:[0m
                coluna     N     %
1  in_reply_to_user_id  39.0  8.71
int64
** ../tweets/data/2022-08-19T00:30:00/mentions_2022-08-19T00:30:00_74756085.csv
[1mColunas com valores faltantes:[0m
                coluna     N     %
3  in_reply_to_user_id  23.0  3.06
int64
** ../tweets/data/2022-09-01T06:30:00/mentions_2022-09-01T06:30:00_1198004545.csv
[1mColunas com valores faltantes:[0m
                coluna    N     %
1  in_reply_to_user_id  9.0  2.34
int64
** ../twe

In [58]:
# Report columns with missing values in the last file loaded:
xe.checkMissing(df)

[1mColunas com valores faltantes:[0m
Empty DataFrame
Columns: [coluna, N, %]
Index: []


In [59]:
# Display the last file loaded:
df

Unnamed: 0,author_id,created_at,id,entities,text,in_reply_to_user_id,author_name,author_username,tweet_url,n_mentions,direct_reply,batch_user,batch_start,batch_end,batch_tweets,batch_errors,target_t_win,actual_t_win,t_win_weight,batch_time
0,918592157832892417,2022-09-14 21:43:21+00:00,1570166334457278465,"{'mentions': [{'start': 0, 'end': 16, 'usernam...",@jairmearrependi Em São Paulo vou de @jilmarta...,1053378751189397504,Son of Rob🚩,robsouzza,https://www.twitter.com/robsouzza/status/15701...,3,0,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
1,1529997304714887173,2022-09-14 21:42:00+00:00,1570165997297885185,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial Kkk...,52045368,Jose.ferreira.neto,neto091275,https://www.twitter.com/neto091275/status/1570...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
2,1369439820225916929,2022-09-14 21:23:50+00:00,1570161425208999937,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial htt...,52045368,Edson,Arcanjo68726610,https://www.twitter.com/Arcanjo68726610/status...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
3,252395113,2022-09-14 21:23:50+00:00,1570161424441413632,"{'mentions': [{'start': 0, 'end': 14, 'usernam...",@Amora96757846 @jilmartatto @Haddad_Fernando @...,1412836643858493444,José Bitencourt,VENTOSDAPAZ,https://www.twitter.com/VENTOSDAPAZ/status/157...,4,0,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
4,1534878451102101504,2022-09-14 21:23:48+00:00,1570161417566781458,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial SQN...,52045368,Liberdade🇧🇷,BraLiberdade,https://www.twitter.com/BraLiberdade/status/15...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
5,1369439820225916929,2022-09-14 21:22:49+00:00,1570161170631491584,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial htt...,52045368,Edson,Arcanjo68726610,https://www.twitter.com/Arcanjo68726610/status...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
6,1369439820225916929,2022-09-14 21:22:33+00:00,1570161099739201536,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial Pro...,52045368,Edson,Arcanjo68726610,https://www.twitter.com/Arcanjo68726610/status...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
7,1412836643858493444,2022-09-14 21:18:19+00:00,1570160037204725760,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial É o...,52045368,Amora 🌷🌼🚩,Amora96757846,https://www.twitter.com/Amora96757846/status/1...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
8,54896438,2022-09-14 21:09:59+00:00,1570157937712308225,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial Mal...,52045368,Reinaldo Carrera,ReinaldoCarrera,https://www.twitter.com/ReinaldoCarrera/status...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00
9,1561716279005806594,2022-09-14 20:55:52+00:00,1570154385652473857,"{'mentions': [{'start': 0, 'end': 12, 'usernam...",@jilmartatto @Haddad_Fernando @LulaOficial htt...,52045368,Amigo da Paz!,AmigodaPaz4,https://www.twitter.com/AmigodaPaz4/status/157...,3,1,52045368,2022-09-14 15:50:26.388777,2022-09-14 18:50:26.388777,31,0,3.0,2.801389,1.0,2022-09-14T18:30:00


### Capturas que acusaram erro

In [12]:
import xavy.twitter as xt

In [10]:
# Dá erro porque o usuário não é encontrado:
#url = get_mentions_in_period(1541768269668966403, '2022-09-15T09:30:00', '2022-09-15T12:30:00')

In [13]:
# Identifying an error (for one user within a certain time interval):

# Twitter API v2 mentions endpoint for the user ID being investigated:
url = 'https://api.twitter.com/2/users/1541768269668966403/mentions'

# Query parameters: at most 100 results, restricted to the 3-hour window 
# between `start_time` and `end_time`:
params = {'tweet.fields': ['created_at'],
 'expansions': ['author_id',
  'in_reply_to_user_id',
  'entities.mentions.username'],
 'max_results': 100,
 'start_time': '2022-09-15T12:30:00Z',
 'end_time': '2022-09-15T15:30:00Z'}

# Send the request through the project's API wrapper and keep the response:
mentions = xt.request_twitter_api(url, params)

### Cálculo de tamanho de batches (# de candidatos)

In [4]:
twitter_df = pd.read_csv('../dados/processados/twitter+insta+lagom_ids_deputados_2022.csv')
len(twitter_df) / 3

1158.0

In [5]:
config = read_config('../tweets/tweet_capture_config.json')

In [9]:
batch_n_cands(8075, '2022-10-03', 4, 3, config['tweet_cap'], config['tweets_buffer'])

2071

## Captura

In [8]:
# Previously saved capture log, returned as-is by the stub below:
global_df = pd.read_csv('logs/capture/capture_2022-08-12T17:40:00.csv')

def fake_run_batch_capture(batch_time, twitter_df, config, previous_batch=None, verbose=False, no_protected=False):
    """
    Stub that takes the place of a real batch capture
    (presumably a stand-in for `run_batch_capture` when
    testing the capture loop -- confirm).
    
    All arguments are ignored. The function waits for 
    10 seconds and then returns the module-level 
    `global_df` DataFrame.
    """
    time.sleep(10)
    return global_df

In [10]:
# Continuous capture loop: wait for the next batch time, reload the config 
# and the pool of Twitter IDs, run the batch and update the config with the 
# number of tweets captured.
# The loop is switched off (`while False`), so running this cell only sets 
# the initial state below.
batch_time = '2022-08-14T09:14:00'
batch_df = None
config = read_config()
while False:
    
    # Wait for next batch:
    batch_time = next_batch_time(config['capture_period'], ini_date=batch_time)
    log_print('Next batch at [{}]. Sleeping...'.format(batch_time))
    sleep_time = (batch_time - dt.datetime.now()).total_seconds()
    # `time.sleep` raises ValueError for negative values; if the batch time 
    # is already in the past (e.g. the previous batch overran its period), 
    # start right away instead of crashing the loop:
    time.sleep(max(sleep_time, 0.0))
    
    # Load data and config:
    log_print('Reload config and ID pool!', True)
    config = read_config()
    twitter_df = pd.read_csv(config['twitter_ids_file'])
    n_cands = batch_n_cands(config['curr_level'], config['cap_renew_date'], config['avg_tweets_per_cand'], config['capture_period'], config['tweet_cap'], config['tweets_buffer'])
    config_message = 'Batch config! # cands: {:d}, current level: {:d}, cap renew date: {}, avg. tweets p. cand: {:.3f}, capture period: {:.3f}'
    log_print(config_message.format(n_cands, config['curr_level'], config['cap_renew_date'], config['avg_tweets_per_cand'], config['capture_period']))
    
    # Run next batch (the previous batch result is passed along):
    log_print('Running batch...')
    batch_df = run_batch_capture(batch_time, twitter_df, config, batch_df)
    tot_tweets = sum_batch_tweets(batch_df)
    avg_tweets = compute_avg_tweets(batch_df)
    log_print('Finished batch! Tweets captured: {:d}, Avg tweets: {:.3f}'.format(tot_tweets, avg_tweets))
    
    # Update config:
    update_current_level(tot_tweets, config)
    update_cap_renew_date(config, batch_time)
    log_print('Updated config! current level: {:d}, cap renew date: {}'.format(config['curr_level'], config['cap_renew_date']))

## Testes

In [None]:
# Load the captured data (from the directory set in the config):
result_df = load_saved_mentions(config['data_dir'])

In [149]:
# Add the model score to the data:
# NOTE(review): `sw` is the alias of `speechwrapper`, whose import is 
# commented out in the imports cell at the top of this notebook; it has
# to be imported before this cell can run.
model  = sw.HateSpeechModel('../modelos/bertimbau-hatespeech-v01')
y_pred = model.predict_proba(result_df['text'])
# Store the predictions aligned with the rows of `result_df`:
result_df['hate_score'] = pd.Series(y_pred, index=result_df.index)

Loading tokenizer from neuralmind/bert-base-portuguese-cased
Loading trained model: ../modelos/bertimbau-hatespeech-v01


2022-08-13 11:08:47.970692: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-08-13 11:08:47.970737: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: glitterbu
2022-08-13 11:08:47.970746: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: glitterbu
2022-08-13 11:08:47.970881: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 390.154.0
2022-08-13 11:08:47.970909: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 390.154.0
2022-08-13 11:08:47.970917: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 390.154.0
2022-08-13 11:08:47.991092: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critic

In [158]:
# Print captured mentions of a target user:
username = 'carteiroreaca'
# Twitter ID of the first row in `twitter_df` with this username:
target_id = twitter_df.query('username == "{}"'.format(username))['id'].iloc[0]
# Text of the captured tweets whose 'batch_user' is the target ID:
xd.print_string_series(result_df.loc[target_id == result_df['batch_user'], 'text'])

[1m1074[0m: @carteiroreaca @TSEjusbr Cadê as agências de checagem?? A mentira tá no topo do Twitter
[1m1075[0m: @carteiroreaca @TSEjusbr NÃO EXISTE PERSEGUIÇÃO AOS DOIS LADOS AMIGO.
[1m1076[0m: @carteiroreaca @g_garc2 Aff... 45 não, 24 + 21.
[1m1077[0m: @carteiroreaca Ãiiiiinnnn
[1m1078[0m: @carteiroreaca Deus nos livra desse encosto. PSDBOSTA e PTOTAL já eram.
[1m1079[0m: @carteiroreaca @g_garc2 pelo amor de Deus não mostra esse ser.....
[1m1080[0m: @carteiroreaca Por favor,veja se consegue reverter o uso das máscaras,ainda em uso em São Paulo
[1m1081[0m: @filgmartin @carteiroreaca Jornalismo virou sinônimo de gente preguiçosa e preconceituosa. Matérias sobre os mais diversos assuntos são escritas sem tirar a bunda da cadeira e sem verificar nada. Se tornou algo comum em veículos de grande circulação se deparar com erros grosseiros de português.
[1m1082[0m: @carteiroreaca Jamais levará 1 voto de minha família
[1m1083[0m: @carteiroreaca Aqui em rio claro sp, etanol 

In [140]:
# Print batch pages:
# Attach to each batch row a profile URL built from the username that 
# `twitter_df` lists for that 'id':
check_df = batch_df.join('https://twitter.com/' + twitter_df.set_index('id')['username'], on = 'id')
# Show 20 randomly chosen URLs, each labelled by its 'batch_tweets' value:
xd.print_string_series(check_df.set_index('batch_tweets')['username'].sample(20))

[1m0[0m: https://twitter.com/charlesschulle
[1m3[0m: https://twitter.com/FadiFaraj
[1m0[0m: https://twitter.com/DeleyFlu
[1m0[0m: https://twitter.com/enf_priferraz
[1m1[0m: https://twitter.com/mauro_bfilho
[1m0[0m: https://twitter.com/RicattoEduardo
[1m0[0m: https://twitter.com/doutor_vicente
[1m9[0m: https://twitter.com/lucianagenro
[1m0[0m: https://twitter.com/proclaudete
[1m0[0m: https://twitter.com/ReginauroSousa
[1m0[0m: https://twitter.com/LuisFernandoEP1
[1m0[0m: https://twitter.com/caeetanocostaa
[1m0[0m: https://twitter.com/RicardoLaurenti
[1m0[0m: https://twitter.com/rodrigo_pt13
[1m0[0m: https://twitter.com/EvandroLeitao
[1m1[0m: https://twitter.com/MacaeEvaristo
[1m0[0m: https://twitter.com/prof_thaysa
[1m0[0m: https://twitter.com/MarinhoSilva30
[1m0[0m: https://twitter.com/rodrigomarcial_
[1m0[0m: https://twitter.com/vivianemourapi


## Testando a API v2

In [43]:
response = requests.get('https://api.twitter.com/2/users/1001251931812220928/timelines/reverse_chronological')

In [39]:
response = tw.request_twitter_api('https://api.twitter.com/2/users/1001251931812220928/timelines/reverse_chronological', {})

Exception: Request failed (403): Forbidden.

In [4]:
url = 'https://api.twitter.com/2/users/1001251931812220928/mentions'
params = {'max_results': 10, 'expansions':['author_id','in_reply_to_user_id']}
mentions = tw.request_twitter_api(url, params)

In [5]:
mentions

{'data': [{'text': '@DelyLeo @tabataamaralsp Inspira a direita a destruír direitos.',
   'in_reply_to_user_id': '107233534',
   'id': '1561726165961801729',
   'author_id': '2606479004'},
  {'text': '@bbgenerico @tabataamaralsp Profissão dela é apoiadora do lemann um bilionário que defende a volta da escravidão',
   'in_reply_to_user_id': '1134083128132349953',
   'id': '1561726013331087361',
   'author_id': '2606479004'},
  {'text': '@ForensicsPeter @LucarsHenrique @tabataamaralsp Melhor que essa Deputada que defende um bandido como lemann. Diz que veio da pobreza mas defende bilionários. Vergonhoso',
   'in_reply_to_user_id': '1027361952056700928',
   'id': '1561725810779766787',
   'author_id': '2606479004'},
  {'text': '@marquimlusival @tabataamaralsp Ela votou contra as mulheres, votou pela privatização dos Correios causando demissão de 90 mil trabalhadores incluindo mulheres e TB votou pela privatização das universidades onde mulheres não vão poder estudar já que pobre não pode p

In [20]:
tw.tweets_lookup([1554203624733024256])

[{'created_at': 'Mon Aug 01 20:33:14 +0000 2022',
  'id': 1554203624733024256,
  'id_str': '1554203624733024256',
  'full_text': '@Tali_Mito22 @tabataamaralsp ELA SO FALA EM CORRUPÇAO, MAS NO FUNDO E DA MESMA PANELA, E CAIU NOS BRAÇOS DO MAIOR CORRUPTO DESDE BRASIL, MESU PARABENS SUA IDIOTA PERFEITA, JA COMEÇE A ARRUMAR OUTRA COISA PARA FAZER, POIS O BRASIL VAI VOTAR EM #BolsonaroReeleitoEm2022',
  'truncated': False,
  'display_text_range': [29, 267],
  'entities': {'hashtags': [{'text': 'BolsonaroReeleitoEm2022',
     'indices': [243, 267]}],
   'symbols': [],
   'user_mentions': [{'screen_name': 'Tali_Mito22',
     'name': '💚💛 Mulher Patriota 💚💛🇧🇷',
     'id': 1505670103723130883,
     'id_str': '1505670103723130883',
     'indices': [0, 12]},
    {'screen_name': 'tabataamaralsp',
     'name': 'Tabata Amaral',
     'id': 1001251931812220928,
     'id_str': '1001251931812220928',
     'indices': [13, 28]}],
   'urls': []},
  'source': '<a href="https://mobile.twitter.com" rel="nofoll

In [19]:
mentions

{'data': [{'author_id': '587125018',
   'id': '1554205332347437056',
   'text': '@tabataamaralsp De onde sai o dinheiro, tia?.'},
  {'author_id': '1537512611406925828',
   'id': '1554204349873676311',
   'text': '@Tali_Mito22 @tabataamaralsp Um ladrão, uma patrocinada por milionário e a CARCEREIRA'},
  {'author_id': '4098368541',
   'id': '1554203624733024256',
   'text': '@Tali_Mito22 @tabataamaralsp ELA SO FALA EM CORRUPÇAO, MAS NO FUNDO E DA MESMA PANELA, E CAIU NOS BRAÇOS DO MAIOR CORRUPTO DESDE BRASIL, MESU PARABENS SUA IDIOTA PERFEITA, JA COMEÇE A ARRUMAR OUTRA COISA PARA FAZER, POIS O BRASIL VAI VOTAR EM #BolsonaroReeleitoEm2022'},
  {'author_id': '1422787253189976064',
   'id': '1554202593156632578',
   'text': '@tabataamaralsp Não faz sentido.'},
  {'author_id': '57023112',
   'id': '1554202528576929794',
   'text': '@tabataamaralsp Isso já existia e se chamava BOLSA ESCOLA, que o chefe de quadrilha que vc apóia acabou.'},
  {'author_id': '1232746374577446913',
   'id': '15542

## Testando a API v1

In [9]:
tweets[0]['user']

{'id': 1001251931812220928, 'id_str': '1001251931812220928'}

In [2]:
tweets = tw.get_timeline('tabataamaralsp')

In [4]:
reply_ids = [1554127614444462084, 1554092652320129024, 1554082621428318215, 1554104674214506496]

In [5]:
replies = tw.tweets_lookup(reply_ids)

In [9]:
tw.tweets_lookup([1554194704270606339])

[{'created_at': 'Mon Aug 01 19:57:47 +0000 2022',
  'id': 1554194704270606339,
  'id_str': '1554194704270606339',
  'full_text': '@tabataamaralsp O Estado não pode ser responsável pelos pais que colocam filhos no mundo e não consegue dar o mínimo de “educação” o que é diferente de escolaridade.',
  'truncated': False,
  'display_text_range': [16, 165],
  'entities': {'hashtags': [],
   'symbols': [],
   'user_mentions': [{'screen_name': 'tabataamaralsp',
     'name': 'Tabata Amaral',
     'id': 1001251931812220928,
     'id_str': '1001251931812220928',
     'indices': [0, 15]}],
   'urls': []},
  'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
  'in_reply_to_status_id': 1554145509761339398,
  'in_reply_to_status_id_str': '1554145509761339398',
  'in_reply_to_user_id': 1001251931812220928,
  'in_reply_to_user_id_str': '1001251931812220928',
  'in_reply_to_screen_name': 'tabataamaralsp',
  'user': {'id': 1571291780,
   'id_str': '1571291780

## Lixo

### Carregando dados de redes

In [6]:
# Load TSE data and extract twitter usernames:
redes20_df = pd.read_csv('../dados/processados/redes_candidatos_2022.csv')
twitter_df = redes20_df.query('twitter == 1')[['SQ_CANDIDATO', 'DS_URL']].copy()
twitter_df['twitter_username'] = extract_twitter_username(twitter_df['DS_URL'])

In [9]:
# Down-sample to test functions:
test_twitter_df = twitter_df.sample(20, random_state=56236101)

In [51]:
response_df = request_twitter_user_info(test_twitter_df)
# Save results:
#response_df.to_json('../dados/brutos/twitter/cand_twitter_user_info_TESTE.json', force_ascii=False, orient='records')

### Teste de concatenação de JSONs

In [250]:
import xavy.test as xtest

# Test cases for `tw.concat_json_pages`. Each tuple in `cin` is a pair of 
# inputs and `cout` holds, at the same position, the matching expected 
# output (presumably compared one by one by `multi_test_function` -- 
# TODO confirm against xavy.test).
cin = [(1,2), ([1,2], [3,4]), ('a', 'b'), (['a', 'b'], ['c', 'd']), ({'a': 1}, {'a': 2}), ({'a': 1}, {'b': 2}), ({'a': 1, 'b':0}, {'b': 2}),
       ({'a': [1, 2], 'b':[0,1]}, {'b': 2}), ([{'a':1}, {'b':2}], [{'c':3}, {'d':4}]), ({'a':{'A': 1, 'B': 2}}, {'a':{'A': 2, 'B': 3}}),
       ({'a':{'A': 1, 'B': [0,1,2]}, 'b':[1,2,3]}, {'a':{'A': [2,3,4], 'B': 3}})]
cout = [[1,2], [1,2,3,4], ['a', 'b'], ['a', 'b', 'c', 'd'], {'a':[1,2]}, {'a':1, 'b': 2}, {'a':1, 'b': [0,2]}, 
        {'a':[1, 2], 'b': [0,1,2]}, [{'a':1}, {'b':2}, {'c':3}, {'d':4}], {'a':{'A':[1,2], 'B':[2,3]}}, 
        {'a':{'A':[1,2,3,4], 'B':[0,1,2,3]}, 'b':[1,2,3]}]
xtest.multi_test_function(tw.concat_json_pages, cin, cout)

### Funções velhas

In [173]:
def concat_mentions(mentions1, mentions2):
    """
    Merge two responses from Twitter API's
    /2/users/:id/mentions endpoint into one.
    
    Each known top-level field ('data', 
    'includes', 'errors' and 'meta') is returned
    as a list: the content from `mentions1` 
    followed by the content from `mentions2`.
    A field whose value is not a list is wrapped
    in a one-element list first, and a field 
    missing from a response counts as an empty
    list. The inputs are not modified.
    
    Raises an AssertionError if either response
    has a top-level field other than the four
    listed above.
    
    Returns a dict.
    """
    
    def as_list(value):
        # Wrap anything that is not exactly a list, so it can be concatenated:
        return value if type(value) is list else [value]
    
    fields = ['data', 'includes', 'errors', 'meta']
    
    # Refuse responses with fields we do not know how to merge:
    extra1 = set(mentions1.keys()).difference(fields)
    assert not extra1, 'Found unknown keys {} in mentions1'.format(extra1)
    extra2 = set(mentions2.keys()).difference(fields)
    assert not extra2, 'Found unknown keys {} in mentions2'.format(extra2)
    
    # Join the two responses field by field, in the order of `fields`:
    return {f: as_list(mentions1.get(f, [])) + as_list(mentions2.get(f, [])) for f in fields}

In [346]:
def program_day_capture(twitter_df, n_cands, capture_period, date=None, hour_offset='00:10:00', random_state=None):
    """
    Build a one-day capture schedule: randomly pick candidates,
    split them into consecutive batches and assign to each batch
    the time it should be captured.
    
    Parameters
    ----------
    twitter_df : DataFrame
        Table with column 'id' containing all the candidates'
        Twitter IDs.
    n_cands : int
        Number of candidates to randomly select for the day's
        capture.
    capture_period : int or float
        Time between consecutive batches, in hours. It also 
        sets the batch size, computed from the number of 
        periods that fit in 24 hours.
    date : str or None
        Date to schedule the capture on, in format '%Y-%m-%d'.
        If None, use the current date.
    hour_offset : str
        Time of the first batch of the day, in format 
        '%H:%M:%S'.
    random_state : int or None
        Seed for randomly selecting candidates. Use None for 
        random seed.
    
    Returns
    -------
    df : DataFrame
        One row per selected candidate, with the candidate's 
        Twitter ID ('user_id'), its batch number ('batch'), 
        the number of candidates in that batch ('batch_size'),
        the batch capture time ('batch_time') and a 'status' 
        column set to 'queue'.
    """
    
    # Fall back to today's date:
    day = dt.date.today().strftime('%Y-%m-%d') if date is None else date
    
    # Moment of the first batch of the day:
    first_batch_time = parse_utc_time(day + 'T' + hour_offset, bsb2utc=False)
    
    # Draw the candidates, keeping their original row labels as a column:
    sampled_ids = twitter_df['id'].sample(n_cands, random_state=random_state)
    schedule_df = sampled_ids.reset_index().rename({'index':'cand_id_pos', 'id':'user_id'}, axis=1)
    
    # Number the batches: consecutive candidates share a batch:
    cands_per_batch = int(n_cands / (24 / capture_period) + 1)
    schedule_df['batch'] = (np.arange(n_cands) / cands_per_batch).astype(int)
    
    # Attach the number of candidates in each batch:
    batch_size = schedule_df['batch'].value_counts().rename('batch_size')
    schedule_df = schedule_df.join(batch_size, on='batch')
    
    # Each batch runs `capture_period` hours after the previous one:
    batch_delay = schedule_df['batch'] * dt.timedelta(hours=capture_period)
    schedule_df['batch_time'] = first_batch_time + batch_delay
    
    # Every candidate starts waiting to be captured:
    schedule_df['status'] = 'queue'
    
    return schedule_df

Traceback (most recent call last):
  File "/tmp/ipykernel_27054/1827313366.py", line 2, in <cell line: 1>
    raise ValueError
ValueError

