In [1]:
import numpy as np
import pandas as pd
import os.path
import re

In [2]:
DATA_DIR = '../data'

In [3]:
def find_newest_file(name):
    """
    Assuming that the files will be in the form of :
    yyyy-mm-dd-type_of_file.xz we can try to find the newest file
    based on the date, but if the file doesn't exist fallback to another
    date until all dates are exhausted
    """
    date_regex = re.compile('\d{4}-\d{2}-\d{2}')

    matches = (date_regex.findall(f) for f in os.listdir(DATA_DIR))
    dates = sorted(set([l[0] for l in matches if l]), reverse=True)
    for date in dates:
        filename = os.path.join(DATA_DIR, '{}-{}.xz'.format(date, name))
        if os.path.isfile(filename):
            return filename
    return None

In [11]:
data = pd.read_csv(find_newest_file('foursquare-companies'), dtype={'clean_cnpj': np.str})

In [12]:
# How many companies were fetched?
len(data['cnpj'].unique())

1300

In [6]:
# How many companies have price data?
len(data['price.tier'].dropna())

942

In [7]:
# How are those prices distributed?
data['price.message'].value_counts()

Moderate          536
Cheap             187
Expensive         186
Very Expensive     33
Name: price.message, dtype: int64

In [8]:
# How long did it take to fetch the first 1000 companies?
print(data.iloc[1000]['scraped_at'])
print(data.iloc[0]['scraped_at'])

2016-11-29T03:29:43.145063
2016-11-29T02:49:19.725351


In [9]:
# Loading reimbursements dataset to compare how much data we're missing from meal_reimbursements.
reimbursements = pd.read_csv(find_newest_file('reimbursements'), dtype={'cnpj_cpf': np.str})
meals = reimbursements[reimbursements['subquota_description'] == 'Congressperson meal']

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# How many companies are left to be fetched?
remaining = meals[~meals['cnpj_cpf'].isin(data['cnpj'])]
len(meals[~meals['cnpj_cpf'].isin(data['clean_cnpj'])]['cnpj_cpf'].unique())

18891