In [1]:
import requests

import pandas as pd

# Data Acquisition Exercises

## 1

Using the code from the lesson as a guide and the REST API from https://python.zgulde.net/api/v1/items as we did in the lesson, create a dataframe named items that has all of the data for items.

In [None]:
# Base address of the API; later cells append an endpoint path to it.
# NOTE(review): the exercise text cites python.zgulde.net while this notebook
# targets api.data.codeup.com — confirm both hosts serve the same API.
url = 'https://api.data.codeup.com'
endpoint = '/documentation'

# Fetch the documentation resource and show the text stored under 'payload'.
response = requests.get(f'{url}{endpoint}')
documentation = response.json()
print(documentation['payload'])

In [None]:
# Request the first page of items (relies on `url` from the cell above)
# and look at the top-level keys of the decoded JSON body.
endpoint = '/api/v1/items'
response = requests.get(f'{url}{endpoint}')
data = response.json()
data.keys()

In [None]:
# Pull the 'payload' object out of the response and list the keys it holds.
payload = data['payload']
payload.keys()

In [None]:
# Pagination pointer of this page; get_data keeps following this value until it is falsy.
payload['next_page']

In [None]:
def get_data(url, endpoint, name, timeout = 30):
    """
    Download every page of a paginated API resource into a single dataframe.

    Parameters
    ----------
    url : str
        Base address of the API, e.g. 'https://api.data.codeup.com'.
    endpoint : str
        Path of the first page to request; it is appended to `url`.
    name : str
        Key inside each response's 'payload' object that holds that
        page's records.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 30),
        so a stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The records of every page, in the order fetched, with a fresh
        0..n-1 index.

    Raises
    ------
    requests.HTTPError
        If any page answers with a 4xx/5xx status.
    """
    pages = []

    while True:
        print(f'Reading page {endpoint}', end = '\r')
        response = requests.get(url + endpoint, timeout = timeout)
        # Fail loudly on an error status instead of tripping over a missing
        # 'payload' key (or undecodable body) a few lines further down.
        response.raise_for_status()
        payload = response.json()['payload']
        pages.append(pd.DataFrame(payload[name]))

        # The API signals the last page with a falsy 'next_page'.
        if not (next_page := payload['next_page']):
            break

        endpoint = next_page

    # Concatenate once: growing a frame inside the loop re-copies every row
    # already fetched on each page. ignore_index also renumbers the rows
    # without colliding with a payload column that happens to be named 'index'.
    data = pd.concat(pages, ignore_index = True)
    print('Loading complete. Returning data.')

    return data

In [None]:
# Pull every page of the items resource, then check (rows, columns).
items = get_data(
    url = 'https://api.data.codeup.com',
    endpoint = '/api/v1/items',
    name = 'items',
)
items.shape

## 2

Do the same thing, but for stores (https://python.zgulde.net/api/v1/stores)

In [None]:
# Use a dedicated name for the full stores address so the base-address
# variable `url` from the earlier cells is not overwritten; rebinding it to a
# complete endpoint URL would break any re-run of the `url + endpoint` cells.
stores_url = 'https://api.data.codeup.com/api/v1/stores'
data = requests.get(stores_url).json()
data.keys()

In [None]:
# List the keys inside the 'payload' object of the stores response fetched in the previous cell.
data['payload'].keys()

In [None]:
# Read the payload's 'max_page' value — presumably the total number of pages; confirm against the API documentation.
data['payload']['max_page']

In [None]:
# Pull every page of the stores resource, then check (rows, columns).
stores = get_data(
    url = 'https://api.data.codeup.com',
    endpoint = '/api/v1/stores',
    name = 'stores',
)
stores.shape

## 3

Extract the data for sales (https://python.zgulde.net/api/v1/sales). There are a lot of pages of data here, so your code will need to be a little more complex. Your code should continue fetching data from the next page until all of the data is extracted.

In [None]:
# Reuse the paginating helper for sales; it keeps requesting pages until the
# API stops returning a next page.
sales = get_data(
    url = 'https://api.data.codeup.com',
    endpoint = '/api/v1/sales',
    name = 'sales',
)

In [None]:
# (rows, columns) of the sales dataframe loaded in the previous cell.
sales.shape

## 4

Save the data in your dataframes to local CSV files so that it will be faster to access in the future.

In [None]:
# Write each dataframe to its own CSV in the working directory;
# index = False keeps the row index out of the files.
for _frame, _csv_path in (
    (items, 'items.csv'),
    (stores, 'stores.csv'),
    (sales, 'sales.csv'),
):
    _frame.to_csv(_csv_path, index = False)

## 5

Combine the data from your three separate dataframes into one large dataframe.

In [None]:
# Summarise the items columns (names, non-null counts, dtypes) ahead of the merge below.
items.info()

In [None]:
# Summarise the stores columns (names, non-null counts, dtypes) ahead of the merge below.
stores.info()

In [None]:
# Summarise the sales columns (names, non-null counts, dtypes) ahead of the merge below.
sales.info()

In [None]:
# Join keys: sales.item -> items.item_id and sales.store -> stores.store_id.
# validate = 'm:1' has pandas check that each key is unique on the lookup side.
# The right-hand key columns are dropped after each join because they
# duplicate the sales-side key.
df = (
    sales
    .merge(items, how = 'inner', left_on = 'item', right_on = 'item_id', validate = 'm:1')
    .drop(columns = 'item_id')
    .merge(stores, how = 'inner', left_on = 'store', right_on = 'store_id', validate = 'm:1')
    .drop(columns = 'store_id')
)
df.info()

## 6

Acquire the Open Power Systems Data for Germany, which has been rapidly expanding its renewable energy production in recent years. The data set includes country-wide totals of electricity consumption, wind power production, and solar power production for 2006-2017. You can get the data here: https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv

In [None]:
# Stored under its own name so the merged sales/items/stores frame `df`
# built in section 5 is not silently replaced by an unrelated dataset.
opsd_germany = pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')
opsd_germany.info()

## 7

Make sure all the work that you have done above is reproducible. That is, you should put the code above into separate functions in the acquire.py file and be able to re-run the functions and get the same data.

In [2]:
# Import only the helpers the cells below call: a wildcard import hides where
# names come from and could silently replace the `get_data` defined earlier
# in this notebook.
from acquire import get_open_power_systems_data, load_data

In [None]:
# Smoke-check the acquire.py helper — presumably it returns the OPSD Germany dataframe; verify in acquire.py.
get_open_power_systems_data().info()

In [9]:
# use_cache = False: judging by the 'Reading from API.' line in the output below, this presumably
# skips any local copy and re-downloads the items — confirm in acquire.py.
load_data('items', use_cache = False).info()

Reading from API.
Loading complete. Returning data.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   item_brand  50 non-null     object 
 1   item_id     50 non-null     int64  
 2   item_name   50 non-null     object 
 3   item_price  50 non-null     float64
 4   item_upc12  50 non-null     object 
 5   item_upc14  50 non-null     object 
dtypes: float64(1), int64(1), object(4)
memory usage: 2.5+ KB
