# Anneda Rong (aar2dk) 
# University of Virginia
# DS 3002 - Professor Jon Tupitza
# ETL Pipeline Project - March 2022

### Imports

In [1]:
import os
import numpy
import pandas as pd
import pymysql
from sqlalchemy import create_engine
import requests
import json
from pandas.io.json import json_normalize

### Constants

In [2]:
# Request headers sent with every Tasty (RapidAPI) call made by api_call().
# The API key may be supplied through the RAPIDAPI_KEY environment variable so
# that it does not have to live in the notebook; the original literal is kept
# only as a fallback so existing runs behave exactly as before.
# NOTE(review): the fallback is a committed credential -- consider rotating it.
headers = {
    'x-rapidapi-host': "tasty.p.rapidapi.com",
    'x-rapidapi-key': os.environ.get(
        "RAPIDAPI_KEY", "7309b85eeamsh035b88dbc10b3bep18e452jsn850c209f1f60"
    ),
}

# Root URL that api_call() prefixes to every endpoint path.
base_url = "https://tasty.p.rapidapi.com"
# Key under which the recipe list sits in the JSON response (passed to
# construct_dataframe further down).
recipe_data_key = 'results'
# Not referenced anywhere in this section of the notebook.
tags_data_key = 'results'

## Extract

### API Call Function

In [3]:
def api_call(url: str, params: dict):
    '''
    Send a GET request to the API and return the raw response body.

    Parameters:
    url : str
        Endpoint path that is appended to the module-level base_url.
    params : dict
        Query-string parameters forwarded to requests.

    Return:
    str
        The response body as text (response.text).

    Raises:
    Exception
        If the response status code is not 200.
    '''
    api_request = base_url + url
    response = requests.request("GET", api_request, headers=headers, params=params)
    # The status check used to live in a nested helper that was never called,
    # so error responses were handed back to the caller as if they were data.
    if response.status_code != 200:
        raise Exception(f"Response was invalid: {response.status_code}")
    return response.text

In [4]:
# This cell re-declares the API constants already defined near the top of the
# notebook; the values are the same.
base_url = "https://tasty.p.rapidapi.com"

# The API key may be supplied through the RAPIDAPI_KEY environment variable;
# the original literal is kept only as a fallback so existing runs behave the
# same.
# NOTE(review): the fallback is a committed credential -- consider rotating it.
headers = {
    'x-rapidapi-host': "tasty.p.rapidapi.com",
    'x-rapidapi-key': os.environ.get(
        "RAPIDAPI_KEY", "7309b85eeamsh035b88dbc10b3bep18e452jsn850c209f1f60"
    ),
}

### API Calls / Extraction of Data

#### All Recipes

In [5]:
# Endpoint path for the recipe listing; api_call() prepends base_url to it.
url = "/recipes/list"
queryparams = {"from":"0","size":"40"} # paging: "from" = records to skip, "size" = records to take

# Raw response text of the request; it is parsed into a DataFrame later by
# construct_dataframe().
all_recipes = api_call(url, queryparams)

## Transform

#### Transform Data Functions

In [6]:
def construct_dataframe(json_data: str, data_key: str):
    '''
    Parse a JSON document and flatten the records stored under one key.

    Parameters:
    json_data : str
        JSON text whose top level is an object.
    data_key : str
        Key of the top-level entry holding the records to flatten.

    Return:
    Pandas.DataFrame
        One row per record; nested objects become dotted column names
        (the behaviour of pd.json_normalize).
    '''
    records = json.loads(json_data)[data_key]
    return pd.json_normalize(records)

In [7]:
def filter_data(data, filtered_columns, isJSON=False, data_key=None):
    '''
    Select a subset of columns from a dataframe or from raw JSON text.

    Parameters:
    data : Pandas.DataFrame or str
        A dataframe, or JSON text when isJSON is True.
    filtered_columns : list
        Names of the columns to keep.
    isJSON : bool
        When True, data is first converted with construct_dataframe.
    data_key : str
        Key holding the records inside the JSON; required when isJSON is True.

    Return:
    Pandas.DataFrame
        An independent copy holding only the requested columns.

    Raises:
    ValueError
        If isJSON is True and data_key is None.
    '''
    if isJSON:
        # JSON input needs a key to locate the record list inside the document.
        if data_key is None:
            raise ValueError('Data key argument cannot be none if JSON is supplied.')
        df_data = construct_dataframe(data, data_key)
    else:
        df_data = data
    # Return an explicit copy: callers modify the result in place (e.g. rename),
    # which on a plain column slice triggers pandas' "value is trying to be set
    # on a copy of a slice" warning.
    return df_data[filtered_columns].copy()

In [8]:
def add_pk_col(df,pk_name):
    '''
    Insert a 0-based sequential key as the first column of df, in place.

    Parameters:
    df : Pandas.DataFrame
        The dataframe to modify; it is changed in place and nothing is returned.
    pk_name : str
        Name of the new key column.
    '''
    row_count = len(df.index)
    df.insert(loc=0, column=pk_name, value=range(row_count))

In [9]:
# transforming NaN values

def transform_null(df, column_replace: dict = None, column_no_action: list = None):
    '''
    This function will transform the null values in a dataframe. The default action is to drop every row that
    has a NULL value, unless the column is listed in column_replace or column_no_action.

    Parameters:
    df : Pandas.Dataframe
        A dataframe object to be cleaned. It is not modified.
    column_replace : dictionary
        A dictionary object with (column:replace), where the column name is the key and the replacement value
        is the value. Defaults to no replacements.
    column_no_action : list
        A list of column names that shouldn't be touched. Defaults to an empty list.

    Return:
    Pandas.DataFrame
        A new dataframe object that has been cleaned.

    Raises:
    ValueError
        If column_no_action names a column that is not in df.
    '''
    # None stands in for "nothing supplied" so that no mutable object is
    # shared between calls as a default argument.
    replacements = column_replace if column_replace is not None else {}
    untouched = column_no_action if column_no_action is not None else []

    # Every column not explicitly exempted takes part in the null check.
    columns = list(df.columns)
    for column in untouched:
        columns.remove(column)

    new_df = df.fillna(replacements) if replacements else df.copy()
    return new_df.dropna(subset=columns)


In [10]:
def drop_col(df,column_list):
    '''
    Return a copy of df without the given columns.

    Parameters:
    df : Pandas.DataFrame
        The source dataframe; it is left unchanged.
    column_list : list
        Names of the columns to remove.

    Return:
    Pandas.DataFrame
        A new dataframe lacking the listed columns.
    '''
    return df.drop(columns=column_list)

In [11]:
def get_tags_table(df,cols):
    '''
    Build a one-row-per-tag table from a dataframe with a list-valued 'tags' column.

    Parameters:
    df : Pandas.DataFrame
        Source dataframe; cols must include 'tags'.
    cols : list
        Columns to carry into the result (including 'tags').

    Return:
    Pandas.DataFrame
        The selected columns with 'tags' replaced by 'name' and 'type',
        looked up from each exploded tag entry. The exploded index is kept
        as-is, so index labels repeat for rows that had several tags.
    '''
    exploded = df[cols].explode('tags')
    # After explode each cell holds a single tag entry; .str.get looks up
    # the given key in every entry.
    tag_entries = exploded['tags'].str
    with_fields = exploded.assign(
        name=tag_entries.get('name'),
        type=tag_entries.get('type'),
    )
    return with_fields.drop(columns='tags')

#### Data Transformations

In [12]:
# Flatten the raw recipe JSON into a DataFrame, then prepend a 0-based key
# column named 'recipe_id' (add_pk_col modifies the frame in place).
df_all_recipes = construct_dataframe(all_recipes, recipe_data_key)
add_pk_col(df_all_recipes,'recipe_id')

In [13]:
# Keep only the columns needed for the Recipes table: the key, the recipe name,
# the three flattened user-rating fields and the total time.
filtered_recipes = filter_data(df_all_recipes, ['recipe_id','name','user_ratings.count_positive','user_ratings.count_negative',
                                                'user_ratings.score','total_time_minutes'])
# filtered_recipes

#### Change Column Names

In [14]:
# Rename the flattened API column names to the names used by the Recipes table.
# The result is reassigned instead of renaming in place: filtered_recipes is a
# column subset of df_all_recipes, and the in-place form made pandas emit
# "A value is trying to be set on a copy of a slice from a DataFrame".
filtered_recipes = filtered_recipes.rename(columns={
    'name': 'recipe_name',
    'user_ratings.count_positive': 'up_votes',
    'user_ratings.count_negative': 'down_votes',
    'user_ratings.score': 'score',
})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


#### Transform Filtered Data

In [15]:
# Nulls in the vote, score and time columns are replaced with 0; rows that still
# have a null in any remaining column are dropped by transform_null.
column_replace = {'up_votes':0,'down_votes':0,'score':0,'total_time_minutes':0}
df_clean = transform_null(filtered_recipes, column_replace)

#### Add Popularity Column

In [16]:
# Popularity is the total number of votes a recipe received (up + down).
popularity_score = df_clean['up_votes'] + df_clean['down_votes']
df_clean['popularity'] = popularity_score

In [17]:
# The individual vote counts are removed now that 'popularity' holds their sum;
# final_df is the frame later loaded into the Recipes table.
drop_columns = ['up_votes','down_votes']
final_df = drop_col(df_clean,drop_columns)

In [18]:
# One row per (recipe, tag) pair, built from the unfiltered recipe frame, then
# given its own 0-based key column 'tags_id' (added in place).
tags_df = get_tags_table(df_all_recipes,['recipe_id','tags'])
add_pk_col(tags_df,'tags_id')

In [19]:
# No replacements or exempt columns are given, so every row with a null in any
# column is dropped.
tags_clean_df = transform_null(tags_df)

## Load

#### Declare & Assign Connection Variables for the MySQL Server & Databases with which I'll be Working

In [20]:
# MySQL connection settings used by the load step.
host_name = "localhost"
# host_ip and port are not referenced by the connection strings built in this
# section of the notebook.
host_ip = "127.0.0.1"
port = "3306"
# Credentials may be supplied through the environment so they do not have to be
# stored in the notebook; the original literals remain as fallbacks so existing
# runs behave the same.
# NOTE(review): the fallback password is a committed credential.
user_id = os.environ.get("TASTY_DB_USER", "arong")
pwd = os.environ.get("TASTY_DB_PASSWORD", "Passw0rd123")

# data source = tasty api
dst_dbname = "tasty"

In [21]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    '''
    Run a SQL query against a MySQL database and return the result as a dataframe.

    Parameters:
    user_id : str
        Database user name.
    pwd : str
        Database password.
    host_name : str
        Host of the MySQL server.
    db_name : str
        Database to connect to.
    sql_query : str
        The query to execute.

    Return:
    Pandas.DataFrame
        The query result.
    '''
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even when the query fails;
        # previously an exception in read_sql skipped connection.close().
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # The engine is created per call, so release its pool when done.
        sqlEngine.dispose()


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    '''
    Write a dataframe to a MySQL table.

    Parameters:
    user_id : str
        Database user name.
    pwd : str
        Database password.
    host_name : str
        Host of the MySQL server.
    db_name : str
        Database to connect to.
    df : Pandas.DataFrame
        The data to write (the dataframe index is not written).
    table_name : str
        Target table.
    pk_column : str
        Column that becomes the primary key; only used for "insert".
    db_operation : str
        "insert" replaces the table with the dataframe contents and then adds
        the primary key; "update" appends the rows to the existing table.

    Raises:
    ValueError
        If db_operation is neither "insert" nor "update".
    '''
    # Imported here so the block does not depend on a change to the file's import list.
    from sqlalchemy import text

    # An unknown operation used to be ignored silently, leaving nothing loaded.
    if db_operation not in ("insert", "update"):
        raise ValueError(f"Unsupported db_operation: {db_operation!r}")

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even if the write fails.
        with sqlEngine.connect() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # Run the ALTER on the same connection that wrote the table.
                # NOTE(review): table_name and pk_column are interpolated into
                # the SQL unquoted, so they must come from trusted code.
                connection.execute(
                    text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
                )
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # The engine is created per call, so release its pool when done.
        sqlEngine.dispose()

In [22]:
# Server-level engine (no database in the URL), used for the DDL statements
# that create the destination database and its tables.
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

sqlEngine.execute(f"DROP DATABASE IF EXISTS `{dst_dbname}`;") # drop a database if it already exists
sqlEngine.execute(f"CREATE DATABASE `{dst_dbname}`;") # create new database
# The identifier is quoted the same way as in the two statements above.
# NOTE(review): USE presumably only affects the pooled connection that runs it;
# the later DDL qualifies table names with the database, so it does not rely on this.
sqlEngine.execute(f"USE `{dst_dbname}`;")

<sqlalchemy.engine.result.ResultProxy at 0x7fc523101450>

### Create the Dimension Tables and Populate

In [23]:
#Creating RECIPES table
# `recipe_id` is BIGINT so that it agrees with the `recipe_id` BIGINT foreign key
# declared by the Tags table, and `score` is FLOAT because the scores are
# fractional (the averages recorded at the end of the notebook are all below 1),
# which an INT column would not preserve.
# NOTE(review): set_dataframe(..., 'insert') later writes with if_exists='replace',
# which replaces this table with one whose column types are chosen by pandas.
recipe_table = '''
    CREATE TABLE `tasty`.`Recipes` (
      `recipe_id` BIGINT NOT NULL,
      `recipe_name` VARCHAR(200) NULL,
      `score` FLOAT NULL,
      `total_time_minutes` INT NULL,
      `popularity` FLOAT NULL,
      PRIMARY KEY (`recipe_id`));
      '''
drop_tag = '''DROP TABLE IF EXISTS tasty.Tags;'''
drop_table = '''
DROP TABLE IF EXISTS tasty.Recipes;
'''
# Tags is dropped first because its DDL declares a foreign key to Recipes.
sqlEngine.execute(drop_tag)

sqlEngine.execute(drop_table)
sqlEngine.execute(recipe_table)

<sqlalchemy.engine.result.ResultProxy at 0x7fc51e1ecb90>

In [24]:
# Load the cleaned recipe frame into the Recipes table. 'insert' makes
# set_dataframe write with if_exists='replace' and then add the primary key.
table_name = 'Recipes'
pk_column = 'recipe_id'
db_operation = 'insert'

set_dataframe(user_id, pwd, host_name, dst_dbname, final_df, table_name, pk_column, db_operation)

In [25]:
#Creating the TAGS table
# Each tag row points back to its recipe through a foreign key on recipe_id.
# NOTE(review): recipe_id is BIGINT here although the Recipes DDL declares INT;
# presumably this matches the column type pandas creates when the Recipes load
# replaces that table -- confirm before changing either side.
tags_table = '''
    CREATE TABLE `tasty`.`Tags` (
      `tags_id` INT NOT NULL,
      `recipe_id` BIGINT NULL,
      `name` VARCHAR(45) NULL,
      `type` VARCHAR(45) NULL,
      PRIMARY KEY (`tags_id`),
      FOREIGN KEY (`recipe_id`) REFERENCES Recipes(`recipe_id`));
      '''

sqlEngine.execute(tags_table)

<sqlalchemy.engine.result.ResultProxy at 0x7fc51e2ef150>

In [26]:
# Load the cleaned tag frame into the Tags table.
# NOTE(review): 'insert' makes set_dataframe write with if_exists='replace', so
# the Tags table created by the DDL above is replaced; presumably the foreign
# key declared there does not survive -- verify against the database.
table_name = 'Tags'
pk_column = 'tags_id'
db_operation = 'insert'

set_dataframe(user_id, pwd, host_name, dst_dbname, tags_clean_df, table_name, pk_column, db_operation)

In [27]:
# Average score and popularity per tag type, joining each recipe to its tags,
# ordered by average popularity and then by average score (both descending).
query = '''
SELECT t.type,AVG(score) AS avg_score,AVG(popularity) AS avg_pop FROM tasty.Recipes r
INNER JOIN tasty.Tags t ON r.recipe_id = t.recipe_id
GROUP BY t.type
ORDER BY avg_pop DESC, avg_score DESC;
'''
results = get_dataframe(user_id, pwd, host_name, dst_dbname, query)
results
# Just a fun query - in the recorded run, holiday dishes have the highest average popularity,
# followed by dish_style, which has the highest average score.

Unnamed: 0,type,avg_score,avg_pop
0,holiday,0.833333,24.0
1,dish_style,0.865741,20.25
2,equipment,0.826588,11.888889
3,method,0.808727,11.666667
4,dietary,0.680992,9.966667
5,appliance,0.714563,9.823529
6,meal,0.616492,8.457143
7,occasion,0.613225,6.052632
8,seasonal,0.2,5.0
9,cuisine,0.580491,4.608696
