# Exploratory Data Analysis with Macroeconomic Data

In [None]:
import yaml
import numpy as np
import pandas as pd
from string import ascii_letters
from sqlalchemy import create_engine
from plotnine import *
import matplotlib.pyplot as plt
import seaborn as sns

## Connect With Server

In [None]:
# Import profile information
# Requires a .yaml file

# MUST EDIT THIS LINE FOR YOUR FILE NAME
# The context manager closes the file handle once the config is parsed.
# safe_load is used because yaml.load without an explicit Loader can construct
# arbitrary Python objects and is rejected outright by PyYAML >= 6.
with open("../config.yaml", 'r') as file:
    config = yaml.safe_load(file)

In [None]:
# Build the SQLAlchemy engine: the username, password and host read from
# config['db'] are substituted, in that order, into the connection URL template.
engine_path = 'postgresql://{}:{}@{}/direccion_trabajo_inspections'
engine = create_engine(
    engine_path.format(*(config['db'][key] for key in ('username', 'password', 'host')))
)

## Example of SQL query from database

In [None]:
# Example query: first 30 rows of raw.inspections_sample.
# Every query has to start by setting the role, as done here.
qry = 'set role direccion_trabajo_inspections_write;select * from raw.inspections_sample limit 30;'
pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Query text only (executed in the next cell): variable names together with
# their Spanish and English descriptions from raw.metadata.
qry = """
        set role direccion_trabajo_inspections_write; select variable_name, 
        description_spanish, description_english from raw.metadata;
        """

In [None]:
# Run the query currently held in `qry` and display the resulting DataFrame
pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Fetch emptrabhombres and infra for every row of raw.inspections_complete
# whose infra is greater than zero.
qry = """
        set role direccion_trabajo_inspections_write;
        select emptrabhombres, infra
        from raw.inspections_complete where infra > 0;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the last five rows of the query result
df.tail(5)

In [None]:
# Scatter plot of infra against emptrabhombres, one '+' marker per row
fig, ax = plt.subplots()
ax.scatter(df['emptrabhombres'], df['infra'], marker='+')
ax.set(
    xlabel='Number of Employees',
    ylabel='Number of Infractions',
    title='Infractions-Size Plot',
)
plt.show()

In [None]:
# Fetch totalafectados and infra for every row of raw.inspections_complete
# whose infra is greater than zero.
qry = """
        set role direccion_trabajo_inspections_write;
        select totalafectados, infra
        from raw.inspections_complete where infra > 0;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Scatter plot of infra against totalafectados, one 'o' marker per row
fig, ax = plt.subplots()
ax.scatter(df['totalafectados'], df['infra'], marker='o')
ax.set(
    xlabel='Number of Employees Affected',
    ylabel='Number of Infractions',
    title='Infractions-Total Affected Plot',
)
plt.show()

In [None]:
# Average emptrabhombres per region (region cast to int), ordered by region
qry = """
        set role direccion_trabajo_inspections_write;
        select cast(region as int), avg(emptrabhombres) as avgsize
        from raw.inspections_complete group by region order by region asc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Display the full query result currently held in df
df

In [None]:
# Bar chart of avgsize, one bar per region
fig, ax = plt.subplots()
ax.bar(df['region'], df['avgsize'])
ax.set(
    xlabel='Region',
    ylabel='Average Number of Employees',
    title='Average Number of Employees by Region',
)
plt.show()

In [None]:
# Average infra per region, over rows with infra > 0 only, ordered by region
qry = """
        set role direccion_trabajo_inspections_write;
        select cast(region as int), avg(infra) as avginfra
        from raw.inspections_complete where infra > 0 group by region order by region asc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Bar chart of avginfra, one bar per region
fig, ax = plt.subplots()
ax.bar(df['region'], df['avginfra'])
ax.set(
    xlabel='Region',
    ylabel='Average Number of Infractions',
    title='Average Number of Infractions by Region',
)
plt.show()

In [None]:
# Average num_materias per region, over rows with num_materias > 0 only,
# ordered by region
qry = """
        set role direccion_trabajo_inspections_write;
        select cast(region as int), avg(num_materias) as avgnummaterias
        from raw.inspections_complete where num_materias > 0 group by region order by region asc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Bar chart of avgnummaterias, one bar per region
fig, ax = plt.subplots()
ax.bar(df['region'], df['avgnummaterias'])
ax.set(
    xlabel='Region',
    ylabel='Avg. Number of Matters Inspected',
    title='Average Number of Inspected Matters by Region',
)
plt.show()

In [None]:
# Average infra for each (mesreg, region) pair, over rows with infra > 0 only,
# ordered by mesreg and then region
qry = """
        set role direccion_trabajo_inspections_write;
        select cast(mesreg as int), cast(region as int), avg(infra) as avginfra
        from raw.inspections_complete where infra > 0 group by mesreg, region order by mesreg, region asc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Grouped bar chart of avginfra: one group per mesreg value, one bar per region.
# The previous placeholder drew mesreg and region ids as bar heights and stacked
# all rows sharing a mesreg on the same x position.
# Reshape the long (mesreg, region, avginfra) result into a mesreg x region
# table so that each region becomes its own bar series.
# NOTE(review): mesreg is presumably the registration month - confirm.
infra_by_month_region = df.pivot(index='mesreg', columns='region', values='avginfra')

fig = plt.figure()
ax = fig.add_subplot(111)

infra_by_month_region.plot(kind='bar', ax=ax, width=0.8)

ax.set_xlabel('Month (mesreg)')
ax.set_ylabel('Average Number of Infractions')
ax.set_title('Average Number of Infractions by Month and Region')
ax.legend(title='Region')

plt.show()

## Macroeconomic Data

In [None]:
# Per datereg: average copper value (left-joined on the date) and the count
# of non-null infra entries, newest date first
qry = """
        set role direccion_trabajo_inspections_write;
        select inspect.datereg as date, avg(copper.value) as copprice, count(inspect.infra) as numinfra from 
        raw.inspections_complete as inspect left join raw.copper as copper on
        inspect.datereg=copper.date group by inspect.datereg order by inspect.datereg desc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the first five rows of the query result
df.head(5)

In [None]:
# Time series plot of infractions and copper price scaled by maximum value

# Set the default figure size before creating the figure: rcParams only
# applies to figures created after the assignment.
plt.rcParams["figure.figsize"] = [20,15]

fig = plt.figure()
ax1 = fig.add_subplot(111)

# Series.max() skips NaN (copprice can be NaN for dates the left join did not
# match); the builtin max() is order-dependent with NaN and may return NaN.
ax1.plot_date(df['date'], df['copprice']/df['copprice'].max(), color='black', label='Copper Price')
ax1.plot_date(df['date'], df['numinfra']/df['numinfra'].max(), color='green', label='Infractions')
ax1.set_title('(Scaled) Infractions and Copper Price over Time')

plt.legend()
plt.show()

In [None]:
# Same per-date copper/infra aggregation, restricted to rows with crae = '103'
qry = """
        set role direccion_trabajo_inspections_write;
        select inspect.datereg as date, avg(copper.value) as copprice, count(inspect.infra) as numinfra from 
        raw.inspections_complete as inspect left join raw.copper as copper on
        inspect.datereg=copper.date where inspect.crae = '103'group by inspect.datereg order by inspect.datereg desc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the first five rows of the query result
df.head(5)

In [None]:
# Time series plot of infractions and copper price scaled by maximum value
fig = plt.figure()
ax1 = fig.add_subplot(111)

# Series.max() skips NaN (copprice can be NaN for dates the left join did not
# match); the builtin max() is order-dependent with NaN and may return NaN.
ax1.plot_date(df['date'], df['copprice']/df['copprice'].max(), color='black', label='Copper Price')
ax1.plot_date(df['date'], df['numinfra']/df['numinfra'].max(), color='purple', label='Infractions')
ax1.set_title('(Scaled) Infractions and Copper Price over Time in Mining Sector')

plt.legend()
plt.show()

In [None]:
# Same per-date copper/infra aggregation, restricted to rows with
# ccae = '133000' and infra > 0
qry = """
        set role direccion_trabajo_inspections_write;
        select inspect.datereg as date, avg(copper.value) as copprice, count(inspect.infra) as numinfra from 
        raw.inspections_complete as inspect left join raw.copper as copper on
        inspect.datereg=copper.date where inspect.ccae = '133000' AND inspect.infra > 0 group by inspect.datereg order by inspect.datereg desc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the first five rows of the query result
df.head(5)

In [None]:
# Pairwise correlation of the numeric columns of df.
# The non-numeric date column is dropped explicitly: pandas >= 2.0 no longer
# discards non-numeric columns in corr() and raises instead.
df.select_dtypes(include='number').corr()

In [None]:
# Copper price and infraction counts over time, each divided by its own maximum
fig = plt.figure()
ax1 = fig.add_subplot(111)

# (column, colour, legend label) for each scaled series, drawn in this order
scaled_series = (
    ('copprice', 'black', 'Copper Price'),
    ('numinfra', 'blue', 'Infractions'),
)
for column, colour, legend_label in scaled_series:
    ax1.plot_date(df['date'], df[column]/df[column].max(), color=colour, label=legend_label)
ax1.set_title('(Scaled) Infractions and Copper Price over Time in Copper Mining Sector')

plt.legend()
plt.show()

In [None]:
# Per mesreg: average total_retail_trade (left-joined from
# raw.macroeconomic_monthly) and the count of non-null infra entries.
# NOTE(review): the join matches datereg to macroeconomic_monthly.date exactly -
# confirm this is the intended way to attach monthly figures.
qry = """
        set role direccion_trabajo_inspections_write;
        select inspect.mesreg as date, avg(macroeconomic_monthly.total_retail_trade) as retail_trade, count(inspect.infra) as 
        numinfra from raw.inspections_complete as inspect left join raw.macroeconomic_monthly as macroeconomic_monthly on
        inspect.datereg=macroeconomic_monthly.date group by inspect.mesreg order by date asc;
    """
df = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the first 30 rows of the query result
df.head(30)

In [None]:
# Time series plot of infractions and retail trade scaled by maximum value

# Set the default figure size before creating the figure: rcParams only
# applies to figures created after the assignment.
plt.rcParams["figure.figsize"] = [20,15]

fig = plt.figure()
ax1 = fig.add_subplot(111)

# Series.max() skips NaN (retail_trade can be NaN where the left join found no
# match); the builtin max() is order-dependent with NaN and may return NaN.
ax1.plot(df['date'], df['retail_trade']/df['retail_trade'].max(), color='blue', label='Retail Trade')
ax1.plot(df['date'], df['numinfra']/df['numinfra'].max(), color='red', label='Infractions')
ax1.set_title('(Scaled) Total Retail Trade and Number of Infractions over Time')

plt.legend()
plt.show()

In [None]:
# Load the whole raw.copper table
qry = """
        set role direccion_trabajo_inspections_write;
        select * from raw.copper;
    """
copper = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Summary statistics for the copper table
copper.describe()

In [None]:
# Load the whole raw.macroeconomic_monthly table
qry = """
        set role direccion_trabajo_inspections_write;
        select * from raw.macroeconomic_monthly;
    """
macro_monthly = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Summary statistics for the monthly macroeconomic table
macro_monthly.describe()

In [None]:
# Correlation heatmap of the numeric columns of macro_monthly.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer discards
# them in corr() and raises instead.
corr = macro_monthly.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Colour scale is centred on 0 and capped at 0.3
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Load the whole raw.macroeconomic_quarterly table
qry = """
        set role direccion_trabajo_inspections_write;
        select * from raw.macroeconomic_quarterly;
    """
macro_quarterly = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Summary statistics for the quarterly macroeconomic table
macro_quarterly.describe()

In [None]:
# Correlation heatmap of the numeric columns of macro_quarterly.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer discards
# them in corr() and raises instead.
corr2 = macro_quarterly.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr2, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))

# NOTE(review): 'deep' is a qualitative palette, unlike the diverging palette
# used by the other correlation heatmaps in this notebook - confirm it is intended.
cmap = sns.color_palette('deep', 10)

# Colour scale is centred on 0 and capped at 0.3
sns.heatmap(corr2, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Join each inspection (id, infra, emptrabhombres) to the monthly macroeconomic
# indicators on the month and year extracted from the respective date columns.
# Both subqueries expose month and year, so `select *` returns those names twice.
qry = """
        set role direccion_trabajo_inspections_write;
        select * from
        
        (select idfiscalizacion, extract(month from datereg) as month, extract(year from datereg) as year, infra,
        emptrabhombres
        from raw.inspections_complete) inspect
        
        join
        
        (select extract(month from date) as month, extract(year from date) as year,
        cpi, exchange_rate, civilian_labor_force, total_retail_trade, active_population,
        employment_rate, inactivity_rate, unemployment_rate
        from raw.macroeconomic_monthly) macro_data
        
        on inspect.month=macro_data.month and inspect.year=macro_data.year;
    """
join_macro_inspections = pd.read_sql_query(sql=qry, con=engine)

In [None]:
# Show the first five rows of the joined table
join_macro_inspections.head(5)

In [None]:
# Summary statistics for the joined inspections/macroeconomic table
join_macro_inspections.describe()

In [None]:
# Correlation heatmap of the numeric columns of join_macro_inspections.
# Non-numeric columns are dropped explicitly: pandas >= 2.0 no longer discards
# them in corr() and raises instead.
corr3 = join_macro_inspections.select_dtypes(include='number').corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr3, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(11, 9))

cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Colour scale is centred on 0 and capped at 0.3
sns.heatmap(corr3, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})