In [1]:
# Config
# Tables
import pandas as pd
import pyreadstat
# Charting
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Math
import numpy as np
# Tools
import os

import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [2]:
# Folder holding the raw USITC CSV extracts.
path = './data/USITC/'

# os.listdir also returns sub-directories and hidden entries (for example
# .ipynb_checkpoints or .DS_Store) that pd.read_csv cannot parse, so keep only
# regular, non-hidden files.
archivos = [
    archivo for archivo in os.listdir(path)
    if not archivo.startswith('.') and os.path.isfile(os.path.join(path, archivo))
]

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all previously loaded rows on every iteration.
frames = [pd.read_csv(os.path.join(path, archivo)) for archivo in archivos]

# Each file keeps its own row index (no ignore_index), so index labels repeat
# across files. An empty folder yields an empty frame instead of raising.
df_original = pd.concat(frames) if frames else pd.DataFrame()

df_original

Unnamed: 0,Fecha,Partida,Descripcion,Pais,Region,Unidad,Cantidad,FOB
0,20220201,7318.19.00,"Iron or steel, threaded articles similar to sc...",Finland,Europa y Asia central,kilograms,21,2117
1,20220201,7318.16.00,"Iron or steel, nuts",Germany,Europa y Asia central,kilograms,22,2154
2,20220201,7307.92.90,"Iron or steel (o/than stainless), not cast, th...",Italy,Europa y Asia central,kilograms,191,2173
3,20220101,7318.22.00,"Iron or steel, washers (o/than spring washers ...",United Kingdom,Europa y Asia central,kilograms,1,2380
4,20220101,7315.11.00,"Iron or steel, roller chain",Taiwan,Asia oriental y el Pacifico,kilograms,238,2149
...,...,...,...,...,...,...,...,...
12850,20200801,7326.90.86,"Iron or steel, articles, nesoi",Vietnam,Asia oriental y el Pacifico,kilograms,0,0
12851,20200901,7326.90.86,"Iron or steel, articles, nesoi",Vietnam,Asia oriental y el Pacifico,kilograms,1186,18275
12852,20201001,7326.90.86,"Iron or steel, articles, nesoi",Vietnam,Asia oriental y el Pacifico,kilograms,0,0
12853,20201101,7326.90.86,"Iron or steel, articles, nesoi",Vietnam,Asia oriental y el Pacifico,kilograms,0,0


In [3]:
def _parse_number(column):
    """Convert a column of numbers, possibly written with ',' separators, to float.

    Values are stringified before the separator is stripped so that entries
    pandas already parsed as numbers survive: the ``.str`` accessor yields NaN
    for non-string elements and is not available on purely numeric columns.
    Missing values stay missing.
    """
    text = column.astype(str).str.replace(',', '', regex=False)
    # Restore the original missing values (astype(str) turned them into text).
    return text.where(column.notna()).astype(float)


def clean(df_original):
    """Return a typed, filtered and sorted copy of the raw trade table.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Raw table with at least the columns 'Fecha' (YYYYMMDD), 'Partida',
        'Pais', 'Cantidad' and 'FOB'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy with 'Fecha' as datetime, 'Cantidad' and 'FOB' as float, plus the
        derived columns 'Precio' (FOB / Cantidad), 'HS2' and 'HS4' (first two
        and four characters of 'Partida'). Rows with an infinite price or any
        missing value are removed; rows are sorted by 'Cantidad' then 'Pais',
        both descending.
    """
    df = df_original.copy()

    # Set column formats
    ## Fecha -> datetime
    df['Fecha'] = pd.to_datetime(df['Fecha'], format='%Y%m%d')
    ## Cantidad, FOB -> float
    df['Cantidad'] = _parse_number(df['Cantidad'])
    df['FOB'] = _parse_number(df['FOB'])

    # Derived columns: unit price and the HS code prefixes
    df['Precio'] = df['FOB'] / df['Cantidad']
    df['HS2'] = df['Partida'].str[:2]
    df['HS4'] = df['Partida'].str[:4]

    # Drop infinite prices (division by a zero quantity). np.isinf is used
    # because the np.Infinity alias no longer exists in NumPy 2.0, and it also
    # covers -inf. NaN prices (0 / 0) are removed by dropna below.
    df = df[~np.isinf(df['Precio'])]
    df = df.dropna().sort_values(by=['Cantidad', 'Pais'], ascending=False)

    return df

# Build the cleaned dataset from the concatenated raw files and display it.
df_total = clean(df_original)
df_total

Unnamed: 0,Fecha,Partida,Descripcion,Pais,Region,Unidad,Cantidad,FOB,Precio,HS2,HS4
6417,2020-07-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,8107590.0,3817010.0,0.470795,72,7214
12309,2020-07-01,7326.20.00,"Iron or steel, articles of wire, nesoi",China,Asia oriental y el Pacifico,number,7232480.0,43895.0,0.006069,73,7326
6419,2020-09-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,6878665.0,3130017.0,0.455033,72,7214
600,2021-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Turkey,Europa y Asia central,kilograms,5915815.0,4365484.0,0.737935,72,7214
6420,2020-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,5779984.0,2734497.0,0.473098,72,7214
...,...,...,...,...,...,...,...,...,...,...,...
9026,2020-12-01,7309.00.00,"Iron/steel, reservoirs, tanks, vats, siml. con...",Canada,America del Norte,number,1.0,9195.0,9195.000000,73,7309
9,2021-01-01,7204.29.00,Alloy steel (o/than stainless) waste and scrap,Br Virgin Is,Gran Caribe,metric tons,1.0,2375.0,2375.000000,72,7204
5301,2020-07-01,7204.29.00,Alloy steel (o/than stainless) waste and scrap,Br Virgin Is,Gran Caribe,metric tons,1.0,3200.0,3200.000000,72,7204
3823,2021-08-01,7318.15.80,"Iron or steel, screws and bolts, nesoi, having...",Belgium,Europa y Asia central,kilograms,1.0,3288.0,3288.000000,73,7318


In [4]:
def unidades(df):
    """Show a sunburst chart of total FOB split by unit of measure, then by HS2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Unidad', 'HS2' and 'FOB'. It is not modified.
    """
    # Aggregate only the plotted column: a blanket .sum() would also try to add
    # up the datetime and text columns, which fails on pandas >= 2.0.
    totals = df.groupby(by=['Unidad', 'HS2'], as_index=False)['FOB'].sum()

    fig = px.sunburst(
        totals,
        values='FOB',
        path=['Unidad', 'HS2'],
    )
    fig.show()
# Draw the unit / HS2 sunburst for the full cleaned dataset (all units).
unidades(df_total)

In [5]:
def solo_unidades(unidad, df):
    """Return the rows of ``df`` whose 'Unidad' column equals ``unidad``."""
    matches_unit = df['Unidad'] == unidad
    return df.loc[matches_unit]

# Keep only the rows measured in kilograms; later cells reuse this frame as `df`.
df = solo_unidades('kilograms', df_total)
df

Unnamed: 0,Fecha,Partida,Descripcion,Pais,Region,Unidad,Cantidad,FOB,Precio,HS2,HS4
6417,2020-07-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,8107590.0,3817010.0,0.470795,72,7214
6419,2020-09-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,6878665.0,3130017.0,0.455033,72,7214
600,2021-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Turkey,Europa y Asia central,kilograms,5915815.0,4365484.0,0.737935,72,7214
6420,2020-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,5779984.0,2734497.0,0.473098,72,7214
6421,2020-11-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,5707382.0,2861072.0,0.501293,72,7214
...,...,...,...,...,...,...,...,...,...,...,...
290,2021-12-01,7318.29.00,"Iron or steel, nonthreaded articles similar to...",France,Europa y Asia central,kilograms,1.0,3170.0,3170.000000,73,7318
3542,2021-06-01,7317.00.65,"Iron or steel, nails, tacks, corrugated nails,...",France,Europa y Asia central,kilograms,1.0,2157.0,2157.000000,73,7317
10865,2020-03-01,7318.15.80,"Iron or steel, screws and bolts, nesoi, having...",France,Europa y Asia central,kilograms,1.0,5124.0,5124.000000,73,7318
220,2021-12-01,7320.20.50,"Iron or steel, helical springs (o/than suitabl...",Czech Republic,Europa y Asia central,kilograms,1.0,2349.0,2349.000000,73,7320


In [6]:
def _weighted_avg_price(df):
    """Return the Cantidad-weighted mean Precio per (Fecha, HS2) as columns Mes, Precio, HS2."""
    # sum(Precio * Cantidad) / sum(Cantidad) is the weighted mean, computed for
    # all groups at once. Only (Fecha, HS2) pairs that actually occur produce a
    # row, so a month lacking one HS2 no longer makes np.average fail on an
    # empty selection (weights summing to zero).
    weighted = df.assign(_valor=df['Precio'] * df['Cantidad'])
    sums = weighted.groupby(['Fecha', 'HS2'], sort=False)[['_valor', 'Cantidad']].sum()
    avg_df = (sums['_valor'] / sums['Cantidad']).rename('Precio').reset_index()
    avg_df = avg_df.rename(columns={'Fecha': 'Mes'})[['Mes', 'Precio', 'HS2']]
    return avg_df.sort_values(by=['Mes'], kind='stable').reset_index(drop=True)


def average(df):
    """Plot and return the quantity-weighted average price per month and HS2.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Fecha', 'HS2', 'Precio' and 'Cantidad'.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'Mes' (the 'Fecha' value), 'Precio' (mean of 'Precio' weighted
        by 'Cantidad') and 'HS2', sorted by 'Mes'.
    """
    avg_df = _weighted_avg_price(df)

    # Chart: one line per HS2
    fig = px.line(
        avg_df,
        x='Mes',
        y='Precio',
        title='Precio Promedio x Fecha',
        color='HS2'
    )
    fig.show()

    return avg_df

# Plot the weighted average price for the kilograms-only frame and keep the table.
avg_df = average(df)

In [7]:
def sum_Cantidad_df(df, n_countries=10):
    """Aggregate the numeric columns by country and rank countries by quantity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Pais' and 'Cantidad'. It is not modified.
    n_countries : int, default 10
        Number of leading countries kept individually.

    Returns
    -------
    tuple
        ``(top, totals, normalized_sum)`` where

        * ``top`` is the 'Cantidad' Series of the ``n_countries`` largest
          countries plus an extra 'other' entry holding the sum of the rest,
        * ``totals`` is the per-country sum of every numeric column, sorted by
          'Cantidad' descending,
        * ``normalized_sum`` is ``totals['Cantidad']`` min-max scaled to [0, 1]
          (all NaN when every country has the same total).
    """
    # numeric_only=True: summing the datetime / text columns is meaningless and
    # fails on pandas >= 2.0.
    totals = (
        df.groupby(by=['Pais'])
        .sum(numeric_only=True)
        .sort_values(by=['Cantidad'], ascending=False)
    )

    # Explicit copy so that adding the 'other' row does not write into a slice
    # of `totals` (the source of the SettingWithCopyWarning).
    top = totals.iloc[:n_countries].copy()
    top.loc['other'] = totals.iloc[n_countries:].sum()

    cantidad = totals['Cantidad']
    normalized_sum = (cantidad - cantidad.min()) / (cantidad.max() - cantidad.min())

    return top['Cantidad'], totals, normalized_sum

# Aggregate the kilograms-only frame by country, keeping the 10 largest by
# quantity, and display the min-max normalized totals.
top_countries, full_sum, normalized_sum = sum_Cantidad_df(df, n_countries=10)
normalized_sum



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Pais
Dominican Rep    1.000000
Turkey           0.603013
Mexico           0.343472
Costa Rica       0.177849
India            0.163302
                   ...   
Jamaica          0.000009
New Zealand      0.000007
Czechia          0.000004
Norway           0.000003
Croatia          0.000000
Name: Cantidad, Length: 65, dtype: float64

In [8]:
def get_df_for_top_countries(df, top_countries):
    """Keep the rows of ``df`` whose 'Pais' is an index label of ``top_countries``.

    Rows containing any missing value are dropped from the result; ``df`` itself
    is left untouched.
    """
    wanted = top_countries.index.values
    in_top = df['Pais'].isin(wanted)
    return df.loc[in_top].dropna()

# Rows of the kilograms-only frame that belong to the top countries computed above.
df_top = get_df_for_top_countries(df, top_countries)
df_top

Unnamed: 0,Fecha,Partida,Descripcion,Pais,Region,Unidad,Cantidad,FOB,Precio,HS2,HS4
6417,2020-07-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,8107590.0,3817010.0,0.470795,72,7214
6419,2020-09-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,6878665.0,3130017.0,0.455033,72,7214
600,2021-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Turkey,Europa y Asia central,kilograms,5915815.0,4365484.0,0.737935,72,7214
6420,2020-10-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,5779984.0,2734497.0,0.473098,72,7214
6421,2020-11-01,7214.20.00,"Iron/nonalloy steel, concrete reinforcing bars...",Dominican Rep,Gran Caribe,kilograms,5707382.0,2861072.0,0.501293,72,7214
...,...,...,...,...,...,...,...,...,...,...,...
3713,2021-06-01,7318.15.20,"Iron or steel, bolts and bolts & their nuts or...",Spain,Europa y Asia central,kilograms,1.0,6320.0,6320.000000,73,7318
3847,2021-05-01,7318.15.80,"Iron or steel, screws and bolts, nesoi, having...",Spain,Europa y Asia central,kilograms,1.0,3480.0,3480.000000,73,7318
4023,2021-01-01,7318.29.00,"Iron or steel, nonthreaded articles similar to...",Spain,Europa y Asia central,kilograms,1.0,2480.0,2480.000000,73,7318
4027,2021-05-01,7318.29.00,"Iron or steel, nonthreaded articles similar to...",Spain,Europa y Asia central,kilograms,1.0,7096.0,7096.000000,73,7318


In [9]:
def treemap(HS2, df, range_color=None):
    """Show a Region > Pais treemap sized by quantity and coloured by price.

    Parameters
    ----------
    HS2 : str
        Value of the 'HS2' column to plot.
    df : pandas.DataFrame
        Must contain the columns 'HS2', 'Region', 'Pais', 'Cantidad' and
        'Precio'. It is not modified.
    range_color : list of two numbers, optional
        Lower and upper bound of the colour scale. When omitted, the presets
        below are used: [0.5, 1.5] for HS2 '72' and [0.8, 2.5] otherwise.
    """
    # Filter directly; the previous full copy of the frame was discarded at once.
    subset = df[df['HS2'] == HS2]

    if range_color is None:
        range_color = [0.5, 1.5] if HS2 == str(72) else [0.8, 2.5]

    fig = px.treemap(
        subset,
        path=['Region', 'Pais'],
        values='Cantidad',
        color='Precio',
        names='Pais',
        color_continuous_scale=['green', 'grey', 'red'],
        # Centre the colour scale on the quantity-weighted mean price.
        color_continuous_midpoint=np.average(subset['Precio'], weights=subset['Cantidad']),
        range_color=range_color,
        title=f'Treemap Region-Pais del Volumen de {HS2} exportado y su Precio'
    )
    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    fig.show()

# One treemap per distinct HS2 value present in the kilograms-only frame.
for HS2 in df.HS2.unique():
    treemap(HS2, df)

In [10]:
def group_HS4(df):
    """Return the sum of every numeric column per ('HS2', 'HS4') pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'HS2' and 'HS4'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per ('HS2', 'HS4') pair with the group keys as regular columns
        followed by the summed numeric columns.
    """
    # numeric_only=True skips the datetime / text columns, whose sum is
    # meaningless and fails on pandas >= 2.0.
    return df.groupby(by=['HS2', 'HS4']).sum(numeric_only=True).reset_index()
# Display the per-HS4 totals of the kilograms-only frame.
group_HS4(df)

Unnamed: 0,HS2,HS4,Cantidad,FOB,Precio
0,72,7205,2749.0,5227.0,1.901419
1,72,7207,41651.0,38888.0,0.933663
2,72,7208,5124940.0,4746986.0,62.209996
3,72,7209,89754.0,95943.0,7.648478
4,72,7210,87185156.0,97218639.0,431.193646
5,72,7211,340323.0,359641.0,9.226277
6,72,7212,8864962.0,9934432.0,120.202521
7,72,7213,15267421.0,9795572.0,24.62615
8,72,7214,163438518.0,94948347.0,86.864515
9,72,7215,82366.0,83813.0,7.625973


In [27]:
def pie_HS(df):
    """Show a sunburst chart of total quantity split by HS2, then by HS4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'HS2', 'HS4' and 'Cantidad'. It is not modified.
    """
    # Aggregate only the plotted column: a blanket .sum() would also try to add
    # up the datetime and text columns, which fails on pandas >= 2.0.
    totals = df.groupby(by=['HS2', 'HS4'], as_index=False)['Cantidad'].sum()

    fig = px.sunburst(
        totals,
        values='Cantidad',
        path=['HS2', 'HS4'],
    )
    fig.show()
# Draw the HS2 / HS4 sunburst for the full cleaned dataset (all units).
pie_HS(df_total)

In [34]:
def barplot(df, HS4, range_color, region, height=800):
    """Show bars of quantity over time, coloured by price, faceted by region and HS4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'HS4', 'Region', 'Fecha', 'Cantidad', 'Precio'
        and 'Pais'. It is not modified.
    HS4 : list of str
        'HS4' values to keep; each one becomes a facet row.
    range_color : list of two numbers
        Lower and upper bound of the price colour scale.
    region : list of str
        'Region' values to keep; each one becomes a facet column.
    height : int, default 800
        Figure height in pixels.
    """
    # Same order of operations as before: filter by HS4, sort, then filter by region.
    df = df[df['HS4'].isin(HS4)].sort_values(by=['HS4'])
    df = df[df['Region'].isin(region)]

    fig = px.bar(
        df,
        x='Fecha',
        y='Cantidad',
        color='Precio',
        facet_col='Region',
        facet_row='HS4',
        range_color=range_color,
        color_continuous_scale=['green', 'grey', 'red'],
        hover_data=['Pais'],
        title='Volumen Exportado por Region y precio (color)',
    )
    fig.update_layout(height=height)
    fig.show()


In [35]:
# HS2 '72' rows of the kilograms-only frame, for four HS4 values and three regions.
barplot(df[df['HS2'] == '72'], HS4=['7214','7210','7217','7213'],range_color=[0.5,2], region=['Gran Caribe', 'Europa y Asia central','America Latina'])

In [36]:
# HS2 '73' rows of the kilograms-only frame, for three HS4 values and five regions.
# NOTE(review): 'Asia meridonial' must match the spelling used in the 'Region'
# column exactly, otherwise that region is silently filtered out — confirm.
barplot(df[df['HS2'] == '73'], HS4=['7306','7326','7310'],range_color=[0.5,2], region=['Asia meridonial','Gran Caribe','America Latina','Oriente Medio y Norte de Africa','Asia oriental y el Pacifico'])

In [None]:
def RDplot(df, HS2, range_color=(0.5, 2), height=8000):
    """Show bars of quantity over time, coloured by price, with one facet row per HS4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'HS2', 'HS4', 'Fecha', 'Cantidad' and 'Precio'.
        It is not modified.
    HS2 : str
        Value of the 'HS2' column to plot.
    range_color : sequence of two numbers, default (0.5, 2)
        Lower and upper bound of the price colour scale.
    height : int, default 8000
        Figure height in pixels; large because every HS4 value gets its own row.
    """
    # NOTE(review): the title mentions Republica Dominicana but no 'Pais' filter
    # is applied here — presumably the caller is expected to pre-filter; confirm.
    df = df[df['HS2'] == HS2]
    df = df.sort_values(by=['HS4'])

    fig = px.bar(
        df,
        x='Fecha',
        y='Cantidad',
        color='Precio',
        facet_row='HS4',
        facet_row_spacing=0.01,
        range_color=list(range_color),
        color_continuous_scale=['green', 'grey', 'red'],
        title='Volumen Exportado a Republica Dominicana y precio (color)',
    )
    fig.update_layout(height=height)
    fig.show()

# Plot every HS4 of HS2 '72' from the kilograms-only frame.
RDplot(df,'72')

In [42]:
# NOTE(review): this cell redefines RDplot; a function with the same name is
# already defined in the previous cell and is shadowed by this one. Consider
# keeping a single definition.
def RDplot(df, HS2, range_color=(0.5, 2), height=8000):
    """Show bars of quantity over time, coloured by price, with one facet row per HS4.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'HS2', 'HS4', 'Fecha', 'Cantidad' and 'Precio'.
        It is not modified.
    HS2 : str
        Value of the 'HS2' column to plot.
    range_color : sequence of two numbers, default (0.5, 2)
        Lower and upper bound of the price colour scale.
    height : int, default 8000
        Figure height in pixels; large because every HS4 value gets its own row.
    """
    # NOTE(review): the title mentions Republica Dominicana but no 'Pais' filter
    # is applied here — presumably the caller is expected to pre-filter; confirm.
    df = df[df['HS2'] == HS2]
    df = df.sort_values(by=['HS4'])

    fig = px.bar(
        df,
        x='Fecha',
        y='Cantidad',
        color='Precio',
        facet_row='HS4',
        facet_row_spacing=0.01,
        range_color=list(range_color),
        color_continuous_scale=['green', 'grey', 'red'],
        title='Volumen Exportado a Republica Dominicana y precio (color)',
    )
    fig.update_layout(height=height)
    fig.show()

# Plot every HS4 of HS2 '72' from the kilograms-only frame.
RDplot(df,'72')