In [854]:
# import packages
import pandas as pd
import numpy as np
from data_visualizations import plot_line, plot_scatter, plot_dumbbell


In [855]:
final_df = pd.read_csv('./../../data/processed/combined_primary_secondary.csv')
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6749 entries, 0 to 6748
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   country                 6749 non-null   object 
 1   category                6749 non-null   object 
 2   year                    6749 non-null   int64  
 3   import_value            6749 non-null   int64  
 4   export_value            6749 non-null   int64  
 5   mfn_by_us_simple_avg    5625 non-null   float64
 6   mfn_by_us_weighted_avg  5625 non-null   float64
 7   mfn_on_us_simple_avg    4875 non-null   float64
 8   mfn_on_us_weighted_avg  4875 non-null   float64
 9   gdp                     5490 non-null   float64
 10  gdp_2015_adj            5445 non-null   float64
dtypes: float64(6), int64(3), object(2)
memory usage: 580.1+ KB


### What are the overall exports and imports of the USA over the years?

Let’s explore how imports and exports have changed over time, identify which is growing faster, and analyze how the trade balance has evolved.

In [856]:
# aggregate import and export data to yearly level
yearly_df = final_df.groupby('year').agg({
    'import_value': 'sum',
    'export_value': 'sum'
}).reset_index()

# create a new column for trade balance
yearly_df['trade_balance'] = yearly_df['export_value'] - yearly_df['import_value']

# rename columns for better readability
yearly_df.rename(columns={
    'import_value': 'Import',
    'export_value': 'Export',
    'trade_balance': 'Trade Balance'
}, inplace=True)
yearly_df.head()


Unnamed: 0,year,Import,Export,Trade Balance
0,2008,228611885213,105121043873,-123490841340
1,2009,157179781039,71416597613,-85763183426
2,2010,224725910368,99876045626,-124849864742
3,2011,253735555230,117655029390,-136080525840
4,2012,296613580192,132435416780,-164178163412


In [857]:
# Plot the overall import and export values of USA over the years
# call the function from data_visualizations.py
fig = plot_line(
    df=yearly_df,
    x='year',
    y=['Import', 'Export', 'Trade Balance'],
    title='Annual Trade Trends of the USA: Imports, Exports, and Trade Balance',
    y_label='Value (USD)',
    x_label='Year',
    legend_label=None,
    markers=True
)
#
fig.show()

### What are the overall exports and imports of the USA over the years across different categories?

Let see how do these trends differ across various automotive categories? 
Specifically, which categories exhibit significant growth, 
and which demonstrate stable trade volumes over time?

In [858]:
# aggregate data to yearly level by category
yearly_df = final_df.groupby(['year', 'category']).agg({
    'import_value': 'sum',
    'export_value': 'sum'
}).reset_index()

# create a new column for trade balance
yearly_df['trade_balance'] = yearly_df['export_value'] - yearly_df['import_value']

# rename columns for better readability
yearly_df.rename(columns={
    'year':'Year',
    'import_value': 'Import',
    'export_value': 'Export',
    'trade_balance': 'Trade Balance'
}, inplace=True)

yearly_df.head(5)

Unnamed: 0,Year,category,Import,Export,Trade Balance
0,2008,Parts,90255306016,58084720471,-32170585545
1,2008,Passenger Vehicles,132547756053,41724479193,-90823276860
2,2008,Trucks,5808823144,5311844209,-496978935
3,2009,Parts,66004369220,43280003896,-22724365324
4,2009,Passenger Vehicles,86399575025,24947920330,-61451654695


In [859]:
# Plot the overall import and export values of USA over the years for each category
fig = plot_line(
    df=yearly_df,
    x='Year',
    y=['Import', 'Export', 'Trade Balance'],
    title='USA Trade Trends by Category: Imports, Exports, and Trade Balance',
    y_label='Value in USD',
    x_label='Year',
    markers=True,
    # facet by category
    facet_col='category'
)

# simplify facet titles by removing "category="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# show the plot
fig.show()


### US exports and imports by country, 2024
This section analyzes the import and export dynamics between the US and its top trading partners in 2024.

In [860]:
# Find top countries and plot import and export for each country

In [861]:
# find top 20 countries by export value
df = final_df.copy()
df = df.groupby('country').agg({
    'import_value': 'sum',
    'export_value': 'sum'
}).reset_index()

df = df.sort_values(by='export_value', ascending=False).head(20)
l1 = df["country"].tolist()


# find top 20 countries by import value
df = final_df.copy()
df = df.groupby('country').agg({
    'import_value': 'sum',
    'export_value': 'sum'
}).reset_index()
df = df.sort_values(by='import_value', ascending=False).head(20)
l2 = df["country"].tolist()

top_countries = list(set(l1) or set(l2))
top_countries

['Canada',
 'Australia',
 'Kuwait',
 'Japan',
 'Italy',
 'Netherlands',
 'Russia',
 'United Kingdom',
 'Thailand',
 'South Korea',
 'Saudi Arabia',
 'China',
 'Chile',
 'Belgium',
 'France',
 'United Arab Emirates',
 'South Africa',
 'Mexico',
 'Germany',
 'Brazil']

In [862]:
# select data for top countries and year 2024
top_countries_df = final_df[(final_df['country'].isin(top_countries)) & (final_df['year'] == 2024)]

# aggregate import and export data for top countries
top_countries_df = top_countries_df.groupby('country').agg({
    'import_value': 'sum',
    'export_value': 'sum'
}).reset_index().sort_values(by='import_value', ascending=True)

# rename columns for better readability
top_countries_df.rename(columns={
    'import_value': 'Import',
    'export_value': 'Export',
    'country':'Country'
}, inplace=True)

top_countries_df

Unnamed: 0,Country,Import,Export
10,Kuwait,223921,551153894
13,Russia,761163,347364
14,Saudi Arabia,13868785,2059661421
18,United Arab Emirates,125485439,2849651639
0,Australia,131695992,4334078256
12,Netherlands,429742968,624184188
4,Chile,438237989,788339822
2,Brazil,1134558068,1690777381
1,Belgium,1409561680,2227578571
6,France,1483559185,549410104


In [863]:
#  Plot dumbbell plot for top countries
fig = plot_dumbbell(
    df=top_countries_df,
    y='Country',
    x1='Export',
    x2='Import',
    x_label="Value (USD)",
    title="Top Trade Partners of the USA: Exports and Imports (2024)",
)

# increase height
fig.update_layout(
    height=1000
)

# set axes to log scale
# fig.update_xaxes(type="log")
# fig.update_yaxes(type="log")

fig.show()

### Exploring GDP and Exports of Top Trading Partners

Let’s examine how the GDP of the US’s top trading partners relates to their export volumes.

In [864]:
# Drop rows where gdp or mfn has NaN values
final_df_cleaned = final_df.dropna(subset=['mfn_by_us_simple_avg',
                                    'mfn_by_us_weighted_avg',
                                    'mfn_on_us_simple_avg',
                                    'mfn_on_us_weighted_avg',
                                    'gdp',
                                    'gdp_2015_adj'], 
                                how='any')

In [865]:
# select data corresponding to top partners only
top_countries_df = final_df_cleaned[(final_df_cleaned['country'].isin(top_countries)) ] #& (final_df_cleaned['year']==2022) ]

#rename columns for better readability
top_countries_df.rename(columns={
    'import_value': 'Import',
    'export_value': 'Export',
    'gdp': 'Nominal GDP',
    'gdp_2015_adj': 'Real GDP',
    'country':'Country'
}, inplace=True)
top_countries_df.head(5)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Country,category,year,Import,Export,mfn_by_us_simple_avg,mfn_by_us_weighted_avg,mfn_on_us_simple_avg,mfn_on_us_weighted_avg,Nominal GDP,Real GDP
221,Australia,Parts,2008,159838635,958541210,3.64,2.41,4.61,3.01,1056112000000.0,1133349000000.0
222,Australia,Passenger Vehicles,2008,863863012,564502516,3.64,2.41,4.61,3.01,1056112000000.0,1133349000000.0
223,Australia,Trucks,2008,181980,214093395,3.64,2.41,4.61,3.01,1056112000000.0,1133349000000.0
224,Australia,Parts,2009,101221703,714997863,3.46,2.95,3.85,2.93,928762100000.0,1154799000000.0
225,Australia,Passenger Vehicles,2009,115182054,271049394,3.46,2.95,3.85,2.93,928762100000.0,1154799000000.0


In [866]:
# check nulls 
top_countries_df[top_countries_df.isnull().any(axis=1)]['Country'].unique()

array([], dtype=object)

In [867]:
# Correlation analysis
fig = plot_scatter(
    df=top_countries_df,
    x='Real GDP',
    y='Export',
    facet_col='category',
    color='Country',
    title='US Exports and Partner GDP: Top Trading Countries',
)

# set axes to log scale
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")

# simplify facet titles by removing "category="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

fig.update_layout(
    margin=dict(t=100)  # increases top margin (pixels)
)
fig.show()

In [868]:
for cat in top_countries_df['category'].unique():
    subset = top_countries_df[top_countries_df['category'] == cat]
    corr = subset['Real GDP'].corr(subset['Export'])
    print(f"Correlation for {cat}: {corr:.2f}")

Correlation for Parts: -0.01
Correlation for Passenger Vehicles: 0.21
Correlation for Trucks: -0.05


### Comparing GDP and Exports: USMCA vs. Other Countries

Let’s explore how GDP and export values differ between USMCA members and other trading partners.

In [869]:
# make a copy of original cleaned df
df = final_df_cleaned.copy()

#rename columns for better readability
df.rename(columns={
    'import_value': 'Import',
    'export_value': 'Export',
    'gdp': 'Nominal GDP',
    'gdp_2015_adj': 'Real GDP',
    'country':'Country'
}, inplace=True)

#find max GDP
# Assuming df is your full dataset including all years
min_x = df["Real GDP"].min() * 0.9  # add some padding below min
max_x = df["Real GDP"].max() * 1.1  # add padding above max
min_y = df["Export"].min() * 0.9  # add some padding below min
max_y = df["Export"].max() * 1.1  # add padding above max

# Add a column to identify USMCA members
df["Trade bloc"] = df["Country"].apply(lambda x: "USMCA" if x in ["USA", "Mexico", "Canada"] else "Other")

# Create scatter plot
fig = plot_scatter(
    df=df,
    x='Real GDP',
    y='Export',
    color="Trade bloc",
    symbol="Trade bloc",
    facet_col='category',
    animation_frame="year",
    # range_x=[np.log(min_x), np.log(max_x)],
    # range_y=[np.log(min_y), np.log(max_y)],
    # range_x=[min_x, max_x],
    # range_y=[min_y, max_y],
    title='US Exports and Partner GDP with USMCA Highlighted'
)

# # set axes to log scale
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")

# simplify facet titles by removing "category="
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# plot
fig.show()