Trade Data

BLOCK 1 — Load Data + Inspect

In [None]:
# BLOCK 1 — Load Data from Google Drive (Shared Folder Shortcut)

from google.colab import drive
import pandas as pd

# Expose Google Drive under /content/drive so the shared project folder is readable.
drive.mount("/content/drive")

# Merged trade CSV; reachable once the shared-folder shortcut has been added to MyDrive.
csv_path = "/content/drive/MyDrive/Big Data Final Project/1 Updated: Final Trade and Tariff data - Shruti/merged_trade.csv"

# Read the whole file into one DataFrame (column dtypes are inferred by pandas).
df = pd.read_csv(filepath_or_buffer=csv_path)

# First look at the data: row count, the first five rows, then dtypes / non-null counts.
print("Initial Rows:", df.shape[0])
print(df.head(n=5))
df.info()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial Rows: 6191839
  Nomenclature ReporterISO3  ProductCode ReporterName PartnerISO3  \
0           H3          BRA         1212       Brazil         KOR   
1           H3          BRA         1212       Brazil         PHL   
2           H3          BRA         1212       Brazil         PHL   
3           H3          BRA         1212       Brazil         TUR   
4           H3          BRA         1213       Brazil         ARG   

   PartnerName  Year TradeFlowName  TradeFlowCode  TradeValue in 1000 USD  
0  Korea, Rep.  2008        Import              5                 122.581  
1  Philippines  2007        Import              5                 109.206  
2  Philippines  2008        Import              5                 331.613  
3       Turkey  2007        Import              5                   0.923  
4    Argentina  2007        Import              5     

BLOCK 2 — Basic Cleaning: Remove Duplicates

In [None]:
# Drop rows that are exact duplicates across every column (the first occurrence
# is kept) and report the row count on each side of the operation.
n_before = df.shape[0]
df = df.drop_duplicates(keep="first")
n_after = df.shape[0]

print("Rows before removing duplicates:", n_before)
print("Rows after removing duplicates:", n_after)

Rows before removing duplicates: 6191839
Rows after removing duplicates: 6145384


BLOCK 3 — Fix bad/missing ISO3 codes

In [None]:
# Lookup of bad ISO3 spellings -> corrected code. A value of None marks the
# entry as missing so it is picked up by the isna() counts printed below.
iso_fix = {
    "KOR": "KOR",   # identity entry kept as an example; changes nothing
    "": None,       # empty code -> missing
    "XXX": None,    # placeholder code -> missing
    "KOR.": "KOR",  # stray trailing dot
    "USA.": "USA",  # stray trailing dot
    "UKN": "GBR",   # example remap; only has an effect if UKN appears
}

# Both country-code columns get the same corrections.
iso_columns = ["ReporterISO3", "PartnerISO3"]
for iso_col in iso_columns:
    df[iso_col] = df[iso_col].replace(iso_fix)

# Count the codes that are missing after the corrections.
missing_codes = df[iso_columns].isna().sum()
print("Missing ReporterISO3:", missing_codes["ReporterISO3"])
print("Missing PartnerISO3:", missing_codes["PartnerISO3"])


Missing ReporterISO3: 0
Missing PartnerISO3: 0


BLOCK 4 — Convert datatypes

In [None]:
# Snapshot the dtypes so the effect of the conversion can be compared below.
dtypes_before = df.dtypes

# Year and ProductCode become nullable integers: values that cannot be parsed
# turn into <NA> (errors="coerce") instead of raising.
for int_col in ("Year", "ProductCode"):
    df[int_col] = pd.to_numeric(df[int_col], errors="coerce").astype("Int64")

# Trade value stays floating point; unparseable entries become NaN.
value_col = "TradeValue in 1000 USD"
df[value_col] = pd.to_numeric(df[value_col], errors="coerce")

dtypes_after = df.dtypes

print("Dtypes before:\n", dtypes_before)
print("\nDtypes after:\n", dtypes_after)


Dtypes before:
 Nomenclature               object
ReporterISO3               object
ProductCode                 int64
ReporterName               object
PartnerISO3                object
PartnerName                object
Year                        int64
TradeFlowName              object
TradeFlowCode               int64
TradeValue in 1000 USD    float64
dtype: object

Dtypes after:
 Nomenclature               object
ReporterISO3               object
ProductCode                 Int64
ReporterName               object
PartnerISO3                object
PartnerName                object
Year                        Int64
TradeFlowName              object
TradeFlowCode               int64
TradeValue in 1000 USD    float64
dtype: object


BLOCK 5 — Normalize Country Names

In [None]:
# Alternative spellings -> the single display name used from here on.
# Only exact, whole-value matches are rewritten.
country_fix = {
    "Korea, Rep.": "South Korea",
    "Republic of Korea": "South Korea",
    "USA": "United States",
    "U.S.A.": "United States",
    "United States of America": "United States",
}

# Reporter and partner names share the same normalisation.
name_columns = ["ReporterName", "PartnerName"]
for name_col in name_columns:
    df[name_col] = df[name_col].replace(country_fix)

# Spot-check the first rows of both columns.
print(df[name_columns].head())


  ReporterName  PartnerName
0       Brazil  South Korea
1       Brazil  Philippines
2       Brazil  Philippines
3       Brazil       Turkey
4       Brazil    Argentina


BLOCK 6 — Clean TradeFlowName

In [None]:
# Distribution of flow labels before any relabelling.
print("Before cleaning:\n", df["TradeFlowName"].value_counts())

# Every known variant is folded into one of the two canonical labels; note that
# re-imports / re-exports are counted as plain imports / exports after this.
flow_aliases = {
    "Import": ["Re-Import", "Reimport", "IM"],
    "Export": ["Re-Export", "Reexport", "EX"],
}
mapping = {
    alias: canonical
    for canonical, aliases in flow_aliases.items()
    for alias in aliases
}

# Only TradeFlowName is rewritten here; TradeFlowCode is left as loaded.
df["TradeFlowName"] = df["TradeFlowName"].replace(mapping)

# Distribution after relabelling, for comparison with the first printout.
print("\nAfter cleaning:\n", df["TradeFlowName"].value_counts())


Before cleaning:
 TradeFlowName
Export    3283386
Import    2861998
Name: count, dtype: int64

After cleaning:
 TradeFlowName
Export    3283386
Import    2861998
Name: count, dtype: int64


BLOCK 7 — HS Product Hierarchy Cleaning

In [None]:
before = len(df)

# HS codes always have an even number of digits (HS4 = 4, HS6 = 6), and the
# codes of chapters 01-09 start with a zero. ProductCode is stored as an
# integer at this point, so that leading zero is gone: "0101" is held as 101
# and "010110" as 10110. Counting the characters of the integer as-is would
# silently discard every product in chapters 01-09, so an odd digit count is
# rounded up to the even length the code had before the zero was stripped.
code_present = df["ProductCode"].notna()
digit_count = df["ProductCode"].astype(str).str.len()
hs_length = digit_count + digit_count % 2

# Keep only 4 or 6 digit product codes. Missing codes are excluded explicitly:
# their string form "<NA>" happens to be 4 characters long and would otherwise
# slip through a pure length check.
# NOTE(review): ProductCode itself is left as an integer; anything that needs
# the textual HS code (e.g. a join against zero-padded codes) still has to pad
# it back to 4 / 6 digits.
df = df[code_present & hs_length.isin([4, 6])]

after = len(df)
print("Rows before HS filtering:", before)
print("Rows after HS filtering:", after)


Rows before HS filtering: 6145384
Rows after HS filtering: 5950875


BLOCK 8 — Remove negative trade values

In [None]:
n_before = df.shape[0]

# ge(0) is False for negative values and also for NaN, so rows without a
# trade value are removed together with the negative ones.
non_negative = df["TradeValue in 1000 USD"].ge(0)
df = df.loc[non_negative]

n_after = df.shape[0]

print("Rows before removing negative trade:", n_before)
print("Rows after removing negative trade:", n_after)


Rows before removing negative trade: 5950875
Rows after removing negative trade: 5950875


BLOCK 9 — Remove trade where Reporter == Partner

In [None]:
n_before = df.shape[0]

# A row is self-trade when reporter and partner carry the same ISO3 code.
# Rows where either code is missing never compare equal and are therefore kept.
is_self_trade = df["ReporterISO3"].eq(df["PartnerISO3"])
df = df.loc[~is_self_trade]

n_after = df.shape[0]

print("Rows before removing self-trade:", n_before)
print("Rows after removing self-trade:", n_after)


Rows before removing self-trade: 5950875
Rows after removing self-trade: 5894119


BLOCK 10 — Final Cleaning Summary

In [None]:
# Row count after all cleaning steps, then summary statistics for every column
# (numeric and non-numeric alike) rendered as the cell's output.
print("Final number of rows:", df.shape[0])
df.describe(include="all")

Final number of rows: 5894119


Unnamed: 0,Nomenclature,ReporterISO3,ProductCode,ReporterName,PartnerISO3,PartnerName,Year,TradeFlowName,TradeFlowCode,TradeValue in 1000 USD
count,5894119,5894119,5894119.0,5894119,5894119,5894119,5894119.0,5894119,5894119.0,5894119.0
unique,2,12,,12,30,30,,2,,
top,H3,DEU,,Germany,NLD,Netherlands,,Export,,
freq,4466078,691560,,691560,247800,247800,,3182046,,
mean,,,5806.84969,,,,2015.546274,,5.539868,32258.0
std,,,2505.162223,,,,5.162169,,0.4984081,459466.7
min,,,1001.0,,,,2007.0,,5.0,0.0
25%,,,3214.0,,,,2011.0,,5.0,37.799
50%,,,6310.0,,,,2015.0,,6.0,527.861
75%,,,8445.0,,,,2020.0,,6.0,5445.06


BLOCK 11 — Save Final Cleaned File

In [None]:
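# Disabled: this would write the file to the current working directory.
# The cleaned trade data is written to Google Drive by the cell below instead.
# df.to_csv("merged_trade_cleaned.csv", index=False)
# print("Saved cleaned file: merged_trade_cleaned.csv")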


Save cleaned trade data to Google Drive

In [None]:
# Save the cleaned trade data as CSV on Google Drive.
# Note: the target is the "2 Cleaned data - Hrishik" folder, which is a different
# folder from the one the raw merged_trade.csv was read from.
# Presumably requires Drive to still be mounted at /content/drive — verify before running.

output_path = "/content/drive/MyDrive/Big Data Final Project/2 Cleaned data - Hrishik/merged_trade_cleaned.csv"

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(output_path, index=False)
print("Saved cleaned file to:", output_path)


Saved cleaned file to: /content/drive/MyDrive/Big Data Final Project/2 Cleaned data - Hrishik/merged_trade_cleaned.csv


Tariff Data

BLOCK 1 — Load Data + Inspect

In [None]:
# BLOCK 1 — Load Data from Google Drive (Shared Folder Shortcut)

from google.colab import drive
import pandas as pd

# Expose Google Drive under /content/drive so the shared project folder is readable.
drive.mount("/content/drive")

# Merged tariff CSV; reachable once the shared-folder shortcut has been added to MyDrive.
csv_path = "/content/drive/MyDrive/Big Data Final Project/1 Updated: Final Trade and Tariff data - Shruti/merged_tariff.csv"

# Read the whole file into one DataFrame. This rebinds `df`: from here on the
# name refers to the tariff data, not the trade data of the previous section.
df = pd.read_csv(filepath_or_buffer=csv_path)

# First look at the data: row count, the first five rows, then dtypes / non-null counts.
print("Initial Rows:", df.shape[0])
print(df.head(n=5))
df.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initial Rows: 4234540
  Selected Nomen Native Nomen  Reporter Reporter Name  Product  \
0             HS           H3        76        Brazil     8529   
1             HS           H3        76        Brazil     8547   
2             HS           H3        76        Brazil     2849   
3             HS           H3        76        Brazil     2914   
4             HS           H3        76        Brazil     8434   

                                        Product Name  Partner Partner Name  \
0  Parts suitable for use solely or principally with       32    Argentina   
1  Insulating fittings for electrical machines, a...       32    Argentina   
2       Carbides, whether or not chemically defined.       32    Argentina   
3  Ketones and quinones, whether or not with othe...       32    Argentina   
4              Milking machines and dairy machinery.       32 

BLOCK 2 — Convert Tariff Year & Trade Year to Integers

In [None]:
year_columns = ['Tariff Year', 'Trade Year']

# Snapshot the dtypes so the conversion can be compared below.
types_before = df[year_columns].dtypes

# Both year columns become nullable integers; unparseable values turn into <NA>.
for year_col in year_columns:
    df[year_col] = pd.to_numeric(df[year_col], errors='coerce').astype('Int64')

types_after = df[year_columns].dtypes

print("Before:\n", types_before)
print("\nAfter:\n", types_after)


Before:
 Tariff Year    int64
Trade Year     int64
dtype: object

After:
 Tariff Year    Int64
Trade Year     Int64
dtype: object


BLOCK 3 — Convert Tariff Values (Simple Avg / Weighted Avg / others)

In [None]:
# Rate / value columns that must be numeric for the validation steps that follow.
cols_to_float = [
    "Simple Average",
    "Weighted Average",
    "Standard Deviation",
    "Minimum Rate",
    "Maximum Rate",
    "Imports Value in 1000 USD",
    "Binding Coverage",
]

dtypes_before = df[cols_to_float].dtypes

# Convert every listed column in one pass; entries that cannot be parsed as a
# number become NaN (errors='coerce') rather than raising.
df[cols_to_float] = df[cols_to_float].apply(pd.to_numeric, errors='coerce')

dtypes_after = df[cols_to_float].dtypes

print("Before:\n", dtypes_before)
print("\nAfter:\n", dtypes_after)


Before:
 Simple Average               float64
Weighted Average             float64
Standard Deviation           float64
Minimum Rate                 float64
Maximum Rate                 float64
Imports Value in 1000 USD    float64
Binding Coverage             float64
dtype: object

After:
 Simple Average               float64
Weighted Average             float64
Standard Deviation           float64
Minimum Rate                 float64
Maximum Rate                 float64
Imports Value in 1000 USD    float64
Binding Coverage             float64
dtype: object


BLOCK 4 — Fix Product Codes (HS4 Only, Remove strange text)

In [None]:
before = len(df)

# Work on the text form of the code, without surrounding whitespace.
df["Product"] = df["Product"].astype(str).str.strip()

# HS4 headings in chapters 01-09 begin with a zero ("0101", "0901", ...). When
# the CSV column has been parsed as a number that zero is lost and the heading
# shows up as a 3-digit code, which the 4-digit check below would silently
# throw away. Put the zero back first; this does nothing when the codes
# already carry their leading zero.
# NOTE(review): assumes a purely numeric 3-digit value is always a
# zero-stripped HS4 heading and never some other code — confirm against the data.
lost_zero = df["Product"].str.isnumeric() & (df["Product"].str.len() == 3)
df.loc[lost_zero, "Product"] = "0" + df.loc[lost_zero, "Product"]

# keep only numeric codes that are HS4 = exactly 4 digits
is_hs4 = df["Product"].str.isnumeric() & (df["Product"].str.len() == 4)
df = df[is_hs4]

after = len(df)

print("Rows before HS4 filtering:", before)
print("Rows after HS4 filtering:", after)


Rows before HS4 filtering: 4234540
Rows after HS4 filtering: 4023440


BLOCK 5 — Normalize Reporter & Partner to ISO3

In [None]:
# Lookup of country names / abbreviations -> ISO3 code, applied to the
# Reporter and Partner columns by exact whole-value match.
# NOTE(review): the sample printed by this cell shows Reporter / Partner holding
# numeric codes (76, 32), not names or abbreviations, so none of the string keys
# below can match and the replace leaves both columns unchanged — they are NOT
# ISO3 after this cell. The country names live in "Reporter Name" /
# "Partner Name"; a numeric-code -> ISO3 lookup is presumably what is needed
# here. TODO confirm and replace this mapping.
iso_fix = {
    "USA": "USA",
    "U.S.A.": "USA",
    "United States": "USA",
    "Korea, Rep.": "KOR",
    "Republic of Korea": "KOR",
    "UK": "GBR",
    "United Kingdom": "GBR",
    "China": "CHN",
}

df["Reporter"] = df["Reporter"].replace(iso_fix)
df["Partner"]  = df["Partner"].replace(iso_fix)

# Spot-check the first rows and count missing codes in each column.
print(df[["Reporter", "Partner"]].head())
print("Missing Reporter codes:", df["Reporter"].isna().sum())
print("Missing Partner codes:", df["Partner"].isna().sum())


   Reporter  Partner
0        76       32
1        76       32
2        76       32
3        76       32
4        76       32
Missing Reporter codes: 0
Missing Partner codes: 0


BLOCK 6 — Remove Negative or Invalid Tariff Values

In [None]:
n_before = df.shape[0]

# A row is valid when both averages lie in the closed interval [0, 100].
# between() is inclusive on both ends and False for NaN, so rows missing either
# average are dropped as well.
# NOTE(review): rates above 100 are treated as invalid here — confirm that no
# legitimate tariff in the data exceeds 100.
simple_ok = df["Simple Average"].between(0, 100)
weighted_ok = df["Weighted Average"].between(0, 100)
valid_condition = simple_ok & weighted_ok

df = df[valid_condition]

n_after = df.shape[0]

print("Rows before tariff validation:", n_before)
print("Rows after tariff validation:", n_after)


Rows before tariff validation: 4023440
Rows after tariff validation: 3747936


BLOCK 7 — Keep Only MFN Applied / Main Tariff Source

In [None]:
n_before = df.shape[0]

# Keep rows whose DutyType contains "MFN" anywhere in the value, ignoring case.
# Rows with a missing DutyType count as non-matching (na=False) and are dropped.
is_mfn = df["DutyType"].str.contains("MFN", case=False, na=False)
df = df.loc[is_mfn]

n_after = df.shape[0]

print("Rows before MFN filter:", n_before)
print("Rows after MFN filter:", n_after)


Rows before MFN filter: 3747936
Rows after MFN filter: 1874056


BLOCK 8 — Drop Unneeded Columns

In [None]:
# Columns not carried into the cleaned output.
cols_to_drop = [
    "Standard Deviation",
    "Minimum Rate",
    "Maximum Rate",
    "Nbr of Total Lines",
    "Nbr of DomesticPeaks",
    "Nbr of InternationalPeaks",
]

# drop() returns a new frame, so `df` keeps all of its columns; any listed name
# that is not present in the data is skipped instead of raising (errors="ignore").
df_clean = df.drop(cols_to_drop, axis="columns", errors="ignore")

print("Final Columns:", list(df_clean.columns))
print("Final Rows:", df_clean.shape[0])


Final Columns: ['Selected Nomen', 'Native Nomen', 'Reporter', 'Reporter Name', 'Product', 'Product Name', 'Partner', 'Partner Name', 'Tariff Year', 'Trade Year', 'Trade Source', 'DutyType', 'Simple Average', 'Weighted Average', 'Imports Value in 1000 USD', 'Binding Coverage']
Final Rows: 1874056


BLOCK 9 — Summary

In [None]:
# Shape (rows, columns) of the cleaned tariff frame, then its first five rows.
final_shape = df_clean.shape
print("Final cleaned dataset shape:", final_shape)
print(df_clean.head(n=5))

Final cleaned dataset shape: (1874056, 16)
  Selected Nomen Native Nomen  Reporter Reporter Name Product  \
2             HS           H3        76        Brazil    2849   
3             HS           H3        76        Brazil    2914   
5             HS           H3        76        Brazil    7312   
7             HS           H3        76        Brazil    8512   
8             HS           H3        76        Brazil    2811   

                                        Product Name  Partner Partner Name  \
2       Carbides, whether or not chemically defined.       32    Argentina   
3  Ketones and quinones, whether or not with othe...       32    Argentina   
5  Stranded wire, ropes, cables, plaited bands, s...       32    Argentina   
7  Electrical lighting or signalling equipment (e...       32    Argentina   
8  Other inorganic acids and other inorganic oxyg...       36    Australia   

   Tariff Year  Trade Year Trade Source DutyType  Simple Average  \
2         2009        2009   

BLOCK 10 — Save Final Clean CSV

In [None]:
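# Disabled: this would write the file to the current working directory.
# The cleaned tariff data is written to Google Drive by the cell below instead.
# df_clean.to_csv("merged_tariff_cleaned.csv", index=False)
# print("Saved cleaned file as merged_tariff_cleaned.csv")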

Save cleaned tariff data to Google Drive

In [None]:
# Save the cleaned tariff data as CSV in the project's
# "2 Cleaned data - Hrishik" folder on Google Drive.

output_path = "/content/drive/MyDrive/Big Data Final Project/2 Cleaned data - Hrishik/merged_tariff_cleaned.csv"

# Write df_clean — the frame produced in BLOCK 8 with the unneeded columns
# removed — rather than df, which still carries those columns.
# index=False keeps the DataFrame's row index out of the file.
df_clean.to_csv(output_path, index=False)
print("Saved cleaned file to:", output_path)


Saved cleaned file to: /content/drive/MyDrive/Big Data Final Project/2 Cleaned data - Hrishik/merged_tariff_cleaned.csv
