In [1]:
#Set the Pyspark environment  variables
import os, findspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import GBTRegressor  # or RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator



os.environ['SPARK_HOME'] = "/Users/shrutimac/documents/Apps/spark"
findspark.init(os.environ["SPARK_HOME"])
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_DRIVER_PYTHON_OPTS"] = "lab"
os.environ["PYSPARK_PYTHON"] = "python"

In [2]:
spark = (
    SparkSession.builder
    .appName("TariffsTradeModel")
    .config("spark.driver.memory", "6g")      # or "4g"
    .config("spark.executor.memory", "6g")   # often same as driver in local mode
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/25 22:05:42 WARN Utils: Your hostname, Shrutis-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.239 instead (on interface en0)
25/11/25 22:05:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/25 22:05:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
base_path = "/Users/shrutimac/Documents/big data/Final Project/Data Processed"

data_path  = f"{base_path}/df_final.csv"  

df_final = spark.read.csv(data_path, header=True, inferSchema=True)

                                                                                

In [4]:
# Extract HS2 From ProductCode
df_final = df_final.withColumn(
    "HS2",
    F.col("ProductCode").cast("string").substr(1, 2)
)

In [5]:
sector_mapping = {

    # --- Agriculture, Food, Animals, Plants (HS 01–24) ---
    "01": "Agriculture",
    "02": "Agriculture",
    "03": "Agriculture",
    "04": "Agriculture",
    "05": "Agriculture",
    "06": "Agriculture",
    "07": "Agriculture",
    "08": "Agriculture",
    "09": "Agriculture",
    "10": "Agriculture",
    "11": "Agriculture",
    "12": "Agriculture",
    "13": "Agriculture",
    "14": "Agriculture",
    "15": "Agriculture",
    "16": "Agriculture",
    "17": "Agriculture",
    "18": "Agriculture",
    "19": "Agriculture",
    "20": "Agriculture",
    "21": "Agriculture",
    "22": "Agriculture",
    "23": "Agriculture",
    "24": "Agriculture",

    # --- Minerals & Chemicals (HS 25–40) ---
    "25": "Minerals",
    "26": "Minerals",
    "27": "Mineral Fuels",     # Important category

    "28": "Chemicals",
    "29": "Chemicals",
    "30": "Chemicals",
    "31": "Chemicals",
    "32": "Chemicals",
    "33": "Chemicals",
    "34": "Chemicals",
    "35": "Chemicals",
    "36": "Chemicals",
    "37": "Chemicals",
    "38": "Chemicals",

    "39": "Plastics & Rubber",
    "40": "Plastics & Rubber",

    # --- Leather, Wood, Paper (HS 41–49) ---
    "41": "Leather",
    "42": "Leather",
    "43": "Leather",

    "44": "Wood",
    "45": "Wood",
    "46": "Wood",

    "47": "Paper",
    "48": "Paper",
    "49": "Paper",

    # --- Textiles & Apparel (HS 50–63) ---
    "50": "Textiles",
    "51": "Textiles",
    "52": "Textiles",
    "53": "Textiles",
    "54": "Textiles",
    "55": "Textiles",
    "56": "Textiles",
    "57": "Textiles",
    "58": "Textiles",
    "59": "Textiles",
    "60": "Textiles",
    "61": "Textiles",
    "62": "Textiles",
    "63": "Textiles",

    # --- Footwear & Headgear (HS 64–67) ---
    "64": "Footwear",
    "65": "Footwear",
    "66": "Footwear",
    "67": "Footwear",

    # --- Stone, Glass, Metals (HS 68–83) ---
    "68": "Metals & Machinery",
    "69": "Metals & Machinery",

    "70": "Metals & Machinery",
    "71": "Metals & Machinery",
    "72": "Metals & Machinery",
    "73": "Metals & Machinery",
    "74": "Metals & Machinery",
    "75": "Metals & Machinery",
    "76": "Metals & Machinery",
    "77": "Metals & Machinery",      # Reserved code
    "78": "Metals & Machinery",
    "79": "Metals & Machinery",
    "80": "Metals & Machinery",
    "81": "Metals & Machinery",
    "82": "Metals & Machinery",
    "83": "Metals & Machinery",

    # --- Machinery & Electronics (HS 84–85) ---
    "84": "Electronics & Machinery",
    "85": "Electronics & Machinery",

    # --- Transport Equipment (HS 86–89) ---
    "86": "Transport Equipment",
    "87": "Vehicles",
    "88": "Vehicles",
    "89": "Vehicles",

    # --- Miscellaneous (HS 90–99) ---
    "90": "Precision Instruments",
    "91": "Precision Instruments",
    "92": "Precision Instruments",

    "93": "Arms",
    "94": "Miscellaneous Manufacturing",
    "95": "Miscellaneous Manufacturing",
    "96": "Miscellaneous Manufacturing",
    "97": "Art & Antiques",

    "98": "Special Goods",
    "99": "Special Goods"
}

# Convert Mapping into Spark Expression
from pyspark.sql.functions import create_map, lit

mapping_expr = create_map(
    [lit(x) for pair in sector_mapping.items() for x in pair]
)

df_final = df_final.withColumn(
    "Sector",
    mapping_expr[F.col("HS2")]
)

## 1. Feature engineering on df_final
1.1 Define a window for lags (by country pair, sector & flow)

In [6]:
w = (
    Window
    .partitionBy("ReporterName", "PartnerName", "Sector", "TradeFlowName")
    .orderBy("Year")
)


In [7]:
# 1.2 Add lag + moving-average trade features
df_ml = (
    df_final
    # 1-year and 2-year lag of trade value
    .withColumn("lag1_trade", F.lag("TradeValueKUSD", 1).over(w))
    .withColumn("lag2_trade", F.lag("TradeValueKUSD", 2).over(w))
    # 3-year moving average of trade
    .withColumn(
        "ma3_trade",
        F.avg("TradeValueKUSD").over(w.rowsBetween(-2, 0))
    )
)


In [8]:
#1.3 Tariff change / shock features
# We’ll use EffectiveTariff as the main tariff measure

df_ml = (
    df_ml
    # previous year's tariff
    .withColumn("lag1_tariff", F.lag("EffectiveTariff", 1).over(w))
    # change in tariff (percentage points)
    .withColumn("tariff_change", F.col("EffectiveTariff") - F.col("lag1_tariff"))
    .withColumn("tariff_abs_change", F.abs(F.col("tariff_change")))
    # simple "shock" flag: change > 2 percentage points
    .withColumn(
        "tariff_shock_flag",
        (F.col("tariff_abs_change") > 2.0).cast("int")
    )
)


In [9]:
# 1.4 Filter rows with enough history & non-null target
# For a time-based model we need lag1_trade not null.
df_ml = (
    df_ml
    .filter(F.col("lag1_trade").isNotNull())
    .filter(F.col("TradeValueKUSD").isNotNull())
)


In [10]:
# If we want the model to only learn where tariffs are known:
# df_ml = df_ml.filter(F.col("EffectiveTariff").isNotNull())


In [11]:
# 1.5 Choose numeric & categorical features
numeric_features = [
    "lag1_trade",
    "lag2_trade",
    "ma3_trade",
    "EffectiveTariff",
    "Tariff_SimpleAvg",
    "Tariff_WeightedAvg",
    "Tariff_MinRate",
    "Tariff_MaxRate",
    "Tariff_ImportsKUSD",
    "tariff_change",
    "tariff_abs_change",
    "tariff_shock_flag",
    "Year"
]

categorical_features = [
    "ReporterName",
    "PartnerName",
    "Sector",
    "TradeFlowName",
    "Tariff_DutyType",
    "HS2",
]


In [12]:
# 1.6 Handle missing numeric values (simple strategy)
# You can refine this later; median or 0 is fine for a first pass.
# Fill missing numeric values with 0 (or use df_ml.na.fill with dict)
df_ml = df_ml.fillna(0, subset=numeric_features)
  

## 2. Build the PySpark ML pipeline
2.1 StringIndexers for categoricals

In [13]:
indexers = [
    StringIndexer(
        inputCol=col,
        outputCol=f"{col}_idx",
        handleInvalid="keep"
    )
    for col in categorical_features
]


In [14]:
# 2.2 OneHotEncoder
encoder = OneHotEncoder(
    inputCols=[f"{c}_idx" for c in categorical_features],
    outputCols=[f"{c}_oh" for c in categorical_features]
)


In [15]:
# 2.3 VectorAssembler for all features
assembler_inputs = numeric_features + [f"{c}_oh" for c in categorical_features]

assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features"
)


In [16]:
# 2.4 Regressor (Gradient Boosted Trees)
gbt = GBTRegressor(
    labelCol="TradeValueKUSD",
    featuresCol="features",
    maxDepth=6,
    maxIter=60,
    stepSize=0.1,
    subsamplingRate=0.8,
    seed=42
)


If training is slow, we will reduce maxIter or use RandomForestRegressor instead.

In [17]:
# 2.5 Assemble the pipeline
pipeline = Pipeline(
    stages=indexers + [encoder, assembler, gbt]
)


## 3. Train / test split & model training
3.1 Split data

In [18]:
from pyspark import StorageLevel
train_df, test_df = df_ml.randomSplit([0.8, 0.2], seed=42)


In [19]:
# 3.2 Fit the model
pipeline_model = pipeline.fit(train_df)


25/11/25 22:06:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/25 22:06:25 WARN MemoryStore: Not enough space to cache rdd_136_1 in memory! (computed 167.0 MiB so far)
25/11/25 22:06:25 WARN BlockManager: Persisting block rdd_136_1 to disk instead.
25/11/25 22:06:25 WARN MemoryStore: Not enough space to cache rdd_136_2 in memory! (computed 167.0 MiB so far)
25/11/25 22:06:25 WARN MemoryStore: Not enough space to cache rdd_136_6 in memory! (computed 167.0 MiB so far)
25/11/25 22:06:25 WARN BlockManager: Persisting block rdd_136_6 to disk instead.
25/11/25 22:06:25 WARN BlockManager: Persisting block rdd_136_2 to disk instead.
25/11/25 22:06:25 WARN MemoryStore: Not enough space to cache rdd_136_7 in memory! (computed 167.0 MiB so far)
25/11/25 22:06:25 WARN BlockManager: Persisting block rdd_136_7 to disk instead.
25/11/25 22:06:25 WARN MemoryStore: Not eno

In [21]:
predictions = pipeline_model.transform(test_df)

evaluator_rmse = RegressionEvaluator(
    labelCol="TradeValueKUSD",
    predictionCol="prediction",
    metricName="rmse"
)

evaluator_r2 = RegressionEvaluator(
    labelCol="TradeValueKUSD",
    predictionCol="prediction",
    metricName="r2"
)

rmse = evaluator_rmse.evaluate(predictions)
r2 = evaluator_r2.evaluate(predictions)

print(f"Test RMSE: {rmse:,.2f}")
print(f"Test R²:   {r2:.3f}")


25/11/25 22:11:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
[Stage 1157:>                                                     (0 + 10) / 11]

Test RMSE: 406,579.90
Test R²:   0.291


                                                                                

Given that trade values range from 800M to 5B USD, the model captures around 30% of variation, which is expected because trade flows are heavily driven by macroeconomic shocks, global supply-demand cycles, and geopolitical factors not included in the dataset.
This result reinforces our research finding that tariffs alone do not strongly determine trade values, and the relationship is weak to moderate across sectors.

## Baseline model (just lag1_trade) to show how much better the ML model is

Definition:

For each (Reporter, Partner, Sector) group:

Next year’s trade value ≈ last year’s trade value

This is the simplest possible forecasting method if your ML model is better than this, then your model is adding value.

In [22]:
# Create the lagged feature in PySpark
# Window by Reporter, Partner, Sector, ordered by Year
w = Window.partitionBy("ReporterName", "PartnerName", "Sector").orderBy("Year")

df_lag = df_final.withColumn("PrevTradeValue", F.lag("TradeValueKUSD", 1).over(w))


In [25]:
# Remove rows where lag is null
df_lag = df_lag.filter(F.col("PrevTradeValue").isNotNull())


In [26]:
# Train/test split for baseline
train_base, test_base = df_lag.randomSplit([0.8, 0.2], seed=42)


In [27]:
# Compute baseline RMSE and R²
# Prediction = previous year’s trade value.

# Create predictions manually
test_pred = test_base.withColumn("baseline_prediction", F.col("PrevTradeValue"))

# Evaluator
evaluator_rmse = RegressionEvaluator(labelCol="TradeValueKUSD",
                                     predictionCol="baseline_prediction",
                                     metricName="rmse")

evaluator_r2 = RegressionEvaluator(labelCol="TradeValueKUSD",
                                   predictionCol="baseline_prediction",
                                   metricName="r2")

rmse_base = evaluator_rmse.evaluate(test_pred)
r2_base = evaluator_r2.evaluate(test_pred)

print("Baseline RMSE:", rmse_base)
print("Baseline R²:", r2_base)


[Stage 1165:>                                                     (0 + 10) / 11]

Baseline RMSE: 584374.6659657594
Baseline R²: -0.6029011873333598


                                                                                

In [29]:
# Comparison
print("===== Performance Comparison =====")
print(f"Baseline Model RMSE: {rmse_base:,.2f}")
print(f"Baseline Model R²:   {r2_base:.3f}")

print(f"\nML Model RMSE:       {rmse:,.2f}")
print(f"ML Model R²:         {r2:.3f}")


===== Performance Comparison =====
Baseline Model RMSE: 584,374.67
Baseline Model R²:   -0.603

ML Model RMSE:       406,579.90
ML Model R²:         0.291
