# HS Code Classification - Getting Started

**Master's Thesis Project**  
Student: Carlos Leon  
Supervisor: Oliver Staubli  

This notebook will help you get started with the project and understand the data.

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot appearance: seaborn's 'whitegrid' style first, then a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Printed last so it only appears when the imports and plot settings ran without error.
print("Setup complete!")

## 2. Load Processed Data

First, let's load the preprocessed data to understand what we're working with.

In [None]:
# Read the cleaned WCO HS description table.
# 'hs6' is forced to str so the codes are kept as text rather than parsed as numbers.
df_hs = pd.read_csv(
    '../data/processed/wco_hs_descriptions_clean.csv',
    dtype={'hs6': str},
)

# Quick size summary: row count, distinct HS6 codes, distinct chapters.
hs_row_count = len(df_hs)
hs_code_count = df_hs['hs6'].nunique()
hs_chapter_count = df_hs['chapter'].nunique()

print(f"Total HS descriptions: {hs_row_count}")
print(f"Unique HS6 codes: {hs_code_count}")
print(f"Unique chapters: {hs_chapter_count}")

# Last expression of the cell: rich preview of the first rows.
df_hs.head()

In [None]:
# Read the cleaned WTO tariff table; 'hs6' is kept as text, matching how df_hs is loaded.
df_tariffs = pd.read_csv(
    '../data/processed/wto_tariffs_clean.csv',
    dtype={'hs6': str},
)

# Coverage summary: record count, reporting countries, and the years present (ascending).
tariff_row_count = len(df_tariffs)
tariff_reporters = df_tariffs['reporter_name'].unique()
tariff_years = sorted(df_tariffs['year'].unique())

print(f"\nTotal tariff records: {tariff_row_count}")
print(f"Countries: {tariff_reporters}")
print(f"Years: {tariff_years}")

# Last expression of the cell: rich preview of the first rows.
df_tariffs.head()

## 3. Exploratory Data Analysis

Let's visualize the distribution of data across HS chapters.

In [None]:
# Distribution of HS codes by chapter.
# Rows per chapter, ordered by chapter code so the bar chart reads left to right.
chapter_counts = df_hs['chapter'].value_counts().sort_index()

# Explicit figure/axes so every label below targets this chart.
fig, ax = plt.subplots(figsize=(15, 6))
chapter_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of HS Codes by Chapter', fontsize=14)
ax.set_xlabel('HS Chapter (2-digit)', fontsize=12)
ax.set_ylabel('Number of HS6 Codes', fontsize=12)
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

# chapter_counts is sorted by chapter code, so head(10) would list the first ten
# chapters rather than the most frequent ones; nlargest(10) ranks by count.
print(f"\nMost common chapters:")
print(chapter_counts.nlargest(10))

In [None]:
# Tariff rate distribution: overall histogram (left) and per-country box plot (right).
# Both axes are created up front so each plot can be directed to its own panel.
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))

df_tariffs['mfn_rate_percent'].hist(bins=50, ax=ax_hist)
ax_hist.set_title('Distribution of MFN Tariff Rates', fontsize=14)
ax_hist.set_xlabel('Tariff Rate (%)', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)

# ax= is required here: a grouped DataFrame.boxplot called without it opens a new
# figure, which would leave the right-hand panel of this figure empty.
df_tariffs.boxplot(column='mfn_rate_percent', by='reporter_name', ax=ax_box)
ax_box.set_title('Tariff Rates by Country', fontsize=14)
ax_box.set_xlabel('Country', fontsize=12)
ax_box.set_ylabel('MFN Rate (%)', fontsize=12)
fig.suptitle('')  # Remove the "Boxplot grouped by ..." title pandas adds to the figure
fig.tight_layout()
plt.show()

# Per-country summary statistics of the MFN rate (count, mean, std, quartiles, ...).
print(f"\nTariff statistics:")
print(df_tariffs.groupby('reporter_name')['mfn_rate_percent'].describe())

## 4. Sample HS Code Descriptions

Let's look at some example descriptions to understand the data quality.

In [None]:
# Sample descriptions from different chapters
sample_chapters = ['42', '61', '85', '09', '94']  # Different product categories

# 'chapter' is not listed in the dtype map used when df_hs was read, so pandas may have
# parsed it as an integer (9 instead of '09'). Comparing such a column against the string
# codes above would match nothing, so compare against zero-padded strings instead.
# NOTE(review): assumes chapter values are whole numbers or 2-digit strings - confirm against the CSV.
chapter_codes = df_hs['chapter'].astype(str).str.zfill(2)

for chapter in sample_chapters:
    chapter_desc = df_hs[chapter_codes == chapter].head(3)
    print(f"\n{'='*80}")
    print(f"Chapter {chapter}:")
    print(f"{'='*80}")
    if chapter_desc.empty:
        # Make a failed lookup visible instead of printing a bare header.
        print("  (no descriptions found for this chapter)")
    for _, row in chapter_desc.iterrows():
        # A missing description is read as NaN (a float), which cannot be sliced.
        description = '' if pd.isna(row['description']) else str(row['description'])
        print(f"  HS6: {row['hs6']}")
        print(f"  Description: {description[:150]}...")
        print()

## 5. Test Baseline Model

Let's test the baseline model with some example queries.

In [None]:
# Import baseline model
import sys

# Make the project's src/ directory importable. The membership check keeps sys.path
# from growing by one duplicate entry each time this cell is re-run.
if '../src' not in sys.path:
    sys.path.append('../src')

from models.baseline import BaselineHSClassifier

# Initialize classifier
classifier = BaselineHSClassifier(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    data_dir="../data/processed",
    model_dir="../models/baseline"
)

# Try to load existing index, otherwise build it.
# Catch Exception rather than using a bare except so that KeyboardInterrupt and
# SystemExit still stop the cell, and report why the load failed before rebuilding.
try:
    classifier.load_index()
    print("✓ Loaded existing index")
except Exception as exc:
    print(f"Could not load existing index ({type(exc).__name__}: {exc})")
    print("Building new index (this will take a few minutes)...")
    classifier.build_index()
    print("✓ Index built successfully")

# Load tariff data
classifier.load_tariff_data()
print("✓ Tariff data loaded")

In [None]:
# Free-text product descriptions used to probe the baseline classifier.
test_queries = [
    "leather handbag with shoulder strap",
    "men's cotton t-shirt, short sleeves",
    "smartphone with touchscreen and 5G",
    "roasted coffee beans, arabica",
    "wooden dining table for home use"
]

# Horizontal rule printed above and below each query header.
separator = '=' * 80

for product_query in test_queries:
    print(f"\n{separator}")
    print(f"Query: '{product_query}'")
    print(f"{separator}")

    # Top three candidate HS6 codes, with tariff columns for the European Union.
    top_hits = classifier.predict(product_query, reporter_name="European Union", top_k=3)

    for _, hit in top_hits.iterrows():
        print(f"\n  Rank {hit['rank']}: HS6 {hit['hs6']} | Similarity: {hit['similarity']:.4f}")
        print(f"  Description: {hit['description'][:100]}...")

        # The tariff rate is treated as optional; a missing value gets a placeholder line.
        rate = hit['mfn_rate_percent']
        if pd.isna(rate):
            print(f"  Tariff: Not available")
        else:
            print(f"  Tariff: {rate}% (EU, {hit['tariff_year']})")

## 6. Next Steps

Now that you have a working baseline:

1. **Generate Synthetic Data**: Create training examples for the hierarchical model
2. **Train Hierarchical Model**: Implement and train the custom neural network
3. **Evaluation**: Compare baseline vs hierarchical model
4. **Web Interface**: Test the Streamlit app
5. **Thesis Writing**: Document methodology and results

See `QUICKSTART.md` for detailed instructions on each step.

## 7. Save Results

Save some predictions for later analysis.

In [None]:
# Collect the top-5 predictions for every test query.
all_results = []

for query in test_queries:
    results = classifier.predict(query, reporter_name="European Union", top_k=5)
    # assign() returns a new frame with the extra 'query' column, so the DataFrame
    # handed back by predict() is not modified in place.
    all_results.append(results.assign(query=query))

# Combine into single DataFrame
df_results = pd.concat(all_results, ignore_index=True)

# Save to file. Create the target directory first: to_csv does not create missing
# parent directories and would otherwise fail when the folder does not exist.
output_file = '../models/baseline/test_predictions.csv'
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_file, index=False)
print(f"✓ Saved predictions to {output_file}")

df_results.head(10)