In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
import string

# Load the HS17 product-code table, keyed by product code.
product_dict_df = pd.read_csv(
    'BACI_HS17_V202401/product_codes_HS17_V202401.csv'
).set_index('code')

# Lookup table: product code -> description text.
product_dict = product_dict_df['description'].to_dict()

In [2]:
dff = pd.read_csv('product_clusters.csv')

# Keep only the code and cluster columns, then attach the description looked
# up from product_dict; `assign` returns a new frame, so `dff` is untouched.
df = dff[['product', 'Cluster']].assign(
    product_name=lambda frame: frame['product'].map(product_dict)
)

### Preprocess the product names

In [3]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Tokens to drop: English stop words plus every single punctuation character.
stop_words = set(stopwords.words('english')).union(string.punctuation)

def preprocess(text):
    """Lower-case a product name and drop tokens found in ``stop_words``.

    The text is split on whitespace only, so punctuation attached to a word
    (e.g. ``"horses:"``) stays part of that token; only tokens that exactly
    match an entry of ``stop_words`` after lower-casing are removed.

    Args:
        text: The product name. Non-string values (``None``, or the NaN left
            behind when a product code has no description) are treated as
            an empty name instead of raising ``AttributeError``.

    Returns:
        The remaining lower-cased tokens joined by single spaces; ``''`` when
        nothing remains or when ``text`` is not a string.
    """
    # A missing name is a float NaN / None, which has no .split().
    if not isinstance(text, str):
        return ''
    lowered_tokens = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered_tokens if word not in stop_words)

# Clean every product name element-wise and preview the first rows.
df['cleaned_name'] = df['product_name'].map(preprocess)
df.head()

[nltk_data] Downloading package stopwords to /Users/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,product,Cluster,product_name,cleaned_name
0,10121,1,"Horses: live, pure-bred breeding animals","horses: live, pure-bred breeding animals"
1,10129,1,"Horses: live, other than pure-bred breeding an...","horses: live, pure-bred breeding animals"
2,10130,1,Asses: live,asses: live
3,10190,1,Mules and hinnies: live,mules hinnies: live
4,10221,1,"Cattle: live, pure-bred breeding animals","cattle: live, pure-bred breeding animals"


### Group by the product clusters

In [4]:
# One row per cluster, holding the list of cleaned names assigned to it.
names_by_cluster = df.groupby('Cluster')['cleaned_name']
clustered_products = names_by_cluster.apply(list)

# Persist the grouping (the lists are written as their string form).
clustered_products.to_csv('clustered_products.csv', header=True)

### Convert text to TF-IDF vectors (one vectorizer per cluster)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize_text(texts):
    """Fit a fresh ``TfidfVectorizer`` on ``texts`` and transform them.

    Args:
        texts: Iterable of documents passed straight to ``fit_transform``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the result of ``fit_transform`` and
        the fitted vectorizer that produced it.
    """
    tfidf = TfidfVectorizer()
    matrix = tfidf.fit_transform(texts)
    return matrix, tfidf

# Map each cluster label to its (tf-idf matrix, fitted vectorizer) pair;
# every cluster gets its own independently fitted vectorizer.
cluster_vectors = {
    cluster: vectorize_text(texts)
    for cluster, texts in clustered_products.items()
}

### Apply topic modelling for each cluster

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

def apply_lda(X, n_topics=5, random_state=42):
    """Fit a ``LatentDirichletAllocation`` model on ``X``.

    Args:
        X: Document-term matrix handed unchanged to ``fit``.
        n_topics: Number of topics, forwarded as ``n_components``.
        random_state: Seed forwarded to the estimator. It defaults to 42,
            the value that was previously hard-coded, so existing calls
            behave exactly as before.

    Returns:
        The fitted ``LatentDirichletAllocation`` instance.
    """
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(X)
    return lda

# Map each cluster label to its (fitted LDA model, vectorizer) pair; the
# vectorizer is carried along so topic terms can be looked up afterwards.
cluster_lda = {
    cluster: (apply_lda(X), vectorizer)
    for cluster, (X, vectorizer) in cluster_vectors.items()
}

### Interpret topics for each cluster

In [11]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Indexable that maps a column position to its term.
        no_top_words: Number of terms printed for each topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        # argsort is ascending: reverse it, then keep the leading positions.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in ranked_positions]
        print(" ".join(top_terms))

no_top_words = 5

# Walk the fitted models cluster by cluster and print each one's topics.
for cluster, (topic_model, fitted_vectorizer) in cluster_lda.items():
    print(f"\nCluster {cluster} Topics:")
    vocabulary = fitted_vectorizer.get_feature_names_out()
    display_topics(topic_model, vocabulary, no_top_words)


Cluster 0 Topics:
Topic 0:
seeds concentrates ores oil whether
Topic 1:
flowers cereals dried fresh ground
Topic 2:
fish 0303 fillets 0302 offal
Topic 3:
wood sawn planed jointed sanded
Topic 4:
fresh dried chilled tubers roots

Cluster 1 Topics:
Topic 0:
fish meat offal acids edible
Topic 1:
steel rolled iron metal alloy
Topic 2:
electric whether apparatus no heading
Topic 3:
fabrics woven no paper weight
Topic 4:
crocheted knitted fibres derivatives copper

Cluster 2 Topics:
Topic 0:
fillets excluding 0304 subheadings 91
Topic 1:
fillets excluding 0304 subheadings 91
Topic 2:
0303 fish yellowfin albacares frozen
Topic 3:
0303 fish bonito stripe skipjack
Topic 4:
0302 fish chilled fresh thunnus
