### EDA IOWA Dataset - Creating Similarity Categories

**Status:** PUBLIC Distribution <br>

**Author:** Jaume Manero IE<br>
**Date created:** 2025/02/12<br>
**Last modified:** 2025/02/12<br>
**Description:** We generate new categories by Similarity

We compute word embeddings for the existing category names and merge the most similar categories into a new SIM_CATEGORY, which is the union of the categories that are closest to each other.

In [2]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
warnings.filterwarnings('ignore')

In [3]:
# Path to the Iowa liquor sales CSV, relative to the current working directory
file = './data/Iowa_Liquor_Sales_DEC24.csv'
# Read the full CSV into a DataFrame; header=0 takes the column names from the first row
df = pd.read_csv(file, header=0)

In [4]:
# Inspect the dtype pandas inferred for each column
df.dtypes

Invoice/Item Number       object
Date                      object
Store Number               int64
Store Name                object
Address                   object
City                      object
Zip Code                  object
Store Location            object
County Number            float64
County                    object
Category                 float64
Category Name             object
Vendor Number            float64
Vendor Name               object
Item Number               object
Item Description          object
Pack                       int64
Bottle Volume (ml)         int64
State Bottle Cost        float64
State Bottle Retail      float64
Bottles Sold               int64
Sale (Dollars)           float64
Volume Sold (Liters)     float64
Volume Sold (Gallons)    float64
dtype: object

In [5]:
# Preview the raw 'Category Name' column (one label per sales row)
df['Category Name']

0              STRAIGHT BOURBON WHISKIES
1                     TENNESSEE WHISKIES
2                        AMERICAN VODKAS
3                        AMERICAN VODKAS
4                             SPICED RUM
                        ...             
30767342       STRAIGHT BOURBON WHISKIES
30767343              TENNESSEE WHISKIES
30767344    IMPORTED CORDIALS & LIQUEURS
30767345               CANADIAN WHISKIES
30767346               AMERICAN DRY GINS
Name: Category Name, Length: 30767347, dtype: object

In [24]:
# Build the table of distinct category names.
# drop_duplicates keeps the first occurrence of each label (so the original row
# index of that first occurrence is preserved); dropna then removes the single
# remaining missing-label row, if any.
categories = (
    df[['Category Name']]
    .drop_duplicates(subset=['Category Name'], keep='first')
    .dropna(subset=['Category Name'])
)

In [25]:
# Display the de-duplicated category table
categories

Unnamed: 0,Category Name
0,STRAIGHT BOURBON WHISKIES
1,TENNESSEE WHISKIES
2,AMERICAN VODKAS
4,SPICED RUM
7,IMPORTED CORDIALS & LIQUEURS
...,...
18452140,DELISTED ITEMS
19329022,DELISTED / SPECIAL ORDER ITEMS
22029522,IMPORTED WHISKIES
23383970,AMERICAN WHISKIES


In [141]:
import gensim.downloader as api
from scipy.spatial.distance import cosine

# Load pre-trained word embeddings through the gensim downloader API.
# NOTE(review): the model name suggests 300-dimensional fastText vectors trained on
# wiki + news text — confirm against the gensim-data catalogue.
model = api.load('fasttext-wiki-news-subwords-300')

In [122]:
# Show the names of every model the gensim downloader can provide
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [143]:
def get_average_vector(name):
    """Return the mean embedding of the words in ``name``.

    The name is lower-cased and split on whitespace. Every token present in
    the module-level ``model`` contributes its vector. A token that is *not*
    in ``model`` is split again on non-word characters (so ``"cocktails/rtd"``
    yields ``"cocktails"`` and ``"rtd"``) and any resulting pieces found in
    ``model`` are used instead of silently dropping the whole token.

    Args:
        name: Category name or free-text term to embed.

    Returns:
        The element-wise average of the matched word vectors, or ``None``
        when no token (or token piece) is present in ``model``.
    """
    import re  # local import so the cell does not depend on another cell's imports

    word_vectors = []
    for word in name.lower().split():
        if word in model:
            word_vectors.append(model[word])
            continue
        # Out-of-vocabulary token: fall back to its punctuation-separated pieces.
        for piece in re.split(r'\W+', word):
            if piece and piece in model:
                word_vectors.append(model[piece])

    if not word_vectors:
        return None
    return sum(word_vectors) / len(word_vectors)

In [144]:
# Plain Python list of the category names held in the `categories` table
cat_list = categories['Category Name'].tolist()

In [145]:
# Pair every category name with its averaged embedding.
# The vector is None when get_average_vector finds no known word in the name.
name_vector_list = [
    (category, get_average_vector(category)) for category in cat_list
]

In [146]:
# Convert names to vectors
#name_vectors = {name: get_average_vector(name) for name in cat_list if get_average_vector(name) is not None}

In [159]:
# Rank every category by its cosine similarity to a probe term.

# Probe term whose embedding is compared against each category name
Category_name = 'Canadian'
Category_vector = get_average_vector(Category_name)
if Category_vector is None:
    # Without a probe embedding no similarity can be computed; fail with a clear message
    raise ValueError(f"No embedding available for probe term {Category_name!r}")

# One (probe, category, similarity) tuple per category
sims = []
for name1, vec1 in name_vector_list:
    if vec1 is None:
        # Category has no embedding (none of its words were found in the model),
        # e.g. 'COCKTAILS/RTD'; score it as 0 instead of crashing on None.flatten()
        sim = 0
    else:
        # scipy's cosine() returns a distance, so 1 - distance gives the similarity
        sim = 1 - cosine(vec1.flatten(), Category_vector.flatten())
    sims.append((Category_name, name1, sim))

# Most similar categories first
sims_sorted = sorted(sims, key=lambda x: x[2], reverse=True)

for item in sims_sorted:
    print(f"'{item[0]}' and '{item[1]}': {item[2]:.2f}")

Canadian' and 'CANADIAN WHISKIES': 0.81
Canadian' and 'AMERICAN BRANDIES': 0.68
Canadian' and 'AMERICAN SCHNAPPS': 0.66
Canadian' and 'AMERICAN WHISKIES': 0.66
Canadian' and 'AMERICAN AMARETTO': 0.65
Canadian' and 'AMERICAN ALCOHOL': 0.63
Canadian' and 'AMERICAN VODKAS': 0.63
Canadian' and 'AMERICAN COCKTAILS': 0.62
Canadian' and 'AMERICAN DISTILLED SPIRITS SPECIALTY': 0.58
Canadian' and 'AMERICAN FLAVORED VODKA': 0.57
Canadian' and 'AMERICAN GRAPE BRANDIES': 0.56
Canadian' and 'JAPANESE WHISKY': 0.56
Canadian' and 'IRISH WHISKIES': 0.55
Canadian' and 'AMERICAN SLOE GINS': 0.53
Canadian' and 'HIGH PROOF BEER - AMERICAN': 0.50
Canadian' and 'AMERICAN DRY GINS': 0.50
Canadian' and 'TENNESSEE WHISKIES': 0.46
Canadian' and 'NEUTRAL GRAIN SPIRITS FLAVORED': 0.44
Canadian' and 'SINGLE BARREL BOURBON WHISKIES': 0.43
Canadian' and 'SINGLE MALT SCOTCH': 0.43
Canadian' and 'OTHER PROOF VODKA': 0.42
Canadian' and 'IOWA DISTILLERY WHISKIES': 0.42
Canadian' and 'WHITE RUM': 0.42
Canadian' and 'NEUT