In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/Akash_Job_Project/_TeamFrog_submission/TeamFrog_nb_for_submission

Mounted at /content/drive/
/content/drive/MyDrive/Akash_Job_Project/_TeamFrog_submission/TeamFrog_nb_for_submission


# 1. Load packages and define functions

In [3]:
import pandas as pd
import numpy as np
import re
from metadata_cleaning import metadata_cleaning

!pip install k_means_constrained
from k_means_constrained import KMeansConstrained
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

def evaluation(y_true, y_pred):
    print('\n\nMetric results:')
    print(f'MSE: {mean_squared_error(y_true, y_pred)}')
    print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
    print(f'RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}')
    print(f'R2: {r2_score(y_true, y_pred)}', end='\n\n')

def noun_chunking(titles):
    import spacy
    nlp = spacy.load("en_core_web_sm")

    """
    Takes a list of titles and returns a nested list of noun chunks for each title
    """
    m = []
    for text in titles:
        text = str(text).strip()
        text = re.sub(r"(\(|\)|\~|\||\,|\#)", "", text)
        text = re.sub(r'\&\w+\;', " ", text)
        text = re.sub(r"(\s{2,})", " ", text)
        doc = nlp(text)
        l = []
        for c in doc.noun_chunks:
            if c.root.pos_ != "PRON":
                l.append(str(c).lower())
        l = list(set(l))
        m.append(l)
    return m

Collecting k_means_constrained
  Downloading k_means_constrained-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ortools>=9.4.1874 (from k_means_constrained)
  Downloading ortools-9.7.2996-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.1/21.1 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf>=4.23.3 (from ortools>=9.4.1874->k_means_constrained)
  Downloading protobuf-4.24.3-cp37-abi3-manylinux2014_x86_64.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.6/311.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf, ortools, k_means_constrained
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3

# 2. Data overview

In [None]:
# !wget https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_Home_and_Kitchen.json.gz --no-check-certificate
filename = "meta_Home_and_Kitchen.json.gz"
data = metadata_cleaning(filename)

print("* N observations:", len(data))
print("* Data shape:", data.shape)
print("* Columns:", data.columns)
data.head(5)

* N observations: 594301
* Data shape: (594301, 9)
* Columns: Index(['category', 'title', 'brand', 'main_cat', 'price', 'asin', 'N_images',
       'HighResImg', 'N_description'],
      dtype='object')


Unnamed: 0,category,title,brand,main_cat,price,asin,N_images,HighResImg,N_description
0,"[Home & Kitchen, Kitchen & Dining, Dining & En...",You Are Special Today Red Plate [With Red Pen],,Amazon Home,37.0,1487795,0,1,33
1,"[Home & Kitchen, Home Dcor, Candles & Holders,...",Vicks Inhaler Relief for Cold Sinus Nasal Cong...,,Amazon Home,4.05,2020300,0,1,17
2,"[Home & Kitchen, Kitchen & Dining, Dining & En...",Artistic Churchware Communion Cup Filler: RW525,,Amazon Home,12.48,6564224,0,1,6
3,"[Home & Kitchen, Bath, Bathroom Accessories]",4 BARS! Mysore Sandal Soap 70grams FAST SHIPPING,,Amazon Home,22.0,9046461,3,1,98
4,"[Home & Kitchen, Home Dcor, Home Fragrance, In...",AROGYA VATI (40gm) by popeye seller,,Amazon Home,5.1,234937912,0,1,32


* We restrict our sample to **Tools & Home Improvement** sub-category within the Home and Kitchen category.

In [None]:
# categories
print(data['main_cat'].value_counts()[data['main_cat'].value_counts() > 5000])

# data.loc[data.loc[:, 'main_cat'] == "Tools & Home Improvement",:].\
    # to_csv("Home_Kitchen_title_price.csv", index=False)
# del data

NameError: ignored

# 3. Title similarity

### Titles into noun chunks, and then into one-hot-vectors
1. We converted each title into a list of noun chunks
    - using an NLP tool `spacy` after having cleaned HTML special characters and unnecesary characters
2. The list of noun chunks are one-hot encoded into a dataframe with each title (or each listed product) as a row and the noun chunks as columns
    - We used `MultiLabelBinarizer` for this.
    - The resulting data is of shape `(11062, 18185)`=(the number of products, the number of unique noun chunks)

### *[TODO]* why noun chunks? why multilabelbinarizer? how to reduce the noun chunks?

In [None]:
d = pd.read_csv("Home_Kitchen_title_price.csv")
titles = list(d[['title']].squeeze())
m = noun_chunking(titles)

mlb = MultiLabelBinarizer()

m_series = pd.Series(m)
res = pd.DataFrame(mlb.fit_transform(m_series),
                   columns=mlb.classes_,
                   index=m_series.index)
print(res.shape)
res.head(5)

(11062, 17959)


Unnamed: 0,2 pack w10187748,a perfect couple - classic black apron - rhinestone - poly blend,abc products,baha'i way street,burrowing owl st street sign,caution,coptic orthodox,i love wilson street,kamalani hawaiian hula dancer statue - kim taylor reece,kuulei hawaiian hula dancer statue - kim taylor reece,...,ziptile baton rouge,zitrades el wire neon lights kit,zitrades el wire portable,zitrades el wire purple neon glowing strobing electroluminescent el wire kit,zoroufy,zr2818 stainless steel,zstbt dm-300lw/3 300led 9.84ft9.84ft/3m3m window curtain lights,ztop portable fashion air conditioner bladeless fan,zuhne stainless steel,zvac work
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X_tr, X_te, y_tr, y_te = train_test_split(res, d.loc[:, 'price'], random_state = 0)
print(X_tr.shape)
print(X_te.shape)

(8296, 17959)
(2766, 17959)


### KMeans clustering

In [None]:
kmeans = KMeansConstrained(
    n_clusters=100,
    size_min=15,
    size_max=300,
    random_state=0,
    tol=1e-2,
    verbose=1,
)
kmeans.fit(X_tr)

Initialization complete
Iteration  0, inertia 37050.000
Iteration  1, inertia 17006.386
Iteration  2, inertia 16908.922
Iteration  3, inertia 16853.887
Iteration  4, inertia 16820.416
Iteration  5, inertia 16775.846
Iteration  6, inertia 16757.316
Iteration  7, inertia 16750.780
Iteration  8, inertia 16748.864
Iteration  9, inertia 16747.828
Iteration 10, inertia 16743.646
Iteration 11, inertia 16739.995
Iteration 12, inertia 16738.018
Converged at iteration 12: center shift 4.333342e-34 within tolerance 1.191202e-06
Initialization complete
Iteration  0, inertia 30319.000
Iteration  1, inertia 16869.827
Iteration  2, inertia 16814.439
Iteration  3, inertia 16768.243
Iteration  4, inertia 16718.858
Iteration  5, inertia 16693.869
Iteration  6, inertia 16682.750
Iteration  7, inertia 16677.305
Iteration  8, inertia 16674.496
Iteration  9, inertia 16672.997
Iteration 10, inertia 16672.580
Iteration 11, inertia 16671.864
Converged at iteration 11: center shift 2.311116e-33 within tolerance

KMeansConstrained(copy_x=True, init='k-means++', max_iter=300, n_clusters=100,
         n_init=10, n_jobs=1, random_state=0, size_max=300, size_min=15,
         tol=0.01, verbose=1)

In [None]:
y_tr = pd.DataFrame(y_tr)
y_tr.loc[:,'cluster'] = kmeans.predict(X_tr)
y_tr.loc[:, 'predicted_price'] = y_tr['price'].groupby(y_tr['cluster']).transform('mean')

print("* Training data evaluation:")
evaluation(y_tr['price'], y_tr['predicted_price'])

prev_result="""

Training

Metric results:
MSE: 7043.841605542812
MAE: 44.04952977160332
RMSE: 83.92759740122919
R2: 0.030761079592558804

Test

Metric results:
MSE: 6355.942786516434
MAE: 44.169873709469044
RMSE: 79.72416689132872
R2: 0.05584567627964909

"""

* Training data evaluation:


Metric results:
MSE: 7059.868262722809
MAE: 44.178464837176534
RMSE: 84.02302221845397
R2: 0.028555797195144117



In [None]:
y_te = pd.DataFrame(y_te)
y_te.loc[:,'cluster'] = kmeans.predict(X_te)
y_te.loc[:, 'predicted_price'] = y_te['price'].groupby(y_te['cluster']).transform('mean')

print("* Test data evaluation:")
evaluation(y_te['price'], y_te['predicted_price'])

* Test data evaluation:


Metric results:
MSE: 6368.5523780445155
MAE: 44.49446023601933
RMSE: 79.80321032417503
R2: 0.05397256307497378



In [None]:
d.loc[:,'cluster'] = kmeans.predict(res)
d[['asin', 'brand', 'price', 'N_images', 'HighResImg', 'N_description', 'cluster']].\
    to_csv('Home_Kitchen_title_price_including_cluster.csv', index=False)