In [2]:
import pandas as pd
import numpy as np
import spacy
import random

In [50]:
data = pd.read_csv('datasets/amazon/sampled.csv')

def shuffle_title(entry):
    title_list = entry.title.replace(entry.brand,'').split(' ')
    title_list.append(' ' + entry.brand)
    #print(title_list)
    random.shuffle(title_list)
    shuffled_title = ' '.join(title_list)
    entry.title = shuffled_title
    return entry
    

In [51]:
data_shuffled = data.apply(lambda x : shuffle_title(x),axis=1)

In [55]:
data_shuffled.to_csv('datasets/amazon/shuffled.csv',index=False)

# Loading Data:

In this part, we will load data from the Amazon Product Review dataset, available at this link : https://nijianmo.github.io/amazon/index.html

The dataset contains metadata and reviews for more than a million amazon product, we just need the metadata in our case, more precisely, the title and brand column, which was cleaned to keep only titles **containing a brand in them**. In our case we are using just office furnitures data but you can apply this project on any Amazon dataset. Or any dataset as long as it contains title and brand columns.

In [6]:
DATA_PATH = 'datasets/amazon/'
filenames = ['amazon_computers.csv','amazon_food.csv','amazon_industrial.csv','amazon_office.csv']
df_list = []
for fn in filenames:
    df_topic = pd.read_csv(DATA_PATH+fn)
    df_list.append(df_topic)
data = pd.concat(df_list)

In [9]:
data

Unnamed: 0,title,brand,asin
0,FLT&reg; Laptop AC Adapter/Power Supply/Charge...,FLT,6666666038
1,uxcell Flexible Neck Black Three Blade Compute...,uxcell,7884139057
2,"IKEA - UPPT&Auml;CKA Backpack, dark gray, yell...",IKEA,9178910897
3,"Princeton Eo2010 21"" Monitor (Pc/Mac)",Princeton,B00001MXZ7
4,"ViewSonic E790 19"" Monitor",ViewSonic,B00004TS2P
...,...,...,...
83047,Magicard Enduro3e ID Card Printer &amp; Suppli...,Magicard,B01HIXEFXY
83048,Generic Input ADF Paper Chute Tray for Fujitsu...,Generic,B01HJ9SBM8
83049,Bilipala 4Pcs Colored Plastic Magnetic Chalk H...,Bilipala,B01HJCZ0I8
83050,Nintendo Super Mario Bros. Lanyard with Mario ...,Nintendo,B01HJF4C66


In [10]:
# Let's see how many brands
data.brand.value_counts()

uxcell                1969
HP                    1357
Canon                 1265
Epson                 1073
Brother                956
                      ... 
Farina Di Castagne       1
Better Than Pasta        1
Miyasaka                 1
MAMBI                    1
CatchTheWave             1
Name: brand, Length: 36741, dtype: int64

In [21]:
data[data.brand == '(2 Pack) Journey to India, Vindaloo']

Unnamed: 0,title,brand,asin
111880,"(2 Pack) Journey to India, Vindaloo Meal Kit, ...","(2 Pack) Journey to India, Vindaloo",B0149D4QGM


In [179]:
# Let's clean the data, remove books and keep only titles with a brand in them

leave_categories = 'Books'

clean_data = full_data[full_data.main_cat != leave_categories]

# Removing titles with no brand in them :

clean_data = clean_data[clean_data.apply(lambda x: str(x['brand']) in str(x['title']), axis=1)]

In [180]:
clean_data.shape

(160321, 4)

In [181]:
# Some brands consist of numbers only and are read as float
clean_data['brand'] = clean_data.brand.apply(lambda x : str(x))

In [182]:
# Let's get rid of titles with brand having more than or 3 words
clean_data = clean_data[clean_data.apply(lambda x : len(x['brand'].split(' ')) < 3,axis=1)]

In [183]:
clean_data.shape

(150477, 4)

Let's check the distribution of brands across titles

In [49]:
data_shuffled

0                 Reader Smart  &hellip; (White)  +iD Card
1        Fall Small Europe  1-800-Flowers - of  for - F...
2        Only Bouquet Roses By... Valentine's Red Day O...
3         &amp; Large  1-800-Flowers - - Fruit Gourmet ...
4        Dish Classic - Large  Sympathy -  1-800-Flower...
                               ...                        
98823    Connector Spade ?150pcs Crimp Yellow Piggyback...
98824    Shopping Shopping Handcart Toy Mode Utility Su...
98825     yueton 1/4 Pin Inch  of 100pcs Tacks Red Push...
98826                           Cellular   Booster  zBoost
98827    Comb Handle Remover Practical Cleaner Embedded...
Length: 98828, dtype: object

In [53]:
# Return the percent of titles with brand in their start
def percent_of_brand_in_start(data):
    n = 0
    for i, item in data.iterrows():
        title = item['title']
        brand = item['brand']
        if title.find(brand) == 0:
            n += 1
    print("% of titles with brand in their start : ", n/data.shape[0] * 100, '%')

In [54]:
percent_of_brand_in_start(data_shuffled)

% of titles with brand in their start :  0.0010118589873315254 %


In [206]:
clean_data.brand.value_counts()

Avery              2306
3dRose             2196
HP                 2034
AT-A-GLANCE        1998
Canon              1886
                   ... 
Patch Magic           1
Neptune Shop          1
E-Ready Express       1
Rulercosplay          1
CatchTheWave          1
Name: brand, Length: 14116, dtype: int64

With more than 80% of titles having brands as the first word, our model can be very biased and could predict the first word in each title as as brand, let's try to improve this dataset :
- We will keep 50% of titles having brand as the first word
- For the rest :
    - Remove brand from title
    - Put it in a random place : ex : Middle, End, etc.

In [205]:
clean_data.to_csv('clean_data.csv',index=False)

# Improving the dataset

First, given limited computation abilities, we will only take 30k titles out of 116k, make sure to use random state for reproducability

**todo:** Find a way to sample from various brands as possible (no more than 2 titles per brand for example)

In [17]:
# Try to sample 1-5 example from each brand

def sample_from_brand(brand_df):
    if len(brand_df) > 10:
        sample_size = 10
    else: 
        sample_size = len(brand_df)
    return brand_df.sample(sample_size)

def sample_brand_equally(df,random_state=42):
    grp_brand = df.groupby('brand').apply(lambda b : sample_from_brand(b))
    return grp_brand
    """
    for brand_name,brand_count in grp_brand.iteritems() : 
        if brand_count > 10:
            sample_size = 10
        else:
            sample_size = brand_count
        
        brand_sample = df[df.brand == brand_name].sample(sample_size,random_state=42)
        samples.append(brand_sample)
    return pd.concat(samples)
    """

In [22]:
sampled = sample_brand_equally(data)

In [23]:
# Some further cleaning
sampled = sampled.drop_duplicates(subset='title',keep='first')
sampled = sampled[sampled.apply(lambda x: len(x['brand'].split(' ')) < 3,axis=1)]

In [16]:
print('Memory usage :',data.memory_usage(deep=True).sum()/(1e6),' mb')

Memory usage : 64.608907  mb


In [24]:
print(sampled.shape)
print(sampled.brand.value_counts())

(98828, 3)
Eat Local           10
Navitas Naturals    10
Domenico's Foods    10
Doll                10
Dolfin              10
                    ..
Jojoen               1
Joissu               1
Joint Juice          1
Joie                 1
zjskin               1
Name: brand, Length: 31438, dtype: int64


In [36]:
sampled.to_csv('datasets/amazon/sampled.csv',index=False)
sampled.to_excel('datasets/amazon/sampled.xlsx',index=False)

We sample up to 5 rows from each brand, our sample contains all brands, which is good, we have only 21480 titles instead of 30k but it's alright.

In [12]:
def get_brand_in_first(df,n=10000,random_state=42):
    
    is_brand_first = df.apply(lambda x : x['title'].find(x['brand']) == 0,axis=1)
    brand_first = df[is_brand_first].sample(n,random_state=random_state)   
    return brand_first



In [14]:
brand_first = get_brand_in_first(sample,6000,random_state=42)

In [15]:
brand_first

Unnamed: 0.1,Unnamed: 0,title,brand,present,root_cat
26319,26319,Biotene Oral Balance Saliva Replacement Gel Fo...,Biotene,True,26395
3260,3260,Zentosa Jewellery Silver Tone & Entwined Faux ...,Zentosa Jewellery,True,281
4781,4781,Italeri 2676 1/48 Scale MC.200 Saetta Plastic ...,Italeri,True,220
1482,1482,Adobe Creative Suite Photoshop Dreamweaver Fla...,Adobe,True,58058
10195,10195,"Logitech 2.1 Speaker System Z313 for PC, Xbox...",Logitech,True,58058
...,...,...,...,...,...
18629,18629,Hudson Ex-display Thursom3 Men's Suede Lace Up...,Hudson,True,11450
23015,23015,Eleganti Caresse slingback sheer heel 100% nyl...,Eleganti,True,11450
29617,29617,"Fujifilm Zoom Date F2.8, 35mm camera & Super E...",Fujifilm,True,625
640,640,LENOVO 60A3UAT2EU LT1423p Wired w. Screen Cove...,LENOVO,True,58058


We now have around 15k titles with brands as their first word, let's try to get around 5000 titles with brand as their final word and and 10k with brand in the middle

However since the dataset is quite biased, having more than 80% of titles with brand as first, we will use the fully cleaned dataset instead of the sample in order to be able to get 15k titles with no brand as their first word.

In [16]:
def number_brand_in_last(df):
    is_brand_last = df.apply(lambda x : x['title'].endswith(x['brand']),axis=1)
    brand_last = df[is_brand_last]
    print(brand_last.shape)



In [17]:
number_brand_in_last(clean_data)

(412, 5)


We only have 2830 titles with brand as their last word, we will take all these then.

In [18]:
def get_brand_in_last(df,random_state=42):
    is_brand_last = df.apply(lambda x : x['title'].endswith(x['brand']),axis=1)
    brand_last = df[is_brand_last]
    return brand_last

In [19]:
brand_last = get_brand_in_last(clean_data)

In [20]:
brand_last.shape

(412, 5)

In [217]:
# Compute number of titles having brand in the middle
# A brand is in the middle if it's not in first or last
# N middle = N sample - (N brand in first word + N brand in last word)

n_middle = sample.shape[0] - (brand_first.shape[0]+brand_last.shape[0])
print(n_middle)

15965


In [33]:
def get_brand_middle(df,n,random_state=42):
    is_brand_middle = df.apply(lambda x : x['title'].find(x['brand']) != 0 and x['title'].endswith(x['brand']) == False, axis=1)
    brand_middle = df[is_brand_middle].sample(n)
    return brand_middle
    
brand_middle = get_brand_middle(clean_data,5000)

In [34]:
brand_middle

Unnamed: 0.1,Unnamed: 0,title,brand,present,root_cat
1968,1968,Genuine CRUSADER Tumble Dryer Jockey Pulley Wh...,Crusader,True,11700
14559,14559,DELL HEATSINK FOR POWEREDGE R810 T913G,Dell,True,58058
16145,16145,WRANGLER TEXAS MENS JEANS - STONEWASH BLUE - 2...,Wrangler,True,11450
9481,9481,Cherry Red GOLD MEDAL FLOSSINE Candy Floss Col...,Gold Medal,True,11700
1598,1598,"With Love Large 14"" Ellie Teddy Bear - Soft To...",Suki,True,220
...,...,...,...,...,...
17862,17862,MENS FLY LONDON WIKI BLACK RUG LEATHER OIL SUE...,Fly London,True,11450
9284,9284,"SPRENGER BALKENHOL ULTRA-FIT SPURS, 35mm, PAST...",Sprenger,True,888
4973,4973,EDBIG5304 - Eduard Big Ed Sets 1:350 - Bismarc...,Eduard,True,220
12700,12700,NEW GENUINE JAGUAR XJ8 XK8 ALTERNATOR SURPRESS...,JAGUAR,True,131090


In [35]:
# That's it, time to concat all data : first_brand, last_brand, middle_brand

new_data = pd.concat([brand_first,brand_last,brand_middle])

In [36]:
new_data.shape

(11412, 5)

In [37]:
new_data.brand.value_counts()

Dell       88
Eduard     75
Bicycle    59
Bilibib    57
HP         56
           ..
BlueArc     1
i-Lit       1
Etiderm     1
KRUG        1
Wurth       1
Name: brand, Length: 3749, dtype: int64

In [38]:
new_data.to_csv('data_format/sampled_ebay.csv',index=False)

In [39]:
new_data.to_excel('data_format/sampled_ebay.xlsx',index=False)

we now have a little over 20k titles with brands in different places : first (50%), middle(40%), end(~10%)