# E-Commerce Data

## Data exploration

Data is E-commerce sales data from Kaggle: <https://www.kaggle.com/datasets/carrie1/ecommerce-data?resource=download>

In [57]:
import pandas as pd

In [58]:
data = pd.read_csv('e-commerce-data.csv', encoding = "ISO-8859-1")

In [59]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [62]:
# NOTE(review): drops, in place, every row that has a missing value in ANY column.
# The data.info() output recorded further down (In [4]) still reports 541909 rows with
# nulls in Description and CustomerID, so the saved outputs were not produced by a single
# top-to-bottom run that included this cell — confirm the intended execution order.
data.dropna(inplace=True)

In [63]:
len(data.Description.unique().tolist())

3896

In [64]:
# Export the distinct product descriptions, one per line, for manual labelling.
# unique() already de-duplicates and keeps first-appearance order, so the former
# list(set(...)) round-trip was redundant and made the line order depend on set
# iteration order. dropna() guards against NaN descriptions, which str.join rejects.
product_list = data.Description.dropna().unique().tolist()
print(len(product_list))

# The context manager closes the file even if the write fails; the explicit encoding
# keeps the output independent of the platform default.
with open("english_labelling.txt", "w", encoding="utf-8") as text_file:
    text_file.write('\n'.join(product_list))

3896


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# keep only the rows that carry a product description
has_description = data.Description.notna()
data = data[has_description]

In [6]:
data.shape

(540455, 8)

In [7]:
data.StockCode.nunique(), data.Description.nunique()

(3958, 4223)

In [8]:
# Normalise all StockCodes to uppercase so that codes differing only in letter case
# are counted as the same code. The vectorised .str accessor replaces the per-row lambda.
data['StockCode'] = data['StockCode'].str.upper()

In [9]:
# find the descriptions that are attached to more than one distinct stock code
stock_codes_per_desc = data.groupby('Description').StockCode.nunique()
non_unique_desc = stock_codes_per_desc[stock_codes_per_desc > 1].index.tolist()

In [10]:
len(non_unique_desc)

60

In [11]:
# number of distinct stock codes for each description shared by several stock codes
shared_rows = data[data.Description.isin(non_unique_desc)]
interesting_desc = (
    shared_rows
    .groupby('Description')
    .StockCode
    .nunique()
    .reset_index()
)
interesting_desc

Unnamed: 0,Description,StockCode
0,?,47
1,??,7
2,???missing,2
3,?missing,2
4,AMAZON,3
5,Adjustment,2
6,Amazon,7
7,CHECK,3
8,COLOURING PENCILS BROWN TUBE,2
9,COLUMBIAN CANDLE RECTANGLE,2


In [12]:
# Problematic descriptions are those that contain lowercase characters
def check_lower(text: str) -> int:
    """Flag whether ``text`` contains at least one lowercase character.

    Args:
        text: The string to inspect.

    Returns:
        1 if any character of ``text`` is lowercase, otherwise 0
        (including for the empty string).
    """
    # any() stops at the first lowercase character, like the early return it replaces;
    # int() keeps the 1/0 result that is compared against 1 later on.
    return int(any(char.islower() for char in text))
    
interesting_desc['contains_lowercase'] = interesting_desc['Description'].apply(check_lower)

In [13]:
# descriptions flagged as containing lowercase, plus three placeholders added by hand
lowercase_mask = interesting_desc.contains_lowercase == 1
to_remove_desc = interesting_desc.loc[lowercase_mask, 'Description'].tolist()
to_remove_desc.extend(['CHECK', '?', '??'])
to_remove_desc

['???missing',
 '?missing',
 'Adjustment',
 'Amazon',
 'Damaged',
 'Dotcom sales',
 'Found',
 'Unsaleable, destroyed.',
 'adjustment',
 'amazon',
 'check',
 'counted',
 'crushed',
 'damaged',
 'damages',
 'damages wax',
 'damages?',
 'dotcom',
 'ebay',
 'found',
 'had been put aside',
 'incorrect stock entry.',
 'mailout',
 'missing',
 'mixed up',
 'returned',
 'reverse 21/5/10 adjustment',
 'rusty throw away',
 'smashed',
 'sold as 1',
 'sold as set on dotcom',
 'stock check',
 'test',
 'thrown away',
 'wet damaged',
 'wet pallet',
 'wet/rusty',
 'CHECK',
 '?',
 '??']

In [14]:
# Keep only rows whose description is not in the removal list. drop=True discards the
# old row labels instead of inserting them as a spurious 'index' column.
ok_data = data[~data.Description.isin(to_remove_desc)].reset_index(drop=True)

In [15]:
ok_data = ok_data[['StockCode', 'Description']].drop_duplicates()

In [16]:
ok_data.shape

(4205, 2)

## Convert to embedding

### TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
ok_data.sort_values(by='StockCode').StockCode.unique().tolist()[10:30]

['15030',
 '15034',
 '15036',
 '15039',
 '15044A',
 '15044B',
 '15044C',
 '15044D',
 '15056BL',
 '15056N',
 '15056P',
 '15058A',
 '15058B',
 '15058C',
 '15060B',
 '16008',
 '16010',
 '16011',
 '16012',
 '16014']

In [19]:
group1 = ['15056BL', '15056N', '15056P', '16010', '16011', '16012']
group1_data = ok_data[ok_data.StockCode.isin(group1)]
# group1_desc = ['SEAGULL NAPTH 25g WRNA / PCS', 'SEA GULL WARNA RENTENG', 'MANGKOK SAMBAL ALL VAR', 'SEAQULL NAPT WARNA 25GR', 'SeA GULL NAPHT 25GR SG-519W 1PCSX 1.500,00']

In [20]:
# Character-level TF-IDF over the descriptions of the selected stock codes.
vectorizer = TfidfVectorizer(analyzer='char')
vectors = vectorizer.fit_transform(group1_data.Description.tolist())
# vectors = vectorizer.fit_transform(group1_desc)
feature_names = vectorizer.get_feature_names_out()
# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix type),
# and the DataFrame can be built from it directly without a list round-trip.
df = pd.DataFrame(vectors.toarray(), columns=feature_names)

In [21]:
df['Description'] = group1_data.Description.tolist()
# df['Description'] = group1_desc
df

Unnamed: 0,Unnamed: 1,&,/,a,b,c,d,e,f,g,...,m,n,o,p,r,s,t,u,w,Description
0,0.240888,0.0,0.0,0.695051,0.271331,0.16097,0.278021,0.13901,0.0,0.0,...,0.0,0.120444,0.13901,0.13901,0.240888,0.120444,0.0,0.0,0.16097,EDWARDIAN PARASOL BLACK
1,0.209772,0.0,0.0,0.726324,0.0,0.0,0.242108,0.121054,0.0,0.0,...,0.0,0.209772,0.121054,0.121054,0.314658,0.104886,0.163582,0.236283,0.140177,EDWARDIAN PARASOL NATURAL
2,0.267061,0.0,0.0,0.616458,0.0,0.0,0.308229,0.154114,0.0,0.0,...,0.0,0.267061,0.154114,0.308229,0.267061,0.133531,0.0,0.0,0.17846,EDWARDIAN PARASOL PINK
3,0.241189,0.0,0.222774,0.0,0.0,0.161171,0.278368,0.278368,0.222774,0.222774,...,0.0,0.241189,0.417553,0.139184,0.241189,0.361783,0.188081,0.0,0.0,FOOD/DRINK SPONGE STICKERS
4,0.189133,0.0,0.0,0.436575,0.0,0.25277,0.0,0.218287,0.0,0.0,...,0.349384,0.189133,0.0,0.0,0.189133,0.378265,0.294974,0.0,0.0,ANIMAL STICKERS
5,0.44347,0.199806,0.163844,0.102366,0.0,0.237074,0.102366,0.0,0.327688,0.327688,...,0.163844,0.266082,0.204732,0.102366,0.088694,0.354776,0.0,0.0,0.118537,FOLDING CAMPING SCISSOR W/KNIF & S


In [22]:
df = pd.merge(df, ok_data[['Description', 'StockCode']], on='Description', how='left')

### Cosine Similarity

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
import scipy

In [24]:
sparse_test = scipy.sparse.csr_matrix(df[feature_names.tolist()].values)

In [25]:
# pairwise cosine similarity, labelled on both axes with the product descriptions
descriptions = df.Description.tolist()
cosine_sim = pd.DataFrame(
    cosine_similarity(sparse_test),
    index=descriptions,
    columns=descriptions,
)

In [26]:
cosine_sim

Unnamed: 0,EDWARDIAN PARASOL BLACK,EDWARDIAN PARASOL NATURAL,EDWARDIAN PARASOL PINK,FOOD/DRINK SPONGE STICKERS,ANIMAL STICKERS,FOLDING CAMPING SCISSOR W/KNIF & S
EDWARDIAN PARASOL BLACK,1.0,0.889362,0.901933,0.475993,0.670526,0.487933
EDWARDIAN PARASOL NATURAL,0.889362,1.0,0.897419,0.439579,0.662832,0.428884
EDWARDIAN PARASOL PINK,0.901933,0.897419,1.0,0.584812,0.673113,0.565768
FOOD/DRINK SPONGE STICKERS,0.475993,0.439579,0.584812,1.0,0.58268,0.783884
ANIMAL STICKERS,0.670526,0.662832,0.673113,0.58268,1.0,0.625924
FOLDING CAMPING SCISSOR W/KNIF & S,0.487933,0.428884,0.565768,0.783884,0.625924,1.0


We can see that this baseline method only works for very clean product names: it can pick out the odd product name in a small group of items. However, for messier data it will do badly.

It performs badly when the product names contain:
* spelling errors
* random spaces
* quantities
* other noise

## Spacy NER

In [None]:
import spacy
from spacy import displacy

NER = spacy.load("en_core_web_lg")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
example = ok_data.iloc[2]['Description'].lower()
example

'cream cupid hearts coat hanger'

In [None]:
# NOTE(review): raw_text is defined and lower-cased here but is not passed to NER below;
# the model is run on `example` (the lower-cased product description) instead.
raw_text="The Indian Space Research Organisation or is the national space agency of India, headquartered in Bengaluru. It operates under Department of Space which is directly overseen by the Prime Minister of India while Chairman of ISRO acts as executive of DOS as well."
raw_text = raw_text.lower()
text1 = NER(example)

In [None]:
for word in text1.ents:
    print(word.text,word.label_)

In [None]:
text1.ents

()

## NLTK NER

In [51]:
example = ok_data.iloc[2]['Description'].lower()
example

'cream cupid hearts coat hanger'

In [54]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ------ --------------------------------- 0.2/1.5 MB 7.4 MB/s eta 0:00:01
     --------------- ------------------------ 0.6/1.5 MB 7.1 MB/s eta 0:00:01
     ----------------------- ---------------- 0.9/1.5 MB 6.9 MB/s eta 0:00:01
     ----------------------------- ---------- 1.1/1.5 MB 6.5 MB/s eta 0:00:01
     -------------------------------------- - 1.4/1.5 MB 6.5 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [55]:
import nltk

# resources needed for tokenising, POS tagging and named-entity chunking
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

# tokenise -> POS-tag -> NE-chunk, then keep only the labelled (entity) subtrees
tokens = nltk.word_tokenize(example)
tagged_tokens = nltk.pos_tag(tokens)
chunk_tree = nltk.ne_chunk(tagged_tokens)
{
    (' '.join(leaf[0] for leaf in chunk), chunk.label())
    for chunk in chunk_tree
    if hasattr(chunk, 'label')
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ansel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ansel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\ansel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ansel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


set()

## StanfordNLP NER

In [56]:
!pip3 install nltk==3.2.4
!wget http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
!unzip stanford-ner-2015-04-20.zip
from nltk.tag.stanford import StanfordNERTagger
jar = "stanford-ner-2015-04-20/stanford-ner-3.5.2.jar"
model = "stanford-ner-2015-04-20/classifiers/" 
st_3class = StanfordNERTagger(model + "english.all.3class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_4class = StanfordNERTagger(model + "english.conll.4class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_7class = StanfordNERTagger(model + "english.muc.7class.distsim.crf.ser.gz", jar, encoding='utf8')
st_3class.tag(example.split())
st_4class.tag(example.split())
st_7class.tag(example.split())

Collecting nltk==3.2.4
  Downloading nltk-3.2.4.tar.gz (1.2 MB)
     ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
     ----------- ---------------------------- 0.3/1.2 MB 10.2 MB/s eta 0:00:01
     ------------------------------------ --- 1.1/1.2 MB 13.4 MB/s eta 0:00:01
     ---------------------------------------- 1.2/1.2 MB 10.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py): started
  Building wheel for nltk (setup.py): finished with status 'done'
  Created wheel for nltk: filename=nltk-3.2.4-py3-none-any.whl size=1367721 sha256=8c9972e3b4ba7426417662298eaba1f642cb7ebaeb3fb489161e7007abcfff54
  Stored in directory: c:\users\ansel\appdata\local\pip\cache\wheels\0e\8c\42\bcd0934b61ecf4cef964ccc9881888cca0841ec72266e99de1
Successfully built nltk
Installing collected packages: nltk
  Attempting uninstall: nltk
   

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


LookupError: Could not find stanford-ner.jar jar file at stanford-ner-2015-04-20/stanford-ner-3.5.2.jar