In [1]:
from gensim.models import KeyedVectors
import numpy as np
from gensim.models import Word2Vec
import pandas as pd
import warnings
import os
from gensim.parsing.preprocessing import preprocess_string
warnings.filterwarnings('ignore')

class Embedding:
    
    def __init__(self,filename):
        self.word2v_file = 'w2vec_10d.txt'
        self.filename = filename
    
    def preprocessing(self):
        
        df = pd.read_csv(self.filename,sep='\t')
        t = df[df['description'].isna() == False]
        t = t[t['kw_name']!='ToBeRemoved']
        data = t.copy()
        data.reset_index(drop=True,inplace=True)
        data['product_name'] = data['product_name'].apply(lambda x:x.strip())
        data['desc_and_productname'] = data['product_name'] + ' ' + data['description']
         
        all_description = []
        for description in data['desc_and_productname'].values:
            all_description.append(preprocess_string(description))
            
        return all_description
            
    def word2vec(self,list_of_string):
            
        
        if not os.path.exists(self.word2v_file):
            model = Word2Vec(list_of_string, 
                     min_count=1,   # word frequency
                     size=10,      # dimention of word embeddings
                     workers=8,     # Number of processors
                     sg=0, # sg=0 for cbow, sg=1 for skip gram
                     window=1,      # Context window
                    ) 
        
            # save
            model.wv.save_word2vec_format(self.word2v_file)
        w2v = KeyedVectors.load_word2vec_format(self.word2v_file)
        
        return w2v

In [2]:
embedding = Embedding('autotagger.tsv')

In [3]:
all_text = embedding.preprocessing()

In [4]:
all_text

[['addi',
  'flip',
  'black',
  'bin',
  'dispos',
  'unwant',
  'rubbish',
  'have',
  'touch',
  'bin',
  'qualiti',
  'pedal',
  'bin',
  'simpli',
  'open',
  'foot',
  'drop',
  'garbag',
  'trash',
  'bag'],
 ['brm',
  'peri',
  'peri',
  'spatchcock',
  'chicken',
  'fulli',
  'cook',
  'nreadi',
  'minut',
  'ncook',
  'tender',
  'marin',
  'famou',
  'peri',
  'peri',
  'sauc',
  'nkeep',
  'refriger',
  'nbrm',
  'spatchcock',
  'chicken',
  'fulli',
  'cook',
  'peri',
  'peri',
  'sauc'],
 ['crush', 'ginger', 'perfect', 'stew', 'curri'],
 ['uss', 'vsop', 'cognac', 'uss', 'vsop', 'cognac'],
 ['dettol',
  'liquid',
  'bathroom',
  'cleaner',
  'ocean',
  'fresh',
  'ndettol',
  'liquid',
  'bathroom',
  'cleaner',
  'ocean',
  'fresh'],
 ['frozen', 'mix', 'berri', 'special', 'selecet'],
 ['glad',
  'small',
  'zipper',
  'freezer',
  'bag',
  'glad',
  'freezer',
  'bag',
  'strong',
  'suitabl',
  'freez',
  'type',
  'food',
  'includ',
  'bag',
  'tie',
  'seal',
  'fres

In [5]:
w2v = embedding.word2vec(all_text)

In [6]:
w2v['skin']

array([ 0.8120286 , -1.075846  ,  0.3069056 , -0.11384587, -1.4837037 ,
        0.23139429, -0.81555927,  1.9369663 , -1.1026706 , -1.011192  ],
      dtype=float32)

In [7]:
w2v.vocab

{'flavour': <gensim.models.keyedvectors.Vocab at 0x20b762fa460>,
 'fresh': <gensim.models.keyedvectors.Vocab at 0x20b762faf10>,
 'fruit': <gensim.models.keyedvectors.Vocab at 0x20b7633b130>,
 'skin': <gensim.models.keyedvectors.Vocab at 0x20b7633b190>,
 'free': <gensim.models.keyedvectors.Vocab at 0x20b7633b1f0>,
 'tast': <gensim.models.keyedvectors.Vocab at 0x20b7633b250>,
 'soft': <gensim.models.keyedvectors.Vocab at 0x20b7633b2b0>,
 'babi': <gensim.models.keyedvectors.Vocab at 0x20b7633b310>,
 'protect': <gensim.models.keyedvectors.Vocab at 0x20b7633b370>,
 'chicken': <gensim.models.keyedvectors.Vocab at 0x20b7633b3d0>,
 'rang': <gensim.models.keyedvectors.Vocab at 0x20b7633b430>,
 'water': <gensim.models.keyedvectors.Vocab at 0x20b7633b490>,
 'white': <gensim.models.keyedvectors.Vocab at 0x20b7633b4f0>,
 'beef': <gensim.models.keyedvectors.Vocab at 0x20b7633b550>,
 'sweet': <gensim.models.keyedvectors.Vocab at 0x20b7633b5b0>,
 'cream': <gensim.models.keyedvectors.Vocab at 0x20b7633

In [1]:
import sc

In [4]:
embedding = sc.Embedding('autotagger.tsv')

In [5]:
all_text = embedding.preprocessing()

In [6]:
all_text

[['addi',
  'flip',
  'black',
  'bin',
  'dispos',
  'unwant',
  'rubbish',
  'have',
  'touch',
  'bin',
  'qualiti',
  'pedal',
  'bin',
  'simpli',
  'open',
  'foot',
  'drop',
  'garbag',
  'trash',
  'bag'],
 ['brm',
  'peri',
  'peri',
  'spatchcock',
  'chicken',
  'fulli',
  'cook',
  'nreadi',
  'minut',
  'ncook',
  'tender',
  'marin',
  'famou',
  'peri',
  'peri',
  'sauc',
  'nkeep',
  'refriger',
  'nbrm',
  'spatchcock',
  'chicken',
  'fulli',
  'cook',
  'peri',
  'peri',
  'sauc'],
 ['crush', 'ginger', 'perfect', 'stew', 'curri'],
 ['uss', 'vsop', 'cognac', 'uss', 'vsop', 'cognac'],
 ['dettol',
  'liquid',
  'bathroom',
  'cleaner',
  'ocean',
  'fresh',
  'ndettol',
  'liquid',
  'bathroom',
  'cleaner',
  'ocean',
  'fresh'],
 ['frozen', 'mix', 'berri', 'special', 'selecet'],
 ['glad',
  'small',
  'zipper',
  'freezer',
  'bag',
  'glad',
  'freezer',
  'bag',
  'strong',
  'suitabl',
  'freez',
  'type',
  'food',
  'includ',
  'bag',
  'tie',
  'seal',
  'fres

In [7]:
w2v = embedding.word2vec(all_text)

In [8]:
w2v['skin']

array([ 0.8120286 , -1.075846  ,  0.3069056 , -0.11384587, -1.4837037 ,
        0.23139429, -0.81555927,  1.9369663 , -1.1026706 , -1.011192  ],
      dtype=float32)