## Import Packages

In [84]:
import json 
import os
import pandas as pd

import re
from collections import Counter

from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize

## Load Json Data

In [51]:
def json_opener(num_products):
    
    list_of_products = []
    
    for x in range(num_products):
        
        jsonfile = open('data/product{}.json'.format(x+1),'r')
        
        jsondata = jsonfile.read()
        
        obj = json.loads(jsondata)
        
        list_of_products.append(obj)
    
    return list_of_products

In [52]:
products = json_opener(4)

In [53]:
len(products)

4

## Pull Product Description Data

In [65]:
def product_description_pull(product_list):
    
    product_descriptions =[]

    for product in product_list:
        
        description = product['description']
        
        product_descriptions.append(description[0])
    
    return product_descriptions
        
        
        

In [66]:
prod_descriptions = product_description_pull(products)

In [77]:
prod_descriptions

['( =^ ^=) 1.It is made of high quality materials,durable enought for your daily wearing<br>(=^ ^=) 2.Stylish and fashion design make you more attractive<br>(=^ ^=) 3.Perfect Match with your favorite shorts, leggings, black slacks, denim jeans, etc<br>(=^ ^=) 4.Great for Daily,Casual,I am sure you will like it! <br><br>(=^ ^=) If you would like to know more products of our store, please pay close attention t o <b>Ninasill</b> <br><br>Loose Blouse V-Neck Blouse Solid Color Blouse Sling Blouse Fashion Blouse Cool Blouse Vintage Blouse Popular Blouse Fun Blouse Sexy Blouse Lace Vest Blouse Wild <br>Blouse Sleev eless Blouse Lace Blouse Lace Stitching Blouse Sweaters T-Shirts Men Blouse Women Blouse Girls Blouse Boys Blouse Kid Blouse Adult Blouse Children Blouse Printing Vest Rose Blouse Velvet Shirt Couple Blo use Elegant Blouse Long <br>Sleeve Blouse Lace Vest Printed Blouse High Low Hem Blouse Top Short Sleeve Blouses Hollow Clothing V-Collar Vest Casual Shirts Out Shoulder Blouse Thre

## Clean Descriptions/Remove Stop Words

In [90]:
def doc_cleaner(document_list):
    
    clean_docs = []
    
    # text cleaning function
    def text_cleaner(text):
        text = text.replace("<br>", "")
        text = text.lower()
        text = re.sub('\W', ' ', text)
        text = re.sub('\s+', ' ', text)
        return text


    # Clean text string
    for doc in document_list:
        clean_doc = text_cleaner(doc)
        clean_docs.append(clean_doc)
    
    return clean_docs

In [91]:
clean_prod_desc = doc_cleaner(prod_descriptions)

In [92]:
clean_prod_desc

[' 1 it is made of high quality materials durable enought for your daily wearing 2 stylish and fashion design make you more attractive 3 perfect match with your favorite shorts leggings black slacks denim jeans etc 4 great for daily casual i am sure you will like it if you would like to know more products of our store please pay close attention t o b ninasill b loose blouse v neck blouse solid color blouse sling blouse fashion blouse cool blouse vintage blouse popular blouse fun blouse sexy blouse lace vest blouse wild blouse sleev eless blouse lace blouse lace stitching blouse sweaters t shirts men blouse women blouse girls blouse boys blouse kid blouse adult blouse children blouse printing vest rose blouse velvet shirt couple blo use elegant blouse long sleeve blouse lace vest printed blouse high low hem blouse top short sleeve blouses hollow clothing v collar vest casual shirts out shoulder blouse three quarter sleeve shirts chiffon blouse floral printed button shirts',
 'melamine s

## Convert to List of Tokens 

In [93]:
def tokenizer(product_descriptions):
    
    prod_desc_toks = []
    
    for product in product_descriptions:
        
        text_tokens = word_tokenize(product)
        
        tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        
        prod_desc_toks.append(tokens_without_sw)
        
    return prod_desc_toks

In [94]:
prod_desc_tokens = tokenizer(clean_prod_desc)

In [95]:
prod_desc_tokens

[['1',
  'made',
  'high',
  'quality',
  'materials',
  'durable',
  'enought',
  'daily',
  'wearing',
  '2',
  'stylish',
  'fashion',
  'design',
  'make',
  'attractive',
  '3',
  'perfect',
  'match',
  'favorite',
  'shorts',
  'leggings',
  'black',
  'slacks',
  'denim',
  'jeans',
  'etc',
  '4',
  'great',
  'daily',
  'casual',
  'sure',
  'like',
  'would',
  'like',
  'know',
  'products',
  'store',
  'please',
  'pay',
  'close',
  'attention',
  'b',
  'ninasill',
  'b',
  'loose',
  'blouse',
  'neck',
  'blouse',
  'solid',
  'color',
  'blouse',
  'sling',
  'blouse',
  'fashion',
  'blouse',
  'cool',
  'blouse',
  'vintage',
  'blouse',
  'popular',
  'blouse',
  'fun',
  'blouse',
  'sexy',
  'blouse',
  'lace',
  'vest',
  'blouse',
  'wild',
  'blouse',
  'sleev',
  'eless',
  'blouse',
  'lace',
  'blouse',
  'lace',
  'stitching',
  'blouse',
  'sweaters',
  'shirts',
  'blouse',
  'women',
  'blouse',
  'girls',
  'blouse',
  'boys',
  'blouse',
  'kid',
  '

## Part 1 - "repeated words in a single product description will only count once"

In [118]:
def word_counter_no_rep(product_description_tokens_list):
    
    all_distict_words = []
    
    for product in product_description_tokens_list:
        
        distinct_words = set(product)
        
        distinct_words = list(distinct_words)
        
        all_distict_words = all_distict_words + distinct_words
    
    word_counts = Counter(all_distict_words).most_common()
    
    return word_counts

In [120]:
word_counts_nr = word_counter_no_rep(prod_desc_tokens)

word_counts_nr[:100]

[('4', 3),
 ('2', 2),
 ('long', 2),
 ('use', 2),
 ('design', 2),
 ('high', 1),
 ('etc', 1),
 ('black', 1),
 ('attention', 1),
 ('know', 1),
 ('loose', 1),
 ('couple', 1),
 ('3', 1),
 ('blouse', 1),
 ('sweaters', 1),
 ('quality', 1),
 ('perfect', 1),
 ('cool', 1),
 ('vest', 1),
 ('elegant', 1),
 ('fashion', 1),
 ('printing', 1),
 ('kid', 1),
 ('great', 1),
 ('stylish', 1),
 ('shorts', 1),
 ('products', 1),
 ('stitching', 1),
 ('solid', 1),
 ('quarter', 1),
 ('clothing', 1),
 ('would', 1),
 ('ninasill', 1),
 ('boys', 1),
 ('enought', 1),
 ('store', 1),
 ('1', 1),
 ('slacks', 1),
 ('hollow', 1),
 ('sling', 1),
 ('chiffon', 1),
 ('jeans', 1),
 ('popular', 1),
 ('durable', 1),
 ('leggings', 1),
 ('top', 1),
 ('blouses', 1),
 ('fun', 1),
 ('wearing', 1),
 ('sure', 1),
 ('close', 1),
 ('wild', 1),
 ('make', 1),
 ('match', 1),
 ('vintage', 1),
 ('printed', 1),
 ('collar', 1),
 ('blo', 1),
 ('button', 1),
 ('materials', 1),
 ('lace', 1),
 ('floral', 1),
 ('attractive', 1),
 ('favorite', 1),
 ('

## Part 2 - repeated words count every time

In [115]:
def word_counter_with_rep(product_description_tokens_list):
    
    all_words = []
    
    for product in product_description_tokens_list:
        
        all_words = all_words + product
        
    
    word_counts = Counter(all_words).most_common()
    
    return word_counts

In [116]:
word_counts_withrep = word_counter_with_rep(prod_desc_tokens)

In [121]:
word_counts_withrep[:100]

[('blouse', 29),
 ('lace', 4),
 ('vest', 4),
 ('shirts', 4),
 ('4', 3),
 ('sleeve', 3),
 ('power', 3),
 ('watt', 3),
 ('inverter', 3),
 ('high', 2),
 ('daily', 2),
 ('2', 2),
 ('fashion', 2),
 ('design', 2),
 ('casual', 2),
 ('like', 2),
 ('b', 2),
 ('use', 2),
 ('long', 2),
 ('printed', 2),
 ('suede', 2),
 ('cushioning', 2),
 ('2000', 2),
 ('12', 2),
 ('dc', 2),
 ('run', 2),
 ('load', 2),
 ('gp', 2),
 ('1', 1),
 ('made', 1),
 ('quality', 1),
 ('materials', 1),
 ('durable', 1),
 ('enought', 1),
 ('wearing', 1),
 ('stylish', 1),
 ('make', 1),
 ('attractive', 1),
 ('3', 1),
 ('perfect', 1),
 ('match', 1),
 ('favorite', 1),
 ('shorts', 1),
 ('leggings', 1),
 ('black', 1),
 ('slacks', 1),
 ('denim', 1),
 ('jeans', 1),
 ('etc', 1),
 ('great', 1),
 ('sure', 1),
 ('would', 1),
 ('know', 1),
 ('products', 1),
 ('store', 1),
 ('please', 1),
 ('pay', 1),
 ('close', 1),
 ('attention', 1),
 ('ninasill', 1),
 ('loose', 1),
 ('neck', 1),
 ('solid', 1),
 ('color', 1),
 ('sling', 1),
 ('cool', 1),
 ('