# Capstone Project - Topic modelling

In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
pd.set_option('display.max_columns',50)

from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer

import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from gensim import similarities

from IPython.core.display import clear_output

### Reading in scraped shoe data

In [2]:
df = pickle.load(open('shoe_data.pkl', 'rb'))

In [3]:
print(f'No. of shoes: {df.shape[0]}')
print(f'No. of features: {df.shape[1]}')

No. of shoes: 848
No. of features: 49


In [4]:
df

Unnamed: 0,name,brand,core_score,user_review_count,expert_review_count,user_rating,expert_rating,user_5star,user_4star,user_3star,user_2star,user_1star,discontinued,terrain,arch_support,use,waterproof,water_repellent,maximalist,minimalist,triathlon,price,weight,toe_drop,heel_height,forefoot_height,width,good_summary,bad_summary,summary,info,size_fit,outsole,midsole,upper,weight_m,weight_w,toe_drop_m,toe_drop_w,heel_height_m,heel_height_w,forefoot_height_m,forefoot_height_w,width_m,width_w,zero_drop_m,zero_drop_w,low_drop_m,low_drop_w
0,Nike Air Zoom Pegasus 35,Nike,94,2249,27,4.4,94.0,71,14,6,4,5,1,1,Neutral,1,0,0,0,0,0,230,Men: 286g;Women: 255g;,Men: 10mm;Women: 10mm;,Men: 29mm;Women: 29mm;,Men: 19mm;Women: 19mm;,"Men: Narrow, Standard, Wide, Extra wide;Women:...",Many people liked the aesthetics of the Nike A...,A tester complained that the upper unit of the...,Many people liked the Nike Air Zoom Pegasus 35...,The Nike Air Zoom Pegasus 35 is a running shoe...,The Nike Air Zoom Pegasus 35 has a standard ru...,The outsole unit of the Nike Air Zoom Pegasus ...,Zoom Air is a cassette that contains compresse...,The upper unit of the Nike Air Zoom Pegasus 35...,286,255,10,10,29,29,19,19,"Narrow, Standard, Wide, Extra wide","Standard, Wide",0,0,0,0
1,Brooks Ghost 11,Brooks,94,4358,17,4.5,91.0,78,9,5,3,5,1,1,Neutral,1,0,0,0,0,0,230,Men: 309g;Women: 269g;,Men: 12mm;Women: 12mm;,Men: 29mm;Women: 29mm;,Men: 17mm;Women: 17mm;,"Men: Narrow, Standard, Wide, Extra wide;Women:...",The Brooks Ghost 11 had a nice cushion that en...,One reviewer attributed the shoe’s bulky and u...,Many users have found themselves considering b...,The latest iteration of the Brooks Ghost is a ...,The Brooks Ghost 11 guarantees a fit that is t...,Two types of rubber make up the outsole of the...,The BioMoGo DNA from Brooks is a tried-and-tes...,The engineered mesh upper of the Brooks Ghost ...,309,269,12,12,29,29,17,17,"Narrow, Standard, Wide, Extra wide","Narrow, Standard, Wide",0,0,0,0
2,Asics Gel Kayano 25,Asics,95,4091,8,4.6,92.0,79,11,3,2,5,1,1,Stability,1,0,0,0,0,0,310,Men: 336g;Women: 278g;,Men: 10mm;Women: 13mm;,Men: 22mm;Women: 24mm;,Men: 12mm;Women: 11mm;,"Men: Narrow, Standard, Wide, Extra wide;Women:...",The Kayano 25 is very comfortable to wear even...,The shoe is slightly expensive.;A couple of we...,The overwhelming number of positive feedback h...,The Gel Kayano series from Asics brings out th...,Runners will find the Asics Gel Kayano 25 as t...,Compared to the regular Asics High Abrasion Ru...,The forefoot and rearfoot areas of the midsole...,The Gel Kayano 25 welcomes the use of Jacquard...,336,278,10,13,22,24,12,11,"Narrow, Standard, Wide, Extra wide","Narrow, Standard, Wide",0,0,0,0
3,Asics Gel Venture 6,Asics,94,4158,1,4.5,90.0,74,14,5,3,4,1,0,Neutral,1,0,0,0,0,0,140,Men: 318g;Women: 255g;,Men: 10mm;Women: 10mm;,Men: 20mm;Women: 20mm;,Men: 10mm;Women: 10mm;,"Men: Standard, Wide, Extra wide;Women: Narrow,...",Most customers appreciated the strong build of...,A couple of wearers noticed that the gel cushi...,The Asics Gel Venture 6 is a versatile running...,"The Asics Gel Venture 6 has been redesigned, f...",The Asics Gel Venture 6 fits true to size. It ...,The shoe houses technologies specific to trail...,The midsole of this Asics running shoes has th...,The synthetic mesh material covers the upper s...,318,255,10,10,20,20,10,10,"Standard, Wide, Extra wide","Narrow, Standard, Wide, Extra wide",0,0,0,0
4,Hoka One One Bondi 6,Hoka One One,91,794,7,4.2,92.0,64,14,9,7,6,0,1,Neutral,1,0,0,1,0,0,290,Men: 309g;Women: 244g;,Men: 4mm;Women: 4mm;,Men: 36mm;Women: 36mm;,Men: 32mm;Women: 32mm;,"Men: Narrow, Standard, Wide;Women: Narrow, Sta...",People mostly liked the cushioning capacity of...,Several consumers felt that the Hoka One One B...,People were mostly happy with the Hoka One One...,Hoka One One is a brand that prides itself in ...,The Hoka One One Bondi 6 was created to be tru...,The rubber compound that’s used for the outsol...,A Soft EVA Midsole serves as the cushioning un...,The upper unit of the Hoka One One Bondi 6 mak...,309,244,4,4,36,36,32,32,"Narrow, Standard, Wide","Narrow, Standard, Wide",0,0,1,1
5,Merrell Trail Glove 4,Merrell,93,1103,4,4.4,92.0,68,14,10,5,3,1,0,Neutral,0,0,0,0,0,0,190,Men: 230g;Women: 170g;,Men: 0mm;Women: 0mm;,Men: 11mm;Women: 11mm;,Men: 11mm;Women: 11mm;,Men: Standard;Women: Standard;,"Based on some reviews, the Merrell Trail Glove...",Some users felt some pain when they initially ...,Many claimed that the Merrell Trail Glove 4 is...,The Merrell Trail Glove 4 is a trail running s...,The Merrell Trail Glove 4 has a standard runni...,The aggressive Vibram TC5+ outsole of the Merr...,The midsole of the Merrell Trail Glove 4 is ma...,The synthetic mesh upper of the Merrell Trail ...,230,170,0,0,11,11,11,11,Standard,Standard,1,1,0,0
6,Nike Air Zoom Pegasus 36,Nike,91,16,13,4.8,90.0,81,19,0,0,0,0,1,Neutral,1,0,0,0,0,0,230,Men: 283g;Women: 227g;,Men: 10mm;Women: 10mm;,Men: 28mm;Women: 29mm;,Men: 18mm;Women: 19mm;,"Men: Standard, Wide, Extra wide;Women: Standar...",ome people claimed that the underfoot cushioni...,A couple of purchasers claimed that the width ...,The Air Zoom Pegasus 36 was generally able to ...,The Nike Air Zoom Pegasus 36 is an update to a...,The Nike Air Zoom Pegasus 36 was made using th...,The forefoot section of the Nike Air Zoom Pega...,Cushlon is the primary cushioning unit of the ...,The upper unit of the Nike Air Zoom Pegasus 36...,283,227,10,10,28,29,18,19,"Standard, Wide, Extra wide","Standard, Wide",0,0,0,0
7,Brooks Adrenaline GTS 18,Brooks,94,7191,13,4.5,91.0,77,9,6,3,5,1,1,Motion control,1,0,0,0,0,0,230,Men: 284g;Women: 244g;,Men: 12mm;Women: 12mm;,Men: 29mm;Women: 29mm;,Men: 17mm;Women: 17mm;,"Men: Narrow, Standard, Wide, Extra wide;Women:...",A lot of runners thought that the substantial ...,A hole formed in the toe box after only a week...,The Brooks Adrenaline GTS 18 has kept the esta...,While it has become softer and lighter than it...,The Adrenaline GTS 18 has a standard running s...,The design of the outsole is now more refined ...,The enhanced BioMoGo DNA midsole foam guarante...,The shoes upper material is also the same mate...,284,244,12,12,29,29,17,17,"Narrow, Standard, Wide, Extra wide","Narrow, Standard, Wide, Extra wide",0,0,0,0
8,Nike Zoom Pegasus Turbo,Nike,92,358,13,4.4,91.0,67,16,8,4,5,1,1,Neutral,1,0,0,0,0,0,340,Men: 283g;Women: 196g;,Men: 10mm;Women: 10mm;,Men: 28mm;Women: 28mm;,Men: 18mm;Women: 18mm;,Men: Standard;Women: Standard;,The Nike Zoom Pegasus Turbo had the “comfiest ...,Some users thought the Zoom Pegasus Turbo was ...,Wearers who were after a shoe that looked and ...,The Nike Air Zoom Pegasus is known for being o...,Runners are recommended to purchase the Nike Z...,The outsole of the Nike Zoom Pegasus Turbo fea...,"Featuring the ZoomX foam in the midsole, the Z...",The translucent mesh upper of the Nike Zoom Pe...,283,196,10,10,28,28,18,18,Standard,Standard,0,0,0,0
9,Asics Gel Nimbus 21,Asics,91,1150,8,4.4,88.0,71,13,4,6,6,0,1,Neutral,1,0,0,0,0,0,290,Men: 310g;Women: 255g;,Men: 10mm;Women: 13mm;,Men: 23mm;Women: 25mm;,Men: 13mm;Women: 12mm;,"Men: Narrow, Standard, Wide, Extra wide;Women:...",Most users admired the overall structure of th...,It is very expensive compared to other road ru...,Majority of the users were welcoming and happy...,Designed for neutral runners who are looking f...,The Asics Gel Nimbus 21 runs true to size. It ...,In the outsole of the Gel Nimbus 21 is the ASI...,"For a more energetic and lighter run, Asics ma...",The upper of the Asics Gel Nimbus 21 makes use...,310,255,10,13,23,25,13,12,"Narrow, Standard, Wide, Extra wide","Narrow, Standard, Wide",0,0,0,0


### Tokenization

In [5]:
# Remove punctuations.
tokenizer = RegexpTokenizer(r'\w+')

tokenized_good_summary = [tokenizer.tokenize(good_summary) for good_summary in df["good_summary"]]
tokenized_bad_summary = [tokenizer.tokenize(bad_summary) for bad_summary in df["bad_summary"]]
tokenized_summary = [tokenizer.tokenize(summary) for summary in df["summary"]]
tokenized_info = [tokenizer.tokenize(info) for info in df["info"]]
tokenized_size_fit = [tokenizer.tokenize(size_fit) for size_fit in df["size_fit"]]
tokenized_outsole = [tokenizer.tokenize(outsole) for outsole in df["outsole"]]
tokenized_midsole = [tokenizer.tokenize(midsole) for midsole in df["midsole"]]
tokenized_upper = [tokenizer.tokenize(upper) for upper in df["upper"]]

In [6]:
# Check ...
tokenized_good_summary 

[['Many',
  'people',
  'liked',
  'the',
  'aesthetics',
  'of',
  'the',
  'Nike',
  'Air',
  'Zoom',
  'Pegasus',
  '35',
  'emphasizing',
  'the',
  'freshness',
  'of',
  'the',
  'color',
  'schemes',
  'The',
  'zigzag',
  'stitching',
  'of',
  'the',
  'engineered',
  'mesh',
  'was',
  'appreciated',
  'because',
  'it',
  'kept',
  'the',
  'façade',
  'durable',
  'A',
  'lot',
  'of',
  'testers',
  'welcomed',
  'the',
  'full',
  'length',
  'Zoom',
  'Air',
  'unit',
  'which',
  'responsibly',
  'cushioned',
  'the',
  'foot',
  'and',
  'attenuated',
  'impact',
  'shock',
  'The',
  'grip',
  'capability',
  'of',
  'the',
  'outsole',
  'unit',
  'received',
  'praise',
  'from',
  'a',
  'lot',
  'of',
  'consumers',
  'Based',
  'on',
  'several',
  'runners',
  'responses',
  'the',
  'Pegasus',
  '35',
  'functioned',
  'well',
  'across',
  'a',
  'variety',
  'of',
  'activities',
  'like',
  'speed',
  'training',
  'even',
  'paced',
  'runs',
  'and',
  'ev

### Stopwords

In [7]:
# Get a list of stopwords containing all components of shoe names
# which are usually capitalised or contains a string of letters and numbers.
name_stop = set()
df['name'].str.lower().str.split().apply(name_stop.update)
name_stop = list(name_stop)
print(name_stop[:10])

['lunareclipse', 'altra', '22', '25', 'speedtrak', 'mid', '11', 'lt', '280', 'connect']


In [22]:
'flytefoam' in name_stop

False

In [8]:
len(stopwords.words('english'))

179

In [9]:
# Load stop words.
stop_words = stopwords.words('english') + name_stop
print(len(stop_words))

# Show stop words.
stop_words[:10]

836


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [11]:
# Remove stopwords, punctuations, single-letter words, names.

# remove the apostrophes, append to stop words list
stop_words = list(set(stop_words + [word.replace("'", '') for word in stop_words]))

# Remove stop words for good_summary
tokenized_clean_good_summary = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_good_summary]

# Remove stop words for info
tokenized_clean_info = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_info]

# Remove stop words for size_fit
tokenized_clean_size_fit = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_size_fit]

# Remove stop words for outsole
tokenized_clean_outsole = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_outsole]

# Remove stop words for midsole
tokenized_clean_midsole = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_midsole]

# Remove stop words for upper
tokenized_clean_upper = [[word.lower() for word in summary if word.lower() not in stop_words]
                                for summary in tokenized_upper]

In [12]:
# Check ...
tokenized_clean_good_summary

[['many',
  'people',
  'liked',
  'aesthetics',
  'emphasizing',
  'freshness',
  'color',
  'schemes',
  'zigzag',
  'stitching',
  'appreciated',
  'kept',
  'façade',
  'durable',
  'lot',
  'testers',
  'welcomed',
  'full',
  'length',
  'unit',
  'responsibly',
  'cushioned',
  'foot',
  'attenuated',
  'impact',
  'shock',
  'grip',
  'capability',
  'outsole',
  'unit',
  'received',
  'praise',
  'lot',
  'consumers',
  'based',
  'several',
  'runners',
  'responses',
  'functioned',
  'well',
  'across',
  'variety',
  'activities',
  'like',
  'training',
  'even',
  'paced',
  'runs',
  'even',
  'half',
  'marathons',
  'several',
  'wearers',
  'admired',
  'overall',
  'design',
  'shoe',
  'said',
  'comfortable',
  'wearers',
  'mentioned',
  'shoe',
  'lightweight',
  'construction',
  'runs',
  'true',
  'size',
  'mentioned',
  'user',
  'reviews'],
 ['nice',
  'enabled',
  'excellent',
  'transition',
  'adequate',
  'according',
  'impressed',
  'user',
  'sever

#### Lemmatization

In [13]:
lemmatizer = WordNetLemmatizer()

# For good_summary
tokenized_clean_lem_good_summary = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_good_summary]
# after lemmatisation, check for stop words again
tokenized_clean_lem_good_summary = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_good_summary]

# For info
tokenized_clean_lem_info = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_info]
# after lemmatisation, check for stop words again
tokenized_clean_lem_info = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_info]

# For size_fit
tokenized_clean_lem_size_fit = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_size_fit]
# after lemmatisation, check for stop words again
tokenized_clean_lem_size_fit = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_size_fit]

# For outsole
tokenized_clean_lem_outsole = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_outsole]
# after lemmatisation, check for stop words again
tokenized_clean_lem_outsole = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_outsole]

# For midsole
tokenized_clean_lem_midsole = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_midsole]
# after lemmatisation, check for stop words again
tokenized_clean_lem_midsole = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_midsole]

# For upper
tokenized_clean_lem_upper = [[lemmatizer.lemmatize(word) for word in summary] 
                        for summary in tokenized_clean_upper]
# after lemmatisation, check for stop words again
tokenized_clean_lem_upper = [[word for word in summary if word not in stop_words]
                       for summary in tokenized_clean_lem_upper]

In [14]:
# Check ...
tokenized_clean_lem_good_summary

[['many',
  'people',
  'liked',
  'aesthetic',
  'emphasizing',
  'freshness',
  'color',
  'scheme',
  'zigzag',
  'stitching',
  'appreciated',
  'kept',
  'façade',
  'durable',
  'lot',
  'tester',
  'welcomed',
  'full',
  'length',
  'unit',
  'responsibly',
  'cushioned',
  'foot',
  'attenuated',
  'impact',
  'shock',
  'grip',
  'capability',
  'outsole',
  'unit',
  'received',
  'praise',
  'lot',
  'consumer',
  'based',
  'several',
  'runner',
  'functioned',
  'well',
  'across',
  'variety',
  'activity',
  'like',
  'training',
  'even',
  'paced',
  'even',
  'half',
  'marathon',
  'several',
  'wearer',
  'admired',
  'overall',
  'design',
  'shoe',
  'said',
  'comfortable',
  'wearer',
  'mentioned',
  'shoe',
  'lightweight',
  'construction',
  'true',
  'size',
  'mentioned',
  'user',
  'review'],
 ['nice',
  'enabled',
  'excellent',
  'transition',
  'adequate',
  'according',
  'impressed',
  'user',
  'several',
  'wearer',
  'commended',
  'shoe',
  'r

#### Bigram tokens

In [15]:
# For good_summary
bigrms_good_summary = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_good_summary]
bigrms_good_summary = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_good_summary]
bigrms_clean_lem_good_summary = [shoe + tokenized_clean_lem_good_summary[i] for i, shoe in enumerate(bigrms_good_summary)]

# For info
bigrms_info = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_info]
bigrms_info = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_info]
bigrms_clean_lem_info = [shoe + tokenized_clean_lem_info[i] for i, shoe in enumerate(bigrms_info)]

# For size_fit
bigrms_size_fit = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_size_fit]
bigrms_size_fit = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_size_fit]
bigrms_clean_lem_size_fit = [shoe + tokenized_clean_lem_size_fit[i] for i, shoe in enumerate(bigrms_size_fit)]

# For outsole
bigrms_outsole = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_outsole]
bigrms_outsole = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_outsole]
bigrms_clean_lem_outsole = [shoe + tokenized_clean_lem_outsole[i] for i, shoe in enumerate(bigrms_outsole)]

# For midsole
bigrms_midsole = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_midsole]
bigrms_midsole = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_midsole]
bigrms_clean_lem_midsole = [shoe + tokenized_clean_lem_midsole[i] for i, shoe in enumerate(bigrms_midsole)]

# For upper
bigrms_upper = [list(nltk.bigrams(shoe)) for shoe in tokenized_clean_lem_upper]
bigrms_upper = [[' '.join(list(bi)) for bi in shoe] for shoe in bigrms_upper]
bigrms_clean_lem_upper = [shoe + tokenized_clean_lem_upper[i] for i, shoe in enumerate(bigrms_upper)]

### GENSIM for topic modelling

In [16]:
# # Create gensim dictionary & bag-of-words corpus for bigrams

# For good_summary
dictionary_good_summary = Dictionary(bigrms_clean_lem_good_summary)
corpus_good_summary = [dictionary_good_summary.doc2bow(shoe) for shoe in bigrms_clean_lem_good_summary]

# For info
dictionary_info = Dictionary(bigrms_clean_lem_info)
corpus_info = [dictionary_info.doc2bow(shoe) for shoe in bigrms_clean_lem_info]

# For size_fit
dictionary_size_fit = Dictionary(bigrms_clean_lem_size_fit)
corpus_size_fit = [dictionary_size_fit.doc2bow(shoe) for shoe in bigrms_clean_lem_size_fit]

# For outsole
dictionary_outsole = Dictionary(bigrms_clean_lem_outsole)
corpus_outsole = [dictionary_outsole.doc2bow(shoe) for shoe in bigrms_clean_lem_outsole]

# For midsole
dictionary_midsole = Dictionary(bigrms_clean_lem_midsole)
corpus_midsole = [dictionary_midsole.doc2bow(shoe) for shoe in bigrms_clean_lem_midsole]

# For upper
dictionary_upper = Dictionary(bigrms_clean_lem_upper)
corpus_upper = [dictionary_upper.doc2bow(shoe) for shoe in bigrms_clean_lem_upper]

In [17]:
def topwords(bow, dictionary, num_words=10):
    bow_doc = sorted(bow, key=lambda w: w[1], reverse=True)
    for word_id, word_count in bow_doc[:num_words]:
        print(dictionary.get(word_id), word_count)

#### Bag-of-words

In [18]:
# For good_summary
print("---------- \nFor good_summary:")
print(topwords(corpus_good_summary[0], dictionary_good_summary, num_words=5))

# For info
print("---------- \nFor info:")
print(topwords(corpus_info[0], dictionary_info, num_words=5))

# For size_fit
print("---------- \nFor size_fit:")
print(topwords(corpus_size_fit[0], dictionary_size_fit, num_words=5))

# For outsole
print("---------- \nFor outsole:")
print(topwords(corpus_outsole[0], dictionary_outsole, num_words=5))

# For midsole
print("---------- \nFor midsole:")
print(topwords(corpus_midsole[0], dictionary_midsole, num_words=5))

# For upper
print("---------- \nFor upper:")
print(topwords(corpus_upper[0], dictionary_upper, num_words=5))

---------- 
For good_summary:
even 2
lot 2
mentioned 2
several 2
shoe 2
None
---------- 
For info:
shoe 3
unit 3
length 2
platform 2
provide 2
None
---------- 
For size_fit:
wide 3
b 2
follows 2
foot 2
medium 2
None
---------- 
For outsole:
rubber 3
platform 2
purpose 2
traction 2
abrasion 1
None
---------- 
For midsole:
also 3
cushioning 2
cushlon 2
also designed 1
also present 1
None
---------- 
For upper:
make 2
part 2
upper 2
acting 1
acting eyelet 1
None


#### TF-IDF

In [19]:
# For good_summary
print("---------- \nFor good_summary:")
tfidf_good_summary = TfidfModel(corpus_good_summary)
topwords(tfidf_good_summary[corpus_good_summary[2]], dictionary_good_summary, num_words=10)

# For info
print("---------- \nFor info:")
tfidf_info = TfidfModel(corpus_info)
topwords(tfidf_info[corpus_info[2]], dictionary_info, num_words=10)

# For size_fit
print("---------- \nFor size_fit:")
tfidf_size_fit = TfidfModel(corpus_size_fit)
topwords(tfidf_size_fit[corpus_size_fit[2]], dictionary_size_fit, num_words=10)

# For outsole
print("---------- \nFor outsole:")
tfidf_outsole = TfidfModel(corpus_outsole)
topwords(tfidf_outsole[corpus_outsole[2]], dictionary_outsole, num_words=10)

# For midsole
print("---------- \nFor midsole:")
tfidf_midsole = TfidfModel(corpus_midsole)
topwords(tfidf_midsole[corpus_midsole[2]], dictionary_midsole, num_words=10)

# For upper
print("---------- \nFor upper:")
tfidf_upper = TfidfModel(corpus_upper)
topwords(tfidf_upper[corpus_upper[2]], dictionary_upper, num_words=10)

---------- 
For good_summary:
allows natural 0.16563089018849114
also available 0.16563089018849114
durable also 0.16563089018849114
feature underfoot 0.16563089018849114
hour said 0.16563089018849114
option purchaser 0.16563089018849114
people couple 0.16563089018849114
version casual 0.16563089018849114
walk gym 0.16563089018849114
workout according 0.16563089018849114
---------- 
For info:
25th 0.1356685694679757
25th anniversary 0.1356685694679757
adaptive breathable 0.1356685694679757
aim level 0.1356685694679757
anniversary 0.1356685694679757
anniversary edition 0.1356685694679757
brings 25th 0.1356685694679757
comfortable another 0.1356685694679757
edition includes 0.1356685694679757
experience update 0.1356685694679757
---------- 
For size_fit:
guarantee secure 0.27789545017417105
shoe guarantee 0.27789545017417105
sizing anatomical 0.27789545017417105
guarantee 0.2493286612625887
based standard 0.2493286612625887
fit accommodate 0.2493286612625887
size based 0.2493286612625887

In [23]:
# Comparison of similarity

index_good_summary = similarities.MatrixSimilarity(tfidf_good_summary[corpus_good_summary])
index_info = similarities.MatrixSimilarity(tfidf_info[corpus_info])
index_size_fit = similarities.MatrixSimilarity(tfidf_size_fit[corpus_size_fit])
index_outsole = similarities.MatrixSimilarity(tfidf_outsole[corpus_outsole])
index_midsole = similarities.MatrixSimilarity(tfidf_midsole[corpus_midsole])
index_upper = similarities.MatrixSimilarity(tfidf_upper[corpus_upper])

In [24]:
df_sims_good_summary = pd.DataFrame(list(index_good_summary))
df_sims_info = pd.DataFrame(list(index_info))
df_sims_size_fit = pd.DataFrame(list(index_size_fit))
df_sims_outsole = pd.DataFrame(list(index_outsole))
df_sims_midsole = pd.DataFrame(list(index_midsole))
df_sims_upper = pd.DataFrame(list(index_upper))

In [26]:
# For good_summary:
sims_good_summary = pd.DataFrame({'shoenames': df.name, 'similarity':index_good_summary[tfidf_good_summary[corpus_good_summary[0]]]})
sims_good_summary.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
419,New Balance Vazee Rush,0.086434
105,Altra Escalante Racer,0.071358
749,Scott Palani RC,0.066086
184,Adidas Solar Drive ST,0.064128
683,Puma Ignite v2,0.063932
520,Hoka One One Evo Jawz,0.063206
98,Adidas Adizero Adios 4,0.060558
94,Salomon Sonic RA Max,0.060427
173,Nike LunarGlide 7,0.057571


In [27]:
# For info:
sims_info = pd.DataFrame({'shoenames': df.name, 'similarity':index_info[tfidf_info[corpus_info[0]]]})
sims_info.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
241,Adidas Aerobounce PR,0.067877
150,Nike Flyknit Lunar 3,0.062231
54,Adidas Solar Glide,0.059419
642,Asics Tartherzeal 6,0.059219
376,Nike Free RN Distance 2,0.058179
78,Nike Free RN,0.051759
68,Brooks Launch 5,0.051163
845,Mizuno Wave Cruise 10,0.04954
147,Mizuno Wave Shadow 2,0.047222


In [28]:
# For size_fit:
sims_size_fit = pd.DataFrame({'shoenames': df.name, 'similarity':index_size_fit[tfidf_size_fit[corpus_size_fit[0]]]})
sims_size_fit.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
518,Skechers GOmeb Razor 2,0.340805
7,Brooks Adrenaline GTS 18,0.297518
23,Nike Free RN 2018,0.252547
197,New Balance Fresh Foam 1080 v7,0.236906
64,Nike Air Zoom Structure 22,0.218359
477,361 Degrees Meraki,0.212897
26,New Balance Fresh Foam 1080 v9,0.211851
128,Brooks Adrenaline GTS 17,0.209855
73,Hoka One One Arahi 2,0.202551


In [29]:
# For outsole:
sims_outsole = pd.DataFrame({'shoenames': df.name, 'similarity':index_outsole[tfidf_outsole[corpus_outsole[0]]]})
sims_outsole.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
414,Nike Air Zoom Pegasus 34 Shield,0.1013
300,Nike Air Zoom Elite 8,0.097985
174,Nike Air Zoom Vomero 12,0.092316
6,Nike Air Zoom Pegasus 36,0.079585
315,Nike Air Zoom Vomero 11,0.07904
195,Nike Air Zoom Odyssey 2,0.07346
697,Asics GT 2000 6 Lite-Show,0.073153
35,Nike Air Zoom Vomero 14,0.072571
503,Nike Zoom Fly 3,0.070738


In [30]:
# For midsole:
sims_midsole = pd.DataFrame({'shoenames': df.name, 'similarity':index_midsole[tfidf_midsole[corpus_midsole[0]]]})
sims_midsole.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
339,Salomon Sonic Aero,0.072004
284,New Balance 1500 v4,0.059925
304,Salomon XA Enduro,0.053391
6,Nike Air Zoom Pegasus 36,0.051327
120,Hoka One One Clifton 3,0.051099
579,Nike Air Zoom Span,0.045097
165,Asics Alpine XT,0.043473
525,Nike Dart 11,0.042804
478,Nike Air Zoom Elite 10,0.03938


In [31]:
# For upper:
sims_upper = pd.DataFrame({'shoenames': df.name, 'similarity':index_upper[tfidf_upper[corpus_upper[0]]]})
sims_upper.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,shoenames,similarity
0,Nike Air Zoom Pegasus 35,1.0
6,Nike Air Zoom Pegasus 36,0.080964
437,Brooks Neuro 3,0.077146
64,Nike Air Zoom Structure 22,0.075988
21,Nike Air Zoom Pegasus 34,0.074885
426,Hoka One One Mach 2,0.073746
284,New Balance 1500 v4,0.068829
376,Nike Free RN Distance 2,0.067777
174,Nike Air Zoom Vomero 12,0.066826
23,Nike Free RN 2018,0.066819


In [32]:
shoes = df.name
shoes

0            Nike Air Zoom Pegasus 35
1                     Brooks Ghost 11
2                 Asics Gel Kayano 25
3                 Asics Gel Venture 6
4                Hoka One One Bondi 6
5               Merrell Trail Glove 4
6            Nike Air Zoom Pegasus 36
7            Brooks Adrenaline GTS 18
8             Nike Zoom Pegasus Turbo
9                 Asics Gel Nimbus 21
10                 Brooks Glycerin 16
11                 Brooks Glycerin 17
12             Hoka One One Clifton 5
13          Nike Free RN Flyknit 2018
14          Nike Epic React Flyknit 2
15               Adidas Pure Boost Go
16               Adidas Ultraboost 19
17                 Adidas Solar Boost
18              Nike Zoom Fly Flyknit
19                Altra Escalante 1.5
20           Hoka One One Speedgoat 3
21           Nike Air Zoom Pegasus 34
22                  Hoka One One Cavu
23                  Nike Free RN 2018
24               Mizuno Wave Rider 22
25                       Brooks Revel
26     New B

In [33]:
# Save to pickle.
version = 'v8-386-bigram'
to_pickle = [
    'df_sims_good_summary',
    'corpus_good_summary',
    'tfidf_good_summary',
    'dictionary_good_summary',
    'df_sims_info',
    'corpus_info',
    'tfidf_info',
    'dictionary_info',
    'df_sims_size_fit',
    'corpus_size_fit',
    'tfidf_size_fit',
    'dictionary_size_fit',
    'df_sims_outsole',
    'corpus_outsole',
    'tfidf_outsole',
    'dictionary_outsole',
    'df_sims_midsole',
    'corpus_midsole',
    'tfidf_midsole',
    'dictionary_midsole',
    'df_sims_upper',
    'corpus_upper',
    'tfidf_upper',
    'dictionary_upper',
    'shoes'
]
for var in to_pickle:
    pickle.dump(eval(var), open('tfidf_files/{}_{}.pkl'.format(var, version), 'wb'))

In [34]:
# save shoe indexing as pickle file
pickle.dump(sims_good_summary.shoenames, open('tfidf_files/shoe_mapping.pkl', 'wb'))