In [1]:
#Dataset downloaded from:: https://nijianmo.github.io/amazon/index.html

# Importing Libraries & Basic Preprocesing 

In [2]:
#import all the necessary packages.

from PIL import Image
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import math
import time
import re
import os
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack

warnings.filterwarnings("ignore")

In [3]:
# loading the data using pandas' read_json file.
data = pd.read_json('tops_fashion.json')

In [4]:
data.head(3)

Unnamed: 0,sku,asin,product_type_name,formatted_price,author,color,brand,publisher,availability,reviews,large_image_url,availability_type,small_image_url,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew
0,,B016I2TS4W,SHIRT,,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,
1,,B01N49AI08,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,
2,,B01JDPCOHO,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,


In [5]:
#data.to_csv('amazon_apparel.csv')

In [6]:
print('='*80)
print("Number of data points in data=", data.shape[0])
print('-'*80)
print("Number of Columns in data=", data.shape[1])
print('-'*80)
print("The attributes(columns or features) of data =\n", data.columns.values)
print('='*80)

Number of data points in data= 183138
--------------------------------------------------------------------------------
Number of Columns in data= 19
--------------------------------------------------------------------------------
The attributes(columns or features) of data =
 ['sku' 'asin' 'product_type_name' 'formatted_price' 'author' 'color'
 'brand' 'publisher' 'availability' 'reviews' 'large_image_url'
 'availability_type' 'small_image_url' 'editorial_review' 'title' 'model'
 'medium_image_url' 'manufacturer' 'editorial_reivew']


`1. asin ( Amazon standard identification number)`
`2. brand ( brand to which the product belongs to )` 
`3. color ( Color information of apparel, it can contain many colors as a value ex: red and black stripes )`
`4. product_type_name (type of the apperal, ex: SHIRT/TSHIRT )`
`5. medium_image_url ( url of the image ) `
`6. title (title of the product.) `
`7. formatted_price (price of the product)`

In [7]:
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

In [8]:
data.head(3)

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,


In [9]:
data.isna().sum()

asin                      0
brand                   151
color                118182
medium_image_url          0
product_type_name         0
title                     0
formatted_price      154743
dtype: int64

In [10]:
data.isna().sum()/len(data)*100

asin                  0.000000
brand                 0.082451
color                64.531665
medium_image_url      0.000000
product_type_name     0.000000
title                 0.000000
formatted_price      84.495299
dtype: float64

In [11]:
#data=data[['asin', 'brand', 'medium_image_url', 'product_type_name', 'title']]

In [12]:
# consider products which have color information
# data['color'].isnull() => gives the information about the dataframe row's which have null values price == None|Null
data =data.loc[~data['color'].isnull()]
print('='*80)
print('Number of data points After eliminating color=NULL :', data.shape[0])
print('='*80)

Number of data points After eliminating color=NULL : 64956


In [13]:
data = data.loc[~data['formatted_price'].isnull()]
print('='*80)
print('Number of data points After eliminating price=NULL :', data.shape[0])
print('='*80)

Number of data points After eliminating price=NULL : 28385


In [14]:
data.shape

(28385, 7)

In [15]:
data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]
print('='*80)
print("After removal of products with short description:", data_sorted.shape[0])
print('='*80)

After removal of products with short description: 27949


In [16]:
# Sort the whole data based on title (alphabetical order of title) 
data_sorted.sort_values('title',inplace=True, ascending=False)
data_sorted.tail(3)

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
109599,B00KI3VDXM,Crazy4Bling,Purple,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""I Wanna Be Adored"" Long Sleeve Top with Shred...",$39.99
78827,B003IDE8XQ,Maggie's Organics,Grey,https://images-na.ssl-images-amazon.com/images...,HOME,"""Camisoles Grey - Medium Fair Labor, 1 pc""",$18.99
118987,B008D30AGK,Out+of+Print+Clothing,Multicolored,https://images-na.ssl-images-amazon.com/images...,SHIRT,"""1984"" Retro Book Cover Women's SLim Fit T-Shi...",$7.51


In [17]:
data_sorted.shape

(27949, 7)

In [18]:
data_1=data_sorted.sample(13000)

In [19]:
data_2=data_sorted.sample(13000)

In [20]:
data_1.shape

(13000, 7)

In [21]:
data_2.shape

(13000, 7)

In [22]:
#df.to_excel("output.xlsx")

In [23]:
#data_1.to_excel('amazon_apparel_sheet_1.xlsx')

In [24]:
#data_2.to_excel('amazon_apparel_sheet_2.xlsx')

# Text Prepocessing & Vectorization of Text Data

In [25]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\DR SNEHAL
[nltk_data]     BANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# we use the list of stop words that are downloaded from nltk lib.
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)

def nlp_preprocessing(total_text, index, column,df):
    if type(total_text) is not int:
        string = ""
        for words in total_text.split():
            # remove the special chars in review like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(e for e in words if e.isalnum()))
            # Conver all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in stop_words:
                string += word + " "
        df[column][index] = string

list of stop words: {'because', 'when', 'their', 'a', "shouldn't", 'mustn', 'my', 't', 'they', 'for', 'hers', 'you', 'is', 'were', 'being', 'not', 'wasn', 'both', 'himself', 'themselves', 'y', 'against', 'be', 'these', 'who', "couldn't", 'under', 'during', 'from', "you're", 'further', "you'd", 'myself', 'am', 'she', 'very', 'as', 'have', 'most', 'which', 'now', 's', 'hadn', 'so', 'he', 'nor', 're', 'our', 'there', "it's", 'before', "she's", 'does', 'don', 'that', 'yours', 'did', 'couldn', 'are', 'after', "don't", 'his', 'this', 'o', 'whom', "shan't", 'or', 'down', 'how', 'same', 'isn', 'by', 'the', 'than', 'we', 'those', 'needn', "mustn't", "wasn't", "you've", 'it', 'was', 'into', 'here', 'won', 'once', "won't", 'on', 've', 'few', 'will', 'ourselves', 'm', 'where', 'itself', 'll', 'some', 'with', 'if', 'between', 'own', "aren't", 'doesn', "needn't", 'doing', 'd', 'other', 'such', 'below', 'why', "mightn't", 'should', 'more', 'ours', 'off', 'having', 'above', 'over', 'ma', 'an', 'about'

In [27]:
start_time = time.clock()
# we take each title and we text-preprocess it.
for index, row in data_1.iterrows():
    nlp_preprocessing(row['title'], index, 'title',data_1)
# we print the time it took to preprocess whole titles 
print(time.clock() - start_time, "seconds")

9.040406600000004 seconds


In [28]:
data_1['title'].head(2)

120453    retro brand womens large vneck graphic tee tsh...
159166     wessel womens long sleeve scoop neck shirt xs...
Name: title, dtype: object

In [29]:
start_time = time.clock()
# we take each title and we text-preprocess it.
for index, row in data_2.iterrows():
    nlp_preprocessing(row['title'], index, 'title',data_2)
# we print the time it took to preprocess whole titles 
print(time.clock() - start_time, "seconds")

8.579748000000002 seconds


In [30]:
data_2['title'].head(2)

15939        aqua womens crochet scoop neck tank top white 
111493    ta5 women tops blouses summer beach casual loo...
Name: title, dtype: object

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
title_vectorizer = CountVectorizer()
title_vectorizer.fit((data_1['title'].values)+(data_2['title'].values))
title_features_1=title_vectorizer.transform(data_1['title'].values)
title_features_1.get_shape() # get number of rows and columns in feature matrix.

(13000, 11903)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
#title_vectorizer = CountVectorizer(max_features=title_features_1.get_shape()[1])
title_features_2   = title_vectorizer.transform(data_2['title'].values)
title_features_2.get_shape() # get number of rows and columns in feature matrix.

(13000, 11903)

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
pairwise_dist = cosine_similarity(title_features_1[100],title_features_2[100])

In [35]:
pairwise_dist.shape

(1, 1)

In [36]:
np.nonzero(pairwise_dist)

(array([0], dtype=int64), array([0], dtype=int64))

In [37]:
#data_2['title'][165580]

In [38]:
np.amax(pairwise_dist)

0.19069251784911848

In [39]:
def bag_of_words_model(doc_id, num_results):
    # doc_id: apparel's id in given corpus
    
    # pairwise_dist will store the distance from given input apparel to all remaining apparels
    # the metric we used here is cosine, the coside distance is mesured as K(X, Y) = <X, Y> / (||X||*||Y||)
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    print("Title from dataset 2:",data_2.iloc[doc_id]['title'])
    print('='*60)
    pairwise_dist = cosine_similarity(title_features_1,title_features_2[doc_id])
    
    # np.argsort will return indices of the smallest distances
    indices = np.argsort(pairwise_dist.flatten())[::-1][0:num_results]
    #pdists will store the smallest distances
    pdists  = np.sort(pairwise_dist.flatten())[::-1][0:num_results]

    #data frame indices of the 9 smallest distace's
    df_indices = list(data_1.index[indices])
    
    for i in range(0,len(indices)):
        # we will pass 1. doc_id, 2. title1, 3. title2, url, model
        #get_result(indices[i],data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')
        print('ASIN :',data_1['asin'].loc[df_indices[i]])
        print ('Brand:', data_1['brand'].loc[df_indices[i]])
        print ('Title:', data_1['title'].loc[df_indices[i]])
        print ('Cosine similarity with the query point',pdists[i])
        print('='*60)

In [40]:
data_1=data_1.reset_index(drop=True)

In [41]:
data_1[data_1['asin']=='B074JBMY4C'].index

Int64Index([4883], dtype='int64')

In [55]:
bag_of_words_model(400, 5)

Title from dataset 2: noble outfitters brooke tank xs mint geo 
ASIN : B01NA032Y6
Brand: Noble Outfitters
Title: noble outfitters brooke tank l mint geo 
Cosine similarity with the query point 0.9258200997725515
ASIN : B01MUWCR0X
Brand: Noble Outfitters
Title: noble outfitters brooke tank xs grape geo 
Cosine similarity with the query point 0.8571428571428569
ASIN : B01N2AQ8W6
Brand: Noble Outfitters
Title: noble outfitters brooke tank xs black 
Cosine similarity with the query point 0.7715167498104596
ASIN : B01MUTGXEN
Brand: Noble Outfitters
Title: noble outfitters brooke tank black 
Cosine similarity with the query point 0.6761234037828132
ASIN : B01NB1BX9G
Brand: Noble Outfitters
Title: noble outfitters brooke tank l black 
Cosine similarity with the query point 0.6761234037828132


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
tfidf_title_vectorizer = TfidfVectorizer()
tfidf_title_vectorizer.fit((data_1['title'].values)+(data_2['title'].values))

title_feature_1_tfidf=tfidf_title_vectorizer.transform(data_1['title'].values)
title_feature_2_tfidf=tfidf_title_vectorizer.transform(data_2['title'].values)

In [45]:
def tfidf_model(doc_id, num_results):
    # doc_id: apparel's id in given corpus
    
    # pairwise_dist will store the distance from given input apparel to all remaining apparels
    # the metric we used here is cosine, the coside distance is mesured as K(X, Y) = <X, Y> / (||X||*||Y||)
    # http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    print("Title from dataset 2:",data_2.iloc[doc_id]['title'])
    print('='*60)
    pairwise_dist = cosine_similarity(title_feature_1_tfidf,title_feature_2_tfidf[doc_id])
    
    # np.argsort will return indices of the smallest distances
    indices = np.argsort(pairwise_dist.flatten())[::-1][0:num_results]
    #pdists will store the smallest distances
    pdists  = np.sort(pairwise_dist.flatten())[::-1][0:num_results]

    #data frame indices of the 9 smallest distace's
    df_indices = list(data_1.index[indices])
    
    for i in range(0,len(indices)):
        # we will pass 1. doc_id, 2. title1, 3. title2, url, model
        #get_result(indices[i],data['title'].loc[df_indices[0]], data['title'].loc[df_indices[i]], data['medium_image_url'].loc[df_indices[i]], 'bag_of_words')
        print('ASIN :',data_1['asin'].loc[df_indices[i]])
        print ('Brand:', data_1['brand'].loc[df_indices[i]])
        print ('Title:', data_1['title'].loc[df_indices[i]])
        print ('Cosine similarity with the query point',pdists[i])
        print('='*60)

In [54]:
tfidf_model(400, 5)

Title from dataset 2: noble outfitters brooke tank xs mint geo 
ASIN : B01NA032Y6
Brand: Noble Outfitters
Title: noble outfitters brooke tank l mint geo 
Cosine similarity with the query point 0.9789680394268324
ASIN : B01MUWCR0X
Brand: Noble Outfitters
Title: noble outfitters brooke tank xs grape geo 
Cosine similarity with the query point 0.8096057875204586
ASIN : B01N2AQ8W6
Brand: Noble Outfitters
Title: noble outfitters brooke tank xs black 
Cosine similarity with the query point 0.8024140452194966
ASIN : B01MUTGXEN
Brand: Noble Outfitters
Title: noble outfitters brooke tank black 
Cosine similarity with the query point 0.7760661661283565
ASIN : B01NB1BX9G
Brand: Noble Outfitters
Title: noble outfitters brooke tank l black 
Cosine similarity with the query point 0.7760661661283565
