In [85]:

# importing dependecies
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from urllib.parse import urlparse, urlsplit
import re
import os

# 1. Part Performing ETL

In [108]:
class DataPrep:
    def __init__(self, url=None):
        self.url = url
        self.domain = self.get_domain(url)
        
        self.weights={}
        self.data={}
        self.domains_visited = []


        self.res={}
        self.rows=[]

        self.attributes = ['title', 'vendor', 'product_type', 'tags', 'handle']
        


    def validate_url(self):
        try:
            url = f'https://www.{self.domain}/collections/all/products.json?page=1'
            products = requests.get(url).json()['products']
            print(f"Extracting data from the url:", url)

        except:
            raise requests.HTTPError('Invalid url. Kindly check the same.')


    def get_domain(self, url):

        parsed_url = urlsplit(url)
        domain= parsed_url.netloc[4:]


        return domain
    
    def extract_data(self):

        # extracting the data upto 5 pages

        try:
            for i in range(1,6):

                # Send a request to the URL and get the page content
                url = f'https://www.{self.domain}/collections/all/products.json?page={i}'
                print(f"Extracting data from the url:", url)
                products = requests.get(url).json()['products']
                # content = response.content

                # # Parse the HTML content using Beautiful Soup
                # soup = BeautifulSoup(content, 'html.parser')

                # # Find the products on the page
                # products = soup.find_all('a', {'class': 'grid__image'})


                for product in products:
                    handle = product['handle']
                    product_url = f'https://www.{self.domain}/products/{handle}.json'
                    # print(f'{url=}')

                    product_details = requests.get(product_url).json()['product']

                    row = [product_details[i] for i in self.attributes]

                    self.rows.append(row)

            print("Data extraction successful.")


        except:

            raise requests.ConnectionError( 'Invalid url, Failed to establish a connection. Please check the url carefully.')
            
        
    
    def save_data(self):

        

        print('Transforming and Saving extracted data.')

        try:

            # converting into dataframe required in further operations
            self.data = pd.DataFrame(data=self.rows, columns=self.attributes)
        

            # saving only the handles required for fetching the index of cosine matrix
            handles = self.data[['handle']]

            csv_path = f'data/{self.domain}.csv'
            # create a directory for weights is doesn't exist 
            os.makedirs('data', exist_ok=True)


            handles.to_csv(csv_path, index=False)
            print(f'{csv_path} saved successfully.')


        except Exception as e:
            raise ValueError(e)


    def save_weights(self):

        print('Calculating and saving import weight file.')

        try:

            # create a TF-IDF matrix
            vectorizer = TfidfVectorizer(stop_words='english')
            matrix = vectorizer.fit_transform(self.data['title'] + ' '+ self.data['vendor'] + ' ' + self.data['product_type'] + ' ' + self.data['tags'])

            # get the cosine-similary as per the terms which are matching highest common
            similarity = cosine_similarity(matrix)

            weight_path = f'weights/{self.domain}.npy'
            # create a directory for weights is doesn't exist 
            os.makedirs('weights', exist_ok=True)

            np.save(weight_path, similarity)
            print(f'{weight_path} saved successfully.')

        except Exception as e:
            raise ValueError(e)






    def save(self):

        try:

            if  self.domain not in self.domains_visited:
                print("Please wait while processing in the backend.")

                if not( os.path.exists(f'data/{self.domain}.csv') and os.path.exists(f'weights/{self.domain}.npy') ):
                    print('Data doesn\'t exist, it may take around a minute. Time may vary depends upon the network speed.')
                    print("\nStep #1/3")
                    self.extract_data()
                    print("\nStep #2/3")
                    self.save_data()
                    print("\nStep #3/3")
                    self.save_weights()

                else:
                    print('Data already exists.')
                self.domains_visited.append(self.domain)


            print(f'\nAll necessary files is saved for {self.url}.\n')

            print('CONGRATULATIONS! You can proceed with 2nd Step.')



       

        except Exception as e:


            return e


            # print('Sorry, Unexpected error happened!!', e)


### 1.1  Let's extract data for the domain 'https://www.boysnextdoor-apparel.co'

In [87]:
DataPrep('https://www.boysnextdoor-apparel.co').save()

Please wait while processing in the backend.
Data doesn't exist, it may take around a minute. Time may vary depends upon the network speed.

Step #1/3
Extracting data from the url: https://www.boysnextdoor-apparel.co/collections/all/products.json?page=1
Extracting data from the url: https://www.boysnextdoor-apparel.co/collections/all/products.json?page=2
Extracting data from the url: https://www.boysnextdoor-apparel.co/collections/all/products.json?page=3
Extracting data from the url: https://www.boysnextdoor-apparel.co/collections/all/products.json?page=4
Extracting data from the url: https://www.boysnextdoor-apparel.co/collections/all/products.json?page=5
Data extraction successful.

Step #2/3
Transforming and Saving extracted data.
data/boysnextdoor-apparel.co.csv saved successfully.

Step #3/3
Calculating and saving import weight file.
weights/boysnextdoor-apparel.co.npy saved successfully.

All necessary files is saved for https://www.boysnextdoor-apparel.co.

CONGRATULATIONS! You

### 1.2 Try again extracting data for the same domain 'https://www.boysnextdoor-apparel.co'

In [88]:
DataPrep('https://www.boysnextdoor-apparel.co').save()

Please wait while processing in the backend.
Data already exists.

All necessary files is saved for https://www.boysnextdoor-apparel.co.

CONGRATULATIONS! You can proceed with 2nd Step.


### 1.3 Try an invalid url

In [109]:
# NOTE: using '.com' instead of '.co'

DataPrep('https://www.boysnextdoor-apparel.com').save()


# Invalid URL Exception is thrown

Please wait while processing in the backend.
Data doesn't exist, it may take around a minute. Time may vary depends upon the network speed.

Step #1/3
Extracting data from the url: https://www.boysnextdoor-apparel.com/collections/all/products.json?page=1


requests.exceptions.ConnectionError('Invalid url, Failed to establish a connection. Please check the url carefully.')

# 2. Part : Testing the API


### 2.1 Loading the previously processed data for the domain 'https://www.boysnextdoor-apparel.co'

In [89]:
class DataLoader:
    def __init__(self):
        self.weights={}
        self.data={}
        self.domains_visited = []


        self.res={}


    def get_domain(self, url):
        parsed_url = urlsplit(url)
        domain= parsed_url.netloc[4:]

        if domain not in self.domains_visited:
            self.domains_visited.append(domain)


        return domain


    def get_item(self, url):

        item = url.split('/')[-1]

        item = re.sub(r'\.json$', '', item)

        return item



    def get_data(self,domain):



        try:


            if domain not in self.data:
                self.data[domain] = pd.read_csv(f'data/{domain}.csv')

            print('data loading success')
            return self.data[domain]
        
        except:
            raise OSError( f'data/{domain}.csv doesn\'t exist.')

        

        

    def get_weight(self, domain):


        try:
            if domain not in self.weights:
                self.weights[domain] = np.load(f'weights/{domain}.npy')

            print('weight loading success')
            return self.weights[domain]
        

        except:
            raise OSError ( f'weights/{domain}.npy doesn\'t exist.')
    

    def get_most_similarity_idx(self, product_similarity):
        similarity = sorted(product_similarity, reverse=True)

        threshold = similarity[0] - similarity[1] + 0.07

        for i in range(1, 10):
            diff = similarity[0] - similarity[i]
            
            if diff > threshold:
                break
        return i


    def FindAlternateGroups(self, url):
        item = self.get_item(url)

        try:

            # optimization to prevent re-processing for the same item
            if item in self.res:
                return self.res[item]

            print(f'{item=}')

            domain = self.get_domain(url)

            print(f'{domain=}')

            data = self.get_data(domain)

            
            try:
                idx = data[data['handle'] == item].index[0]

            except:
                raise ValueError (f'sorry this item `{item}` is not taken into consideration')

            

            similarity = self.get_weight(domain)

            self.product_similarity = similarity[idx]

            # sort the similarity scores in descending order
            self.similar_indices = self.product_similarity.argsort()[::-1]

        

            # get the elements whose difference with the original item is under a certain threshold
            self.most_sim_idx = self.get_most_similarity_idx(self.product_similarity)
            # generate recommendations
            recommendations = self.similar_indices[:self.most_sim_idx]

            

            res = data.iloc[recommendations].copy()


            output_url = re.sub(item, '', url)

            res['handle'] = (output_url + res['handle'])

            final_res = {
                'product alternates': list(res['handle'].values)
            }


            self.res[item] = final_res


            return final_res
        
        except Exception as e:

            return e
        
    




# DataPrep('https://www.boysnextdoor-apparel.co').save()

In [90]:

# dataloader initialization
solution = DataLoader()

### 2.2 Testing the API with a product url 

In [91]:
name = 'beams-two-pocket-sweater-navy' 
item_url = f'https://www.boysnextdoor-apparel.co/products/{name}'

res = solution.FindAlternateGroups(item_url)

print(res)

item='beams-two-pocket-sweater-navy'
domain='boysnextdoor-apparel.co'
data loading success
weight loading success
{'product alternates': ['https://www.boysnextdoor-apparel.co/products/beams-two-pocket-sweater-navy', 'https://www.boysnextdoor-apparel.co/products/beams-two-pocket-sweater-white']}


### 2.3 Testing another product url

In [92]:
name = 'ben-davis-heavy-duty-pocket-tee-black' 
item_url = f'https://www.boysnextdoor-apparel.co/products/{name}'

res = solution.FindAlternateGroups(item_url)
print(res)

item='ben-davis-heavy-duty-pocket-tee-black'
domain='boysnextdoor-apparel.co'
data loading success
weight loading success
{'product alternates': ['https://www.boysnextdoor-apparel.co/products/ben-davis-heavy-duty-pocket-tee-black', 'https://www.boysnextdoor-apparel.co/products/ben-davis-heavy-duty-pocket-tee-white', 'https://www.boysnextdoor-apparel.co/products/ben-davis-pocket-l-s-tee-black', 'https://www.boysnextdoor-apparel.co/products/ben-davis-heavy-duty-pocket-tee-charcoal', 'https://www.boysnextdoor-apparel.co/products/ben-davis-heavy-duty-pocket-tee-ash-grey']}


### 2.4 Testing the api again with incorrect product url

In [93]:
name = 'beams-two-pocket-sweater-navy' 
item_url = f'https://www.boysnextdoor-apparel.com/products/{name}'

# NOTE: '.com' used instead of '.co' in the domain

res = solution.FindAlternateGroups(item_url)

print(res)

{'product alternates': ['https://www.boysnextdoor-apparel.co/products/beams-two-pocket-sweater-navy', 'https://www.boysnextdoor-apparel.co/products/beams-two-pocket-sweater-white']}


### 2.5 Testing the api once more with incorrect product url

In [94]:
name = 'beams-two-pocket-sweater-navy-blue' 
item_url = f'https://www.boysnextdoor-apparel.co/products/{name}'

# NOTE: 'navy-blue' used instead of 'navy' in the product name

res = solution.FindAlternateGroups(item_url)

print(res)

item='beams-two-pocket-sweater-navy-blue'
domain='boysnextdoor-apparel.co'
data loading success
sorry this item `beams-two-pocket-sweater-navy-blue` is not taken into consideration
