# Customer Review Scraping

In [11]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

In [2]:
def generate_url(product, page, platform='PLT1', time_ordered=False):
    """Build the review-endpoint URL for one page of one product.

    Args:
        product: Product identifier placed in the URL path. Converted with
            ``str()``, so integer ids are accepted as well as strings.
        page: Page index appended as the ``page`` query parameter.
        platform: ``'PLT1'`` or ``'PLT2'``; selects the host and path.
        time_ordered: PLT1 only. When true, ``order=1`` is added in front of
            the page parameter. PLT2 URLs are built the same way regardless
            of this flag.

    Returns:
        The full URL as a string.

    Raises:
        ValueError: If ``platform`` is not one of the supported values.
    """
    if platform == 'PLT1':
        endpoint = 'https://hue.platform1.com' + '/marketplace-web-app-service/api/review/'
        query = '?order=1&page=' if time_ordered else '?page='
    elif platform == 'PLT2':
        endpoint = 'https://www.platform2.com' + '/source/service/product-detail/reviews/'
        query = '?page='
    else:
        # Without this guard the function would reach its return statement
        # with no URL built and fail with an unhelpful UnboundLocalError.
        raise ValueError("Unsupported platform: %r (expected 'PLT1' or 'PLT2')" % (platform,))

    return endpoint + str(product) + query + str(page)

In [3]:
def get_json(url):
    """Fetch ``url`` and return its response body decoded from JSON.

    The body is parsed directly as JSON. Routing it through an HTML parser
    and slicing off the wrapper tags depends on which parser is installed and
    re-escapes characters such as ``&`` and ``<`` inside the review text.

    Args:
        url: Address of a JSON endpoint.

    Returns:
        The decoded JSON document (whatever ``json.loads`` produces for it).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    # A timeout keeps one stalled connection from hanging a multi-page scrape.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors here rather than as a confusing parse/KeyError later.
    response.raise_for_status()
    return json.loads(response.text)

In [4]:
def _reviews_on_page(raw_data_js, platform_id):
    """Return the list of review texts held in one decoded page payload."""
    if platform_id == 'PLT1':
        return [entry["review"] for entry in raw_data_js["result"]["productComments"]["content"]]
    # PLT2 payloads keep their entries under a single top-level key.
    return [entry['COMMENT'] for entry in raw_data_js['COMMENTS']]


def scrape_reviews(product_id_list, platform_id, from_page=0, until_page=6, time_ordered=False):
    """Collect review texts for several products, page by page.

    Args:
        product_id_list: Iterable of product identifiers to scrape.
        platform_id: ``'PLT1'`` or ``'PLT2'``.
        from_page: First page index to request.
        until_page: Upper page bound, at most 100. For PLT1 this page is
            included; for PLT2 it is excluded.
        time_ordered: Forwarded to ``generate_url``.

    Returns:
        A dict mapping each product id to the list of its review texts, in
        the order the pages were fetched.

    Raises:
        AssertionError: If ``until_page < from_page`` or ``until_page > 100``.
        ValueError: If ``platform_id`` is not a supported platform.
    """
    assert until_page >= from_page
    assert until_page <= 100

    if platform_id not in ('PLT1', 'PLT2'):
        # Otherwise neither branch would run and the final return would fail
        # with an UnboundLocalError.
        raise ValueError("Unsupported platform: %r (expected 'PLT1' or 'PLT2')" % (platform_id,))

    # NOTE(review): the two platforms have always treated `until_page`
    # differently (inclusive for PLT1, exclusive for PLT2). That is kept
    # as-is here; confirm whether the difference is intentional.
    if platform_id == 'PLT1':
        page_range = range(from_page, until_page + 1)
    else:
        page_range = range(from_page, until_page)

    multi_product_dict = {}

    for product_id in product_id_list:
        comments_all = []

        for page_ in page_range:
            url = generate_url(product_id, page_, platform_id, time_ordered=time_ordered)
            raw_data_js = get_json(url)
            comments_all.extend(_reviews_on_page(raw_data_js, platform_id))

            # track progress every tenth page
            if page_ % 10 == 0:
                print("Product:", product_id, "Page:", page_)

        multi_product_dict[product_id] = comments_all

    return multi_product_dict

In [5]:
def scrape_and_merge_reviews(product_id, platform):
    """Scrape one product twice and return its de-duplicated reviews.

    The product is scraped up to page 100 with ``time_ordered=True`` and then
    again with ``time_ordered=False``. Both review lists are concatenated
    (time-ordered pass first) into a single-column DataFrame named after the
    product id, and repeated rows are dropped. The original row labels are
    kept, so the index may have gaps.
    """
    # The generator is consumed left to right, so the time-ordered pass runs first.
    ordered_pass, default_pass = (
        scrape_reviews([product_id], platform, until_page=100, time_ordered=flag)
        for flag in (True, False)
    )
    combined = ordered_pass[product_id] + default_pass[product_id]
    return pd.DataFrame({product_id: combined}).drop_duplicates()

def export_reviews(reviews_df, platform):
    """Pickle ``reviews_df`` into the current working directory.

    The file is named ``<platform>_<today as YYYY-MM-DD>_<first column label>``
    with no extension.

    Args:
        reviews_df: DataFrame to store; its first column label becomes the
            last part of the file name (converted with ``str()``, so
            non-string labels are accepted).
        platform: Platform tag used as the file-name prefix.

    Note:
        The output is a pickle; only unpickle files from a trusted source.
    """
    date_ = datetime.today().date().isoformat()
    file_name = platform + "_" + date_ + "_" + str(reviews_df.columns[0])
    # The context manager closes the handle even if pickling fails part-way.
    with open(file_name, 'wb') as f:
        pickle.dump(reviews_df, f)
    

# Main Code

In [7]:
# Scrape product 84859 on PLT1 (time-ordered pass plus default pass, merged
# and de-duplicated), then preview the first rows of the result.
reviews_df = scrape_and_merge_reviews('84859', 'PLT1')
reviews_df.head()

Product: 84859 Page: 0
Product: 84859 Page: 10
Product: 84859 Page: 20
Product: 84859 Page: 30
Product: 84859 Page: 40
Product: 84859 Page: 50
Product: 84859 Page: 60
Product: 84859 Page: 70
Product: 84859 Page: 80
Product: 84859 Page: 90
Product: 84859 Page: 100
Product: 84859 Page: 0
Product: 84859 Page: 10
Product: 84859 Page: 20
Product: 84859 Page: 30
Product: 84859 Page: 40
Product: 84859 Page: 50
Product: 84859 Page: 60
Product: 84859 Page: 70
Product: 84859 Page: 80
Product: 84859 Page: 90
Product: 84859 Page: 100


Unnamed: 0,84859
0,basariliii
1,urunu kullandim guzel tesekkurler
2,Çok güzel çok beğendim
3,saçımdaki egzamayı pul pul dökülmeleri kesti y...
4,"kepeğe çok etkili ve kaşıntıya da,mevsimsel eg..."


In [8]:
# Row count of the merged frame, i.e. reviews left after duplicates were dropped.
len(reviews_df)

4977

In [12]:
# Pickle the frame to the working directory as PLT1_<today's date>_<product id>.
export_reviews(reviews_df, 'PLT1')

### End of notebook