# Data scraper for Trendyol

## Install & Import Dependencies

In [1]:
%pip install selenium pandas numpy beautifulsoup4 -q 

Note: you may need to restart the kernel to use updated packages.


In [2]:
import time
import requests

from bs4 import BeautifulSoup
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

## Define functions

In [3]:
def calculate_ratings(product):
    """Sum the fill fraction of every star found under ``product``.

    Every ``div.star-w`` element is searched for a ``div.full`` child whose
    inline ``style`` holds a ``width`` declaration: either a percentage or
    ``0px`` for an empty star. Each star adds ``width / 100`` to the total.

    Args:
        product: Parsed element exposing ``find_all`` (a BeautifulSoup tag
            when called from this notebook).

    Returns:
        The summed rating rounded to one decimal place, or the string
        ``"unrated"`` when ``product`` contains no ``div.star-w`` element.
    """
    star_w_elements = product.find_all('div', class_='star-w')
    if not star_w_elements:
        return "unrated"

    total_rating = 0
    for star_w in star_w_elements:
        full_div = star_w.find("div", attrs={"class": "full"})
        style = full_div.get('style') if full_div else None
        if not style:
            continue

        try:
            style_width = style.split('width:')[1].split(';')[0].strip()
            # Compare the width value itself: testing the whole style string
            # would also match unrelated declarations such as "height: 10px".
            if style_width == '0px':
                width_percentage = 0
            else:
                width_percentage = float(style_width.split('%')[0])
        except (IndexError, ValueError):
            # Best effort: report the malformed star and keep counting the rest.
            print(f"Unexpected style format: {style}")
            continue

        total_rating += width_percentage / 100

    return round(total_rating, 1)


In [4]:
def scroll_down(reviews_link, driver):
    """Open ``reviews_link`` and scroll step by step to the end of the page.

    The step size is the value of ``window.screen.height`` reported by the
    browser. After each scroll the function waits one second, re-reads
    ``document.body.scrollHeight`` and stops as soon as the next scroll
    target would lie beyond it.

    Args:
        reviews_link: URL passed to ``driver.get``.
        driver: Selenium WebDriver used to load and scroll the page.

    Returns:
        A ``BeautifulSoup`` tree built from ``driver.page_source`` once
        scrolling has finished.
    """
    driver.get(reviews_link)

    step_height = driver.execute_script("return window.screen.height;")
    step = 0
    reached_bottom = False
    while not reached_bottom:
        step += 1
        driver.execute_script(f"window.scrollTo(0, {step_height * step});")
        # The pause is required before measuring the page height again.
        time.sleep(1)
        page_height = driver.execute_script("return document.body.scrollHeight;")
        reached_bottom = step_height * (step + 1) > page_height

    return BeautifulSoup(driver.page_source, "html.parser")

In [5]:
def get_images_from_review(review):
  """Collect the image URLs attached to a single review.

  Looks for a ``div.comment-photos`` container and reads the first
  double-quoted value from the inline ``style`` of each
  ``div.item.review-image`` inside it. The ``/mnresize/144/144`` path
  segment is stripped from every URL.

  Args:
    review: Parsed review element exposing ``find`` (a BeautifulSoup tag
      when called from this notebook).

  Returns:
    A list of image URL strings; empty when the review has no photos.
  """
  image_links = []
  comment_photos = review.find('div', class_='comment-photos')
  if not comment_photos:
    return image_links

  for container in comment_photos.find_all('div', class_='item review-image'):
    style = container.get('style')
    if not style:
      continue

    start_index = style.find('"') + 1
    end_index = style.find('"', start_index)
    # Without a complete "..." pair the slice below would yield a chunk of
    # the style text instead of a URL, so skip such containers.
    if start_index == 0 or end_index == -1:
      continue

    image_link = style[start_index:end_index].replace('/mnresize/144/144', '')
    image_links.append(image_link)

  return image_links

In [6]:
def get_reviews(reviews_link, driver):
  """Scrape all reviews from a product's review page.

  The page is loaded and fully scrolled via ``scroll_down``; every
  ``div.comment`` element is then turned into a dict.

  Args:
    reviews_link: URL of the review page.
    driver: Selenium WebDriver handed to ``scroll_down``.

  Returns:
    A tuple ``(reviews_list, rating_scores)``. ``reviews_list`` holds one
    dict per review with the keys ``rating``, ``reviewer_name``,
    ``review_date``, ``review_text``, ``image_links`` and
    ``seller_name_info``; fields missing from the page are ``None``.
    ``rating_scores`` maps each of 5.0, 4.0, 3.0, 2.0 and 1.0 to the number
    of reviews with exactly that rating.
  """
  all_reviews = scroll_down(reviews_link, driver)
  reviews_list = []
  rating_scores = {5.0: 0, 4.0: 0, 3.0: 0, 2.0: 0, 1.0: 0}

  for review in all_reviews.find_all("div", attrs={"class": "comment"}):
    rating = calculate_ratings(review)
    # calculate_ratings can return "unrated" or a value that is not one of
    # the histogram buckets; indexing blindly would raise KeyError.
    if rating in rating_scores:
      rating_scores[rating] += 1

    text_div = review.find("div", class_="comment-text")
    # First info item is the reviewer name, second (when present) the date.
    info_items = review.find_all("div", class_="comment-info-item")
    seller_span = review.find('span', class_='seller-name-info')

    review_info = {
        'rating': rating,
        'reviewer_name': info_items[0].text.strip() if info_items else None,
        'review_date': info_items[1].text.strip() if len(info_items) > 1 else None,
        'review_text': text_div.text.strip() if text_div else None,
        'image_links': get_images_from_review(review),
        'seller_name_info': seller_span.text.strip() if seller_span else None
    }

    reviews_list.append(review_info)

  return reviews_list, rating_scores

In [7]:
def create_product_dict(index, product, driver):
    """Build the record for one product card, including its reviews.

    Args:
        index: Position of the product in the overall listing; stored as-is
            under ``'index'``.
        product: Parsed product card exposing ``find`` / ``find_all``
            (a BeautifulSoup tag when called from this notebook).
        driver: Selenium WebDriver used to load the product's review page.

    Returns:
        A dict with the keys ``index``, ``brand``, ``name``, ``price``
        (float), ``rating_count`` (int) and ``rating``; values that cannot be
        found on the card are ``None``. When the card contains a link, the
        keys ``reviews`` and ``rating_scores`` (as returned by
        ``get_reviews``) are added as well.
    """
    product_dict = {}

    product_dict['index'] = index

    product_brand_span = product.find('span', class_='prdct-desc-cntnr-ttl')
    product_brand = product_brand_span.text.strip() if product_brand_span else None
    product_dict['brand'] = product_brand

    product_name_span = product.find('span', class_='prdct-desc-cntnr-name')
    product_name = product_name_span.get('title') if product_name_span else None
    product_dict['name'] =  product_name

    product_price_div = product.find('div', class_='prc-box-dscntd')
    product_price = product_price_div.text.strip() if product_price_div else None
    # Price text uses "." as thousands separator and "," as decimal mark.
    product_dict['price'] = float(product_price.replace('TL', '').replace('.', '').replace(',', '.')) if product_price else None

    rating_count_span = product.find('span', class_='ratingCount')
    # Drop the first and last character surrounding the count.
    rating_count = rating_count_span.text.strip()[1:-1] if rating_count_span else None
    product_dict['rating_count'] = int(rating_count) if rating_count else None

    rating = calculate_ratings(product)
    product_dict['rating'] =  rating

    product_link_a = product.find('a')
    product_href = product_link_a.get('href') if product_link_a else None
    if product_href:
        # partition() tolerates hrefs without a query string, where
        # split('?')[1] would raise IndexError.
        href_path, _, href_query = product_href.partition('?')
        reviews_link = f"https://www.trendyol.com{href_path}/yorumlar"
        if href_query:
            reviews_link += f"?{href_query}"

        reviews_list, rating_scores = get_reviews(reviews_link, driver)
        product_dict['reviews'] = reviews_list
        product_dict['rating_scores'] = rating_scores

    return product_dict

## Scrape & create dataset containing top items and their reviews

In [8]:
# Listing URL for best-selling laptops; the page number is appended to it.
base_url = "https://www.trendyol.com/laptop-x-c103108?sst=BEST_SELLER&pi="

# Request headers sent with every listing request.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/112.0.0.0 Safari/537.36 OPR/98.0.0.0"
    ),
}

# Inclusive range of listing pages to scrape.
start_page, end_page = 1, 5

# Number of worker threads used per listing page.
num_workers = 8

# Filled by the worker threads with one dict per scraped product.
products_list = list()

def fetch_page_content(page_number):
    """Download one listing page and parse it.

    Args:
        page_number: Page number appended to ``base_url``.

    Returns:
        A ``BeautifulSoup`` tree of the response body, decoded as UTF-8.

    Raises:
        requests.RequestException: If the request times out, fails, or the
            server answers with an HTTP error status.
    """
    url = base_url + str(page_number)
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, headers=header, timeout=30)
    # Fail loudly instead of parsing an error page as if it were a listing.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser", from_encoding="utf-8")
    return soup

def process_product(index, product):
    """Scrape one product using a dedicated headless Chrome instance.

    A new browser is started for the call and always shut down afterwards,
    whether or not scraping succeeds.

    Args:
        index: Listing position forwarded to ``create_product_dict``.
        product: Parsed product card forwarded to ``create_product_dict``.

    Returns:
        The dict produced by ``create_product_dict``.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless=new')
    browser = webdriver.Chrome(options=chrome_options)

    try:
        return create_product_dict(index, product, browser)
    finally:
        # Runs on success and on error, so the browser is never leaked.
        browser.quit()

def add_product(index, product):
    """Scrape one product and append its record to ``products_list``."""
    products_list.append(process_product(index, product))

# Running offset added to per-page positions so that every product keeps its
# overall listing index even though workers finish in arbitrary order.
start_index = 0

def process_page(page_number):
    """Scrape every product card on one listing page in parallel.

    Each product is handed to ``add_product`` on a pool of ``num_workers``
    threads. A product whose scraping raises is reported on stdout and
    skipped; the remaining products are still collected. Afterwards the
    module-level ``start_index`` is advanced by the number of cards found.

    Args:
        page_number: Listing page to fetch via ``fetch_page_content``.
    """
    global start_index
    soup = fetch_page_content(page_number)
    products = soup.find_all("div", attrs={"class": "p-card-wrppr with-campaign-view"})

    with ThreadPoolExecutor(num_workers) as executor:
        futures = [
            (index, executor.submit(add_product, index, product))
            for index, product in enumerate(products, start=start_index)
        ]

    # Leaving the with-block waits for all workers. Inspect every future so a
    # failed product is reported instead of vanishing silently (an unconsumed
    # executor.map() iterator never surfaces worker exceptions).
    for index, future in futures:
        error = future.exception()
        if error is not None:
            print(f"failed to scrape product {index} on page {page_number}: {error!r}")

    # increase start_index for the new page
    start_index += len(products)

# Scrape every configured listing page; results accumulate in products_list.
for page_number in range(start_page, end_page + 1):
    print('scraping page: ', page_number)
    process_page(page_number)

df = pd.DataFrame(products_list)
if df.empty:
    # An empty frame has no 'index' column, so sorting would raise KeyError.
    print('no products were scraped; writing an empty dataset')
else:
    # Workers append in completion order; restore the listing order.
    df = df.sort_values(by=['index']).reset_index(drop = True)
df.to_csv('example_output.csv', encoding="utf-8")

scraping page:  1
scraping page:  2
scraping page:  3
scraping page:  4
scraping page:  5


In [None]:
df.head()

Unnamed: 0,index,brand,name,price,rating_count,rating,reviews,rating_scores
0,0,Casper,Excalibur G870.1245-DFB0X-B Intel Core i5-1245...,27999.0,141.0,4.64,"[{'rating': 5.0, 'reviewer_name': 'Emre Ş.', '...","{5.0: 79, 4.0: 8, 3.0: 4, 2.0: 0, 1.0: 4}"
1,1,Apple,"Macbook Air M1 Çip 8gb 256gb Ssd Macos 13"" Qhd...",27349.0,2060.0,4.81,"[{'rating': 5.0, 'reviewer_name': 'F** T**', '...","{5.0: 1192, 4.0: 81, 3.0: 13, 2.0: 8, 1.0: 47}"
2,2,MONSTER,Abra A5 V20.4.2 Intel Core i5 12450H 32 GB RAM...,28299.0,131.0,4.72,"[{'rating': 5.0, 'reviewer_name': '**** ****',...","{5.0: 77, 4.0: 6, 3.0: 3, 2.0: 1, 1.0: 2}"
3,3,Casper,Nirvana C370.4020-4C00B Intel Celeron N4020 4G...,8099.0,395.0,4.22,"[{'rating': 5.0, 'reviewer_name': 'G** U**', '...","{5.0: 189, 4.0: 31, 3.0: 21, 2.0: 3, 1.0: 39}"
4,4,Casper,Excalibur G770.1245-bvj0x-b Intel Core I5-1245...,21999.0,456.0,4.4,"[{'rating': 5.0, 'reviewer_name': 'N** U**', '...","{5.0: 197, 4.0: 44, 3.0: 14, 2.0: 7, 1.0: 37}"


In [9]:
len(df.iloc[0].reviews)

95