In [409]:
import re
import time, os
import json

import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup

from pymongo import MongoClient

from tqdm.notebook import tqdm

from sqlalchemy import create_engine

In [326]:
cnx = create_engine('postgresql://bubbl:1411@localhost/pcpartpicker?gssencmode=disable')

In [327]:
def request_url(url, max_tries = 5):
    r = requests.get(url)
    _try  = 1
    while r.status_code != 200:
        time.sleep(1*max_tries**2)
        r = requests.get(url)
        print(max_tries)
        if _try > max_tries:
            raise Exception(f'Request failed. max_tries of {max_tries} reached')
        max_tries += 1
    return r

In [328]:
def scrape_completed_builds(page = 1):
    url = f'https://pcpartpicker.com/builds/fetch/?page={page}'
    r = request_url(url)    
    soup = BeautifulSoup(json.loads(r.text)['result']['html'], 'lxml')
    
    build_list = []
    soup_list = soup.find_all(class_ ='logGroup')
    for item in soup_list:
        build = {}
        try:
            build['name'] = item.find('h1').text
            build['build_url'] = item.find('a', href=True)['href']
            build['author'] = item.find(class_ ='log__author').text
            build['author_url'] = item.find(class_ ='userAvatar')['href']
            try:
                build['img_url'] = item.find(class_ ='log__image')['style'][24:-1]
            except:
                pass
            try:
                build['price'] = item.find(class_ ='log__price').text
            except:
                pass    
            build_list.append(build)
        except:
            pass
    return build_list

In [329]:
df_builds = pd.DataFrame()
for page in range(1,1000):
    df_builds = pd.concat([df_builds,pd.DataFrame(scrape_completed_builds(page))], ignore_index=True)
df_builds.to_sql('builds', con=cnx, if_exists='replace')
df_builds.to_pickle('builds.pkl')

Unnamed: 0,name,build_url,author,author_url,img_url,price
0,Gaming PC - White Widow,/b/nYHBD3,jdeabreu,/user/jdeabreu/,cdn.pcpartpicker.com/static/forever/images/use...,$1752.81+
1,ARGON,/b/zq3tt6,CheesePork,/user/CheesePork/,cdn.pcpartpicker.com/static/forever/images/use...,$5081.62
2,Rotating PC,/b/6k6RsY,Logarythym,/user/Logarythym/,cdn.pcpartpicker.com/static/forever/images/use...,$1323.44+
3,Wife,/b/2hgJ7P,Rcash1170,/user/Rcash1170/,cdn.pcpartpicker.com/static/forever/images/use...,$871.91+
4,New Daily Driver/Gaming Rig,/b/86Rkcf,NuclearAnarchy,/user/NuclearAnarchy/,cdn.pcpartpicker.com/static/forever/images/use...,$1370.33


In [None]:
df_builds = pd.read_pickle('/content/drive/My Drive/Metis-Project-5/builds.pkl')
df_builds.head()

In [331]:
def request_build(build_url):
    url = f'https://pcpartpicker.com{build_url}'
    r = request_url(url)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup

In [332]:
soup = request_build('/b/gWb8TW')

In [370]:
def scrape_build_img(soup):    
    try:
        img_list = json.loads( re.sub('(\S*):',r'"\1":',re.search('var images = (\[.*?\])', soup.text, re.DOTALL).group(1)) )
        # src thumb heading title
        return [ img['src'] for img in img_list]
    except:
        return []
scrape_build_img(soup)

[]

In [334]:
def scrape_build_details(soup):  
    details_soup =  soup.find('h2', text='Details').parent.parent.parent.parent.find_all(class_ ='group')
    details = {detail.find(class_='group__title').text : detail.find(class_='group__content').text for detail in details_soup} 
    return details
scrape_build_details(soup)

{'Date Published': 'May 31, 2020',
 'Date Built': 'May 22, 2020',
 'CPU Clock Rate': '3.8 GHz',
 'CPU Temperature While Idle': '30.0° C',
 'CPU Temperature Under Load': '65.0° C',
 'GPU Core Clock Rate': '1.607 GHz',
 'GPU Effective Memory Clock Rate': '8 GHz',
 'GPU Temperature While Idle': '30.0° C',
 'GPU Temperature Under Load': '75.0° C'}

In [336]:
def scrape_parts_list(soup):  
    parts_list = []
    soup_list = soup.find(class_ = 'partlist partlist--mini').find_all(class_ ='td__component')
    for item in soup_list:
        try:
            part = {}
            part['type'] = item.text
            part['name'] = item.parent.next_sibling.next_sibling.find(class_ = 'td__name').find('a', href=True).text
            part['part_url'] = item.parent.next_sibling.next_sibling.find(class_ = 'td__name').find('a', href=True)['href'][0:15]
            try:
                part['price'] = item.parent.next_sibling.next_sibling.find(class_ = 'td__name').find(class_ = 'td__price').text
            except:
                pass
            parts_list.append(part)
        except:
            pass
#             print(item.text)
#             print(item.parent.next_sibling.next_sibling.find(class_ = 'td__name'))
    return parts_list
scrape_parts_list(soup)[:2]

[{'type': 'CPU',
  'name': 'AMD Ryzen 5 3600X 3.8 GHz 6-Core',
  'part_url': '/product/3WYLrH',
  'price': '$249.99'},
 {'type': 'CPU Cooler',
  'name': 'NZXT Kraken X63 98.17 CFM Liquid',
  'part_url': '/product/JfVG3C',
  'price': '$149.99'}]

In [None]:
build_list = []
img_list = []
part_build_list = []

In [None]:
df_builds_details = pd.read_pickle('/content/drive/My Drive/Metis-Project-5/builds_details.plk')
df_part_build = pd.read_pickle('/content/drive/My Drive/Metis-Project-5/part_build.plk')
df_imgs = pd.read_pickle('/content/drive/My Drive/Metis-Project-5/imgs.plk')

build_list = df_builds_details.to_dict('records')
img_list = df_part_build.to_dict('records')
part_build_list = df_imgs.to_dict('records')

In [436]:
for url in df_builds['build_url'][len(build_list):]:
 
    soup = request_build(url)
#     time.sleep(1)
    
    build = {}
    build['build_url']=url
    
    if soup.find(class_ = "markdown").text.strip() == 'The completed build you have requested has been removed.':
        build['description'] = 'The completed build you have requested has been removed.'
        continue
    
    build['description'] = " ".join( [ p.text for p in soup.find(class_ = "markdown").find_all('p') ] )
    
    # build details
    build.update(scrape_build_details(soup))
    
    # build_images_list
        # zip with build_url?
    build_img_list = scrape_build_img(soup)
    
    # build_parts_list
        # zip with build_url?
    build_parts_list = scrape_parts_list(soup)
    
    build_list.append(build)
    img_list = img_list + [ {'build_url' : url , 'img_url' : img} for img in build_img_list ]
    part_build_list = part_build_list + [ {'build_url' : url, **parts} for parts in build_parts_list ]
# build_list[0]
# img_list[:4]
# parts_list[:4]

AttributeError: 'NoneType' object has no attribute 'parent'

In [438]:
print( len(build_list) )
print( len(df_part_build['part_url'].unique()) )
print( len(df_part_build[df_part_build['type']=='CPU']['part_url'].unique()) )

737
3220
89


In [437]:
df_builds_details = pd.DataFrame(build_list)
display(df_builds_details.head())

df_part_build = pd.DataFrame(part_build_list)
display(df_part_build.head(3))

df_imgs = pd.DataFrame(img_list)
display(df_imgs.head(5))

df_builds_details.to_pickle('builds_details.plk')
df_part_build.to_pickle('part_build.plk')
df_imgs.to_pickle('imgs.plk')

Unnamed: 0,build_url,description,Date Published,Date Built,CPU Clock Rate,GPU Effective Memory Clock Rate,GPU Core Clock Rate,CPU Temperature While Idle,CPU Temperature Under Load,GPU Temperature While Idle,GPU Temperature Under Load
0,/b/nYHBD3,First gaming pc build. A modest little beast.,"June 2, 2020","June 1, 2020",3.6 GHz,14 GHz,,,,,
1,/b/zq3tt6,"Mainly for gaming, been in use for about 4 mon...","June 2, 2020",,4 GHz,7 GHz,1.35 GHz,,,,
2,/b/6k6RsY,"First off, thanks to PCPP for providing such g...","June 2, 2020",,3.8 GHz,14 GHz,1.605 GHz,,,,
3,/b/2hgJ7P,Mainly use it for gaming. I don't play any of ...,"June 2, 2020","May 22, 2020",3.2 GHz,12 GHz,1.5 GHz,28.0° C,45.0° C,41.0° C,67.0° C
4,/b/86Rkcf,I built this PC because my old one that I buil...,"June 2, 2020","May 30, 2020",3.6 GHz,14 GHz,1.65 GHz,44.0° C,70.0° C,48.0° C,92.0° C


Unnamed: 0,build_url,type,name,part_url,price
0,/b/nYHBD3,CPU,AMD Ryzen 5 3600 3.6 GHz 6-Core,/product/9nm323,$167.00
1,/b/nYHBD3,Motherboard,MSI B450 GAMING PRO CARBON AC ATX AM4,/product/t797YJ,$306.88
2,/b/nYHBD3,Memory,Corsair Vengeance RGB Pro 32 GB (2 x 16 GB) DD...,/product/NyTPxr,$159.99


Unnamed: 0,build_url,img_url
0,/b/nYHBD3,//cdn.pcpartpicker.com/static/forever/images/u...
1,/b/nYHBD3,//cdn.pcpartpicker.com/static/forever/images/u...
2,/b/nYHBD3,//cdn.pcpartpicker.com/static/forever/images/u...
3,/b/nYHBD3,//cdn.pcpartpicker.com/static/forever/images/u...
4,/b/nYHBD3,//cdn.pcpartpicker.com/static/forever/images/u...


In [307]:
def scrape_part_details(soup):  
    part_details = {}
    
    try:
        part_details['type'] = soup.find('h3', class_='pageTitle--categoryTitle').text
    except:
        pass
    part_details['name'] = soup.find('h1', class_='pageTitle').text
    
    ratings = soup.find(class_='product--rating').next_sibling.strip()[1:-1].split()
    part_details['number_ratings'] = ratings[0]
    if len(ratings) > 2:
        part_details['avg_rating'] = ratings[2]
    
    try:
        price_list = [ float(s.text[1:]) for s in soup.find(id = 'prices').find_all(class_ ='td__base priority--2') ]
        part_details['best_price'] = min(price_list)
    except:
        pass
    
    try:
        part_details['price_amazon'] = float(soup.find(id = 'prices').find('img', alt="Amazon").parent.parent.parent.find(class_ ='td__base priority--2').text[1:])
    except:
        pass
    
    # Specifications
    soup_list = soup.find(class_='specs').find_all(class_='group--spec')
    for item in soup_list:
        try:
            part_details[item.find('h3').text] = item.find('p').text.strip()
        except:
            pass

    return part_details
scrape_part_details(soup)

In [321]:
part_list = []
for part_url in df_part_build['part_url'].unique(): #[:3]:
    url = f'https://pcpartpicker.com{part_url}'
#     time.sleep(.1)
    r = request_url(url)
    
    soup = BeautifulSoup(r.text, 'lxml')
    
    part = {}
    part['part_url']=part_url
    part.update(scrape_part_details(soup))
    part_list.append(part)
df_parts_details = pd.DataFrame(part_list)
display(df_parts_details.head(3))
df_parts_details.shape

Unnamed: 0,part_url,type,name,number_ratings,avg_rating,best_price,price_amazon,Manufacturer,Model,Core Count,...,Onboard Ethernet,SATA 6 Gb/s,Onboard Video,USB 2.0 Headers,USB 3.2 Gen 1 Headers,USB 3.2 Gen 2 Headers,USB 3.2 Gen 2x2 Headers,Supports ECC,Wireless Networking,RAID Support
0,/product/3WYLrH,CPU,AMD Ryzen 5 3600X 3.8 GHz 6-Core Processor,85,4.9,204.99,231.99,AMD,100-100000022BOX,6.0,...,,,,,,,,,,
1,/product/JfVG3C,CPU Cooler,NZXT Kraken X63 98.17 CFM Liquid CPU Cooler,13,4.5,149.6,149.6,NZXT,Kraken X63,,...,,,,,,,,,,
2,/product/WcjJ7P,Motherboard,Gigabyte B450 AORUS M Micro ATX AM4 Motherboard,33,4.7,94.99,94.99,Gigabyte,,,...,1 x 1000 Mbit/s,6.0,Depends on CPU,2.0,1.0,0.0,0.0,No,,Yes


(3, 56)