# <center>An example of SNS</center>

- Threadless.com is a crowdsouring website for graphic designs.
- Desginers submit artworks and recieve ratings from the community within a seven-day period. 
- Designs with the best scores will be selected to print on T-shirts and other products for sale. 

### Webscraping objectives

- Get a sample of users and artifacts. 
- Scrape artifact-level features.
- Scrape user-level features. 

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


In [2]:
# Get ten urls of pages as a sample of latest artifacts.

link="https://www.threadless.com/designs/archive?page="
num=list(range(1,11))
pages=[]
for i in num:
    page=link+str(i)
    pages.append(page)
print(pages)


['https://www.threadless.com/designs/archive?page=1', 'https://www.threadless.com/designs/archive?page=2', 'https://www.threadless.com/designs/archive?page=3', 'https://www.threadless.com/designs/archive?page=4', 'https://www.threadless.com/designs/archive?page=5', 'https://www.threadless.com/designs/archive?page=6', 'https://www.threadless.com/designs/archive?page=7', 'https://www.threadless.com/designs/archive?page=8', 'https://www.threadless.com/designs/archive?page=9', 'https://www.threadless.com/designs/archive?page=10']


In [3]:
# Get urls of all the designs in these ten pages
# To reduce the load to their server, will demonnstrate one page

designs=[]
for i in pages[:1]:
    print('working on page'+str(' ')+str(i))
    response=requests.get(i)
    soup=BeautifulSoup(response.content, "html.parser")
    links=soup.find('ol',class_='feed-archive th-grided')
    li=links.find_all('li',class_="old")
    for j in li:
        name=j.find("a")["href"]
        #print(name)
        designs.append(name)
   

working on page https://www.threadless.com/designs/archive?page=1


In [4]:
designs[:5]

# can write out the sample of artifacts 
# with open('designs.csv', 'w') as csvfile:
#    writer=csv.writer(csvfile, delimiter=',')
#    writer.writerows(zip(designs))


# read in your sample
# raw_data_file = open("designs.csv", 'r')
# csv_data_file = csv.reader(raw_data_file, delimiter=',')
# designs = []
# for line in csv_data_file:
#     print(line[0])
#     designs.append(line[0])

['/designs/natura-marina',
 '/designs/freds-cycling-team',
 '/designs/rye',
 '/designs/funny-upside-down-turtle-wtf-cartoon',
 '/designs/vampire-11']

In [5]:
# Get artifact level features
# For each design, get title, author, average score, number of scores, challenge name

titles=[]
authors=[]
scores=[]
total_scores=[]

for i in designs:
    try:
        url="https://www.threadless.com"+i
        response=requests.get(url)
        soup=BeautifulSoup(response.content, "html.parser")
        
        ##title
        title=soup.find('div',class_='submission-title')
        title=title.find("h1").text
        titles.append(title)
        
        ##author
        author=soup.find('div',class_='author-block')
        author=author.find("a")["href"].lstrip("/@")
        authors.append(author)

        ##score
        score=soup.find('li',class_='avg-score')
        score=(score.find('strong')).text
        scores.append(score)

        ##total scores
        total_score=soup.find('li',class_='total-scores')
        total_score=(total_score.find('strong')).text
        total_scores.append(total_score)
    
    except AttributeError:
        pass

print(titles[0:5])
print(authors[0:5])
print(scores[0:5])
print(total_scores[0:5])

['Natura Marina', 'Fred’s Cycling Team', 'Rye', 'Funny Upside Down Turtle WTF Cartoon', 'Vampire']
['FrauZoe', 'FBGDesigns', 'Kelseytaytay', 'Petspower', 'jclovely']
['3.00', '1.33', '1.33', '1.67', '2.00']
['3', '3', '3', '9', '5']


In [None]:
# Question: How to scrape the challenge information?

# challenge name
# how many designs per challenge
total_score=soup.find('li',class_='Environment')
        total_score=(total_score.find('challenge')).text
        total_scores.append(total_score)

    


Threadless
 109367 designs

Horror
 929 designs

Pride Forever
 376 designs

Pride Forever
 376 designs

Threadless
 109367 designs


In [6]:
#print(authors)
authors_unique=list(set(authors))
print(authors_unique)
len(authors_unique)

['GloopZ', 'AngrySchnauzer', 'DeepSpaceTris', 'marciamotivos', 'Kelseytaytay', 'tobefonseca', 'FrauZoe', 'AttentionLabel', 'HOLOM', 'Surepka', 'AlfonsoML', 'Petspower', 'Alundrart', 'leolorraine', 'Dragonbudgie', 'Twisted_In_Head', 'FBGDesigns', 'OrangeJoe_art', 'zzinor', 'EduEly', 'jclovely', 'Yunus27']


22

In [8]:
# For the designers we found, get the summary of their experience
full=[]

for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, 'lxml')
    
    # find all stats
    stats=soup.find('div',class_='stats')
    li=stats.find_all('li')
    
    line=[None] * 5
    for j in li:
        char=(j.text).strip()
        
        # threads
        if re.search("started",char):
            line[0]=char
            #line[1]=re.findall(r"[0-9.]+", char)[0]
            
        # submitted
        if re.search("submitted",char):
            line[1]=char   
            #line[1]=re.findall(r"[0-9.]+", char)[0]

        # scored
        if re.search("scored",char):
            line[2]=char
            #line[2]=re.findall(r"[0-9.]+", char)[0]
        
        # given
        if re.search("Given",char):
            line[3]=char
            #line[3]=re.findall(r"[0-9.]+", char)[0]

        # since
        if re.search("since",char):
            line[4]=char
            #line[4]=re.findall(r"[0-9.]+", char)[0]
    
    line.append(i)
    print(line)
    full.append(line)
                     

['28 threads started', '113 designs submitted', '735 designs scored', 'Avg Score Given: 3.12', 'Member since 2009', 'GloopZ']
[None, '30 designs submitted', '71 designs scored', 'Avg Score Given: 4.61', 'Member since 2021', 'AngrySchnauzer']
['28 threads started', '92 designs submitted', '6,501 designs scored', 'Avg Score Given: 2.70', 'Member since 2007', 'DeepSpaceTris']
[None, '13 designs submitted', '86 designs scored', 'Avg Score Given: 4.20', 'Member since 2021', 'marciamotivos']
[None, '1 design submitted', None, 'Avg Score Given: 0.00', 'Member since 2021', 'Kelseytaytay']


In [11]:
# Question: how to scrape each designers' numbers of followers and following?
stats=soup.find('div',class_='lrg-follow-btn')
li=stats.find_all('li')
print(li)




[]


In [None]:
# Scrape the follower-followee network for each designer.
# Can we do this with beautifulsoup? 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [None]:
relations=[]

for i in authors_unique[3:5]:
    
    i=i.replace(" ","%20")
    
    follower_url="https://www.threadless.com/@"+i+"/followers"
    following_url="https://www.threadless.com/@"+i+"/following"

    # close a pop ad
    opts = Options()
    opts.add_argument("user-agent=gene")
    driver = webdriver.Chrome(options=opts)
    #driver.set_page_load_timeout(30)

    # one's follower   
    driver.get(follower_url)  
    time.sleep(5)
    
    # you can scroll many times if not reaching the end
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  
    time.sleep(10)        
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one's followers send the following tie
            line=[name, i]
            print(line)
            relations.append(line)
    
    # one's follwing
    driver.get(following_url)
    time.sleep(10)   
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")                
    time.sleep(25)  
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one sends the following tie to those to follow
            line=[i, name]
            print(line)
            relations.append(line)
    driver.quit() 

['Tink_M', 'ppmid']
['Tink_M', 'Agimat ni Ingkong']
