# <center>An example of SNS</center>

- Threadless.com is a crowdsouring website for graphic designs.
- Desginers submit artworks and recieve ratings from the community within a seven-day period. 
- Designs with the best scores will be selected to print on T-shirts and other products for sale. 

### Webscraping objectives

- Get a sample of users and artifacts. Consider a sampling strategy. 
- Scrape artifact-level features.
- Scrape user-level features. 

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import time


In [3]:
# Get ten urls of pages as a sample of latest artifacts.

link="https://www.threadless.com/designs/archive?page="
num=list(range(1,6))
pages=[]
for i in num:
    page=link+str(i)
    pages.append(page)
print(pages)


['https://www.threadless.com/designs/archive?page=1', 'https://www.threadless.com/designs/archive?page=2', 'https://www.threadless.com/designs/archive?page=3', 'https://www.threadless.com/designs/archive?page=4', 'https://www.threadless.com/designs/archive?page=5']


In [5]:
# Get urls of all the designs in these ten pages
# To reduce the load to their server, will demonnstrate one page

designs=[]
for i in pages:
    print('working on page'+str(' ')+str(i))
    response=requests.get(i)
    soup=BeautifulSoup(response.content, "html.parser")
    links=soup.find('ol',class_='feed-archive th-grided')
    li=links.find_all('li',class_="old")
    for j in li:
        name=j.find("a")["href"]
        designs.append(name)
   

working on page https://www.threadless.com/designs/archive?page=1
working on page https://www.threadless.com/designs/archive?page=2
working on page https://www.threadless.com/designs/archive?page=3
working on page https://www.threadless.com/designs/archive?page=4
working on page https://www.threadless.com/designs/archive?page=5


In [6]:
designs[:5]

# can write out the sample of artifacts 
# with open('designs.csv', 'w') as csvfile:
#    writer=csv.writer(csvfile, delimiter=',')
#    writer.writerows(zip(designs))


# read in your sample
# raw_data_file = open("designs.csv", 'r')
# csv_data_file = csv.reader(raw_data_file, delimiter=',')
# designs = []
# for line in csv_data_file:
#     print(line[0])
#     designs.append(line[0])

['/designs/wild-west-mandala-tee',
 '/designs/debra',
 '/designs/love-for-all-retro-style-lgbt-flag-gay-pride-month',
 '/designs/dinkygoose-mermaid',
 '/designs/pastel-goth-mermaid']

In [33]:
# Get artifact level features
# For each design, get title, author, average score, number of scores, challenge name

rows=[]

for i in designs[:50]:
    try:
        url="https://www.threadless.com"+i
        response=requests.get(url)
        soup=BeautifulSoup(response.content, "html.parser")
        
        # initiate the variable for each period
        title=None
        author=None
        avg_score=None
        total_score=None
        
        ##title
        title=soup.select('div.submission-title h1')
        if title!=[]:
            title=title[0].text

        ##author
        author=soup.select('div.author-block a.author')
        if author!=[]:
            author=author[0].text

        ##score
        avg_score=soup.select('li.avg-score strong')
        if avg_score!=[]:
            avg_score=avg_score[0].text

        ##total scores
        total_score=soup.select('li.total-scores strong')
        if total_score!=[]:
            total_score=total_score[0].text
        
        rows.append((title, author, avg_score, total_score))
        print((title, author, avg_score, total_score))
    
    except AttributeError:
        pass



('Wild West Mandala Tee', 'alphabetempire', '2.60', '5')
('Debra', 'muhaonline', '2.86', '35')
('Love For All - RETRO Style LGBT Flag GAY PRIDE Month Transgender Rainbow Lesbian', 'makersyart', '3.63', '8')
('dinkygoose - Mermaid', 'RedAppleTees', '2.74', '42')
('Pastel Goth Mermaid', 'RedAppleTees', '3.18', '82')
('Octo Mermaid', 'RedAppleTees', '3.51', '118')
('Sincerely, What?', 'EliseTowleSnow', '1.46', '13')
('Cute Demon Mermaid', 'RedAppleTees', '2.91', '54')
('Summoning Circle - Chubby Goat Monster', 'RedAppleTees', '3.21', '73')
('dinkygoose - Dark/Villain', 'RedAppleTees', '2.90', '48')
('dinkygoose - Dark/Villain', 'RedAppleTees', '1.14', '7')
('dinkygoose - Fall', 'RedAppleTees', '2.78', '40')
('Halloween Kitties', 'RedAppleTees', '3.83', '134')
('dinkygoose - Summer Time Fun', 'RedAppleTees', '3.16', '76')
('Magical Kitty', 'RedAppleTees', '3.54', '110')
('Chibi Angel Harpy', 'RedAppleTees', '2.58', '40')
('Cthulhu', 'RedAppleTees', '2.59', '34')
('LAST TIME!', 'sillyindust

In [34]:
# Question: How to scrape the challenge information?

# 1. challenge name
# 2. how many designs per challenge


# add your code here




Threadless
 115107 designs

Kawaii Presented by Hot Topic
 680 designs

Pride Forever
 933 designs

Kawaii Presented by Hot Topic
 680 designs

Kawaii Presented by Hot Topic
 680 designs


In [43]:
# get authors
authors=[row[1] for row in rows]
authors=filter(None, authors)
authors_unique=list(set(authors))
print(authors_unique)
len(authors_unique)

['StrawberryHead', 'makersyart', 'dantastic773', 'RedAppleTees', 'mhs23', 'muhaonline', 'alphabetempire', 'Sploot_RI', 'FreelancerMiel', 'Geo95m12', 'deleaf', 'pawkybear', 'EliseTowleSnow', 'sillyindustries', 'badbasilisk', 'Bezzikapa', 'Lalah3', 'Ghost27', 'Cqmw', 'Bayuktx', 'art-shirt73', 'LarD8', 'CatcoInk', 'DW18', 'Tykennedy', 'rizalsalam', 'Witcher1996', 'JonzShop', 'Teefun']


29

In [50]:
# For the designers we found, get the summary of their experience
full=[]

for i in authors_unique[:5]:
    url="https://www.threadless.com/@"+i
    time.sleep(5)
    response=requests.get(url)
    soup=BeautifulSoup(response.content, "html.parser")
    
    # find all stats
    stats=soup.select('div.stats ul')
    li=stats[0].find_all('li')
    
    line=[None] * 5
    for j in li:
        char=(j.text).strip()
        
        # threads
        if re.search("started",char):
            line[0]=char
            #line[1]=re.findall(r"[0-9.]+", char)[0]
            
        # submitted
        if re.search("submitted",char):
            line[1]=char   
            #line[1]=re.findall(r"[0-9.]+", char)[0]

        # scored
        if re.search("scored",char):
            line[2]=char
            #line[2]=re.findall(r"[0-9.]+", char)[0]
        
        # given
        if re.search("Given",char):
            line[3]=char
            #line[3]=re.findall(r"[0-9.]+", char)[0]

        # since
        if re.search("since",char):
            line[4]=char
            #line[4]=re.findall(r"[0-9.]+", char)[0]
    
    line.append(i)
    print(line)
    full.append(line)
                     

[None, '4 designs submitted', '30 designs scored', 'Avg Score Given: 1.97', 'Member since 2022', 'StrawberryHead']
[None, '4 designs submitted', '5 designs scored', 'Avg Score Given: 4.60', 'Member since 2022', 'makersyart']
[None, '152 designs submitted', '386 designs scored', 'Avg Score Given: 3.23', 'Member since 2013', 'dantastic773']
[None, '49 designs submitted', '152 designs scored', 'Avg Score Given: 3.11', 'Member since 2018', 'RedAppleTees']
[None, '3 designs submitted', None, 'Avg Score Given: 0.00', 'Member since 2021', 'mhs23']


In [51]:
# Question: how to scrape each designers' numbers of followers and following?



# add you code here




StrawberryHead 2 6
makersyart 0 37
dantastic773 97 28
RedAppleTees 1 284
mhs23 0 12


In [52]:
# Scrape the follower-followee network for each designer.
# Can we do this with beautifulsoup? 

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


In [53]:
relations=[]

for i in authors_unique[3:5]:
    
    i=i.replace(" ","%20")
    
    follower_url="https://www.threadless.com/@"+i+"/followers"
    following_url="https://www.threadless.com/@"+i+"/following"

    # close a pop ad
    opts = Options()
    opts.add_argument("user-agent=gene")
    driver = webdriver.Chrome(options=opts)

    # one's follower   
    driver.get(follower_url)
    time.sleep(5)
    
    # you can scroll many times if not reaching the end
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  
    time.sleep(10)
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one's followers send the following tie
            line=[name, i]
            print(line)
            relations.append(line)
    
    # one's follwing
    driver.get(following_url)
    time.sleep(10)   
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")                
    time.sleep(25)  
    html = driver.page_source
    soup = BeautifulSoup(html.encode('utf-8'),"html.parser")
    comp=soup.find('ol',class_="following-list")
    comp=comp.find_all("li")

    line=[]
    for k in comp:
        name=k.find("a")["href"]
        name=name.lstrip("/@")
        if name in authors_unique:
            # one sends the following tie to those to follow
            line=[i, name]
            print(line)
            relations.append(line)
    driver.quit() 

['Bezzikapa', 'RedAppleTees']
['Cqmw', 'mhs23']
