### Import packages

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
from urllib.request import urlopen
from tqdm import tqdm
import random
import time
from selenium import webdriver
from langdetect import detect

### Global variables

In [2]:
# Column names for the five tables built by collect_data. The order of each list
# must match the order in which extract_data assembles the corresponding row.

# One row per project.
project_features = [
    'pro_link', 'title', 'status',
    'percentage_fund', 'funded', 'target', 'end_date', 'backers',
    'subcategory', 'thumbnail_type',
    'number_rewards', 'min_price', 'max_price',
    'num_news', 'num_comments', 'num_contributions',
    'creater_link', 'language',
]

# One row per reward of a project.
reward_features = ['pro_link', 'price', 'num_backers', 'has_media']

# One row per project creator.
creater_features = ['creater_link', 'name', 'num_created', 'num_backed', 'num_followed']

# One row per contribution listed on a project's "Contributions" tab.
timeline_features = ['pro_link', 'date', 'amount']

# One row per comment listed on a project's "Comments" tab.
comment_features = ['pro_link', 'comment_txt']

### Functions

In [3]:
def create_url(page):
    """Build the URL of one page of the "successful projects" discover listing.

    Parameters
    ----------
    page : int or str
        Page number; it is converted with ``str`` and appended to the query string.

    Returns
    -------
    str
        The listing URL ending in ``page=<page>``.
    """
    listing_prefix = 'https://www.kisskissbankbank.com/en/discover?project[successful]=on&filter=all&page='
    return f'{listing_prefix}{page!s}'

In [4]:
def extract_creater(creater_link):
    """Scrape a creator's profile page in its own Chrome session.

    Parameters
    ----------
    creater_link : str
        URL of the creator's profile page.

    Returns
    -------
    tuple
        ``(name, num_created, num_backed, num_followed)``. ``name`` is the text of
        the profile heading; the three counts are the first three navigation
        badges on the page, read in that order and converted to ``int``.

    Raises
    ------
    Exception
        Whatever Selenium raises if the page or the cookie button cannot be
        reached, and ``AttributeError`` / ``IndexError`` / ``ValueError`` if the
        expected heading or badges are missing or not integers.
    """
    # 1. Go to the page of the creater
    # 1.1 Load the website by driver
    chrome_path = './chromedriver.exe'
    driver = webdriver.Chrome(chrome_path)
    try:
        driver.get(creater_link)
        # Fixed wait so the page (and its cookie banner) has time to render.
        time.sleep(5)

        # 1.2 Accept the cookie
        cookie_question = driver.find_element_by_xpath('//*[@id="Header-react-component"]/div[1]/div/div/div/div/button[2]')
        cookie_question.click()

        # 1.3 Load the html page
        bs = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # 2. Quit the driver, even when loading or the cookie click failed;
        #    otherwise every failed call leaves a Chrome process behind.
        driver.quit()

    # 3. Get information about the creater (only needs the parsed snapshot)
    name = bs.find('h1', 'k-Title k-Title--quinary').text
    span = bs.findAll('span', 'k-Badge k-HorizontalNav__badge')
    num_created = int(span[0].text)
    num_backed = int(span[1].text)
    num_followed = int(span[2].text)

    return name, num_created, num_backed, num_followed

In [83]:
def _click_load_more_until_done(driver):
    """Click the page's 'Load more' button repeatedly until it can no longer be clicked."""
    # Function-scope import: selenium is already a dependency of this notebook.
    from selenium.common.exceptions import WebDriverException

    while True:
        try:
            driver.find_element_by_xpath("//button[contains(., 'Load more')]").click()
            time.sleep(5)
        except WebDriverException:
            # Button missing or not clickable: treat the list as fully loaded.
            # Only Selenium errors are caught, so Ctrl-C still interrupts a
            # long scrape instead of being swallowed.
            time.sleep(5)
            break


def _badge_count(nav_link):
    """Return the integer shown in a nav-bar link's badge, or 0 when it has no badge."""
    badges = nav_link.findAll('span', 'badge__StyledBadge-sc-7liuod-0 eucluz k-Badge k-Badge--spaced')
    if len(badges) == 0:
        return 0
    return int(badges[0].contents[0])


def _parse_rewards(bs, pro_link):
    """Return one ``[pro_link, price, num_backers, has_media]`` row per reward card."""
    rows = []
    for card in bs.findAll('div', 'marger__StyledMarger-sc-1qqifp5-0 kXGmDY'):
        price_tags = card.findAll('h2', 'title__StyledTitle-sc-46lshq-0 edmigd k-RewardCard__title k-u-margin-bottom-double k-u-margin-top-none')
        if len(price_tags) == 0:
            # NOTE(review): parsing stops at the first block without a price
            # heading, so any reward card after it is not read - confirm this
            # is intended rather than skipping just that block.
            break
        price = float(price_tags[0].contents[0].replace('€', '').replace(',', ''))

        backer_tags = card.findAll('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-size-micro k-u-weight-regular k-RewardCard__infos k-RewardCard__infos--hasBottomMargin k-RewardCard__infos--disabled')
        num_backers = 0
        if len(backer_tags) != 0:
            # Validate and convert the SAME cleaned string. Spaces and thousands
            # commas are removed first, as is already done for the price.
            digits = backer_tags[0].contents[0].replace(' ', '').replace(',', '')
            if digits.isdecimal():
                num_backers = int(digits)

        has_media = len(card.findAll('img', 'k-RewardCard__image k-RewardCard__image--disabled')) > 0
        rows.append([pro_link, price, num_backers, has_media])
    return rows


def _detect_language(text):
    """Return langdetect's language code for ``text``, with a fixed seed."""
    # langdetect is non-deterministic unless seeded; without this the same
    # description can be labelled differently between runs.
    from langdetect import DetectorFactory
    DetectorFactory.seed = 0
    return detect(text)


def extract_data(url):
    """Scrape one project page and its Contributions, Comments and creator pages.

    A single Chrome session is opened for the whole call and is always closed
    before returning or raising. The number of contribution cards and of comment
    blocks found is printed as progress output.

    Parameters
    ----------
    url : str
        URL of the project page.

    Returns
    -------
    tuple
        ``(project, reward, creater, timeline, comment)`` where

        * ``project`` is a list of values in the order of ``project_features``;
          ``funded``, ``target``, ``end_date``, ``backers`` and
          ``percentage_fund`` are kept as the text shown on the page, and
          ``min_price`` / ``max_price`` are ``-1`` when no reward was parsed;
        * ``reward`` is a list of ``[pro_link, price, num_backers, has_media]``;
        * ``creater`` is
          ``[creater_link, name, num_created, num_backed, num_followed]``;
        * ``timeline`` is a list of ``[pro_link, date, amount]`` with the euro
          sign removed from ``amount``;
        * ``comment`` is a list of ``[pro_link, comment_text]``.

    Raises
    ------
    Exception
        Selenium errors for navigation or the cookie / tab clicks, and
        ``AttributeError`` / ``IndexError`` / ``ValueError`` when an expected
        page element is missing or not in the expected format.

    Notes
    -----
    Sets ``langdetect.DetectorFactory.seed`` so language detection is repeatable.
    """
    # 1. Go to the page of the project
    # 1.1 Load the website by driver
    chrome_path = './chromedriver.exe'
    driver = webdriver.Chrome(chrome_path)
    try:
        driver.get(url)
        # Fixed wait so the page (and its cookie banner) has time to render.
        time.sleep(5)

        # 1.2 Accept the cookie
        driver.find_element_by_xpath("//*[@id=\"App-react-component\"]/div[1]/div/div/div/div/button[2]").click()

        # 1.3 Load the html page
        bs = BeautifulSoup(driver.page_source, 'html.parser')

        # 2. Get information about the project
        pro_link = url
        title = bs.find('h1', 'title__StyledTitle-sc-46lshq-0 kxkLAV titles__StyledTitle-sc-1v04wsx-0 gwAQhx k-u-align-center').contents[0]
        status = bs.find('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-regular project-state__StyledText-w82mew-2 dClAPF').contents[0]
        percentage_fund = bs.find('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-line-height-normal k-u-weight-regular progress__StyledText-ycznm1-1 iXebEz').contents[0]

        # The three "big text" figures appear in the order backers, end date, funded.
        info = bs.findAll('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-bold info__StyledBigText-lcf1ol-0 clfvNf')
        backers = info[0].contents[0]
        end_date = info[1].contents[0]
        funded = info[2].contents[0]
        target = bs.findAll('div', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-weight-light info__StyledSmallText-lcf1ol-1 gDdVEG')[2].contents[0]

        # Subcategory labels: every second child of each tag is joined with spaces
        # (presumably the odd-indexed children are separators - TODO confirm).
        subcategory_tags = bs.findAll('span', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-size-micro k-u-weight-regular')
        subcategory = ','.join(' '.join(tag.contents[::2]) for tag in subcategory_tags)

        # Thumbnail type is the file extension of the project image URL.
        thumbnail_type = bs.find('img', 'project-media__StyledAvatar-bus9q7-1 oMMta').get('src').split('.')[-1]

        # 3. Get information about rewards, plus the per-project reward summary
        reward = _parse_rewards(bs, pro_link)
        price = [row[1] for row in reward]
        if len(price) == 0:
            min_price = -1
            max_price = -1
        else:
            min_price = np.min(np.array(price))
            max_price = np.max(np.array(price))
        number_rewards = len(price)

        # Nav-bar links 2, 3 and 4 carry the news / comments / contributions badges.
        information = bs.findAll('a', 'k-NavBar__link')
        num_news = _badge_count(information[2])
        num_comments = _badge_count(information[3])
        num_contributions = _badge_count(information[4])

        creater_link = 'https://www.kisskissbankbank.com' + bs.find('a', 'owner-info__StyledOwnerGrid-tqxc8c-0 jnrkmW').get('href')

        short_des = bs.findAll('p', {'data-test-id': 'short-description'})[0].contents[0]
        language = _detect_language(short_des)

        project = [
            pro_link,
            title,
            status,
            percentage_fund,
            funded,
            target,
            end_date,
            backers,
            subcategory,
            thumbnail_type,
            number_rewards,
            min_price,
            max_price,
            num_news,
            num_comments,
            num_contributions,
            creater_link,
            language
        ]

        # 4. Get information about timeline (the "Contributions" tab)
        timeline = []
        driver.find_element_by_xpath("//a[contains(., 'Contributions')]").click()
        time.sleep(2)
        _click_load_more_until_done(driver)
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        contributions = bs.findAll('div', 'backer-card__StyledCard-sc-1buoqas-0 TgtUZ')
        print(len(contributions))
        for contribution in contributions:
            details = contribution.find('p', 'text__StyledText-sc-1jqe2sw-0 kKHxol k-u-color-font1 k-u-line-height-normal k-u-size-micro k-u-weight-light k-u-margin-none')
            if details is None:
                # Card without the amount/date paragraph: skip it, as cards
                # with fewer than two spans are skipped below.
                continue
            result = [span.text.strip() for span in details.findAll('span')]
            if len(result) > 1:
                # First span is the amount, second span is the date.
                timeline.append([pro_link, result[1], result[0].replace('€', '')])

        # 5. Get information about comments (the "Comments" tab)
        comment = []
        driver.find_element_by_xpath("//a[contains(., 'Comments')]").click()
        time.sleep(2)
        _click_load_more_until_done(driver)
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        comments = bs.findAll('div', 'comment__StyledMargerText-sc-8s8e85-5 eYhLGU')
        print(len(comments))
        for cmt in comments:
            comment.append([pro_link, cmt.text])

        # 6. Get information about the project creater, reusing the same session
        # NOTE(review): unlike the project page, no fixed wait follows this navigation.
        driver.get(creater_link)
        bs = BeautifulSoup(driver.page_source, 'html.parser')
        name = bs.find('h1', 'k-Title k-Title--quinary').text
        span = bs.findAll('span', 'k-Badge k-HorizontalNav__badge')
        num_created = int(span[0].text)
        num_backed = int(span[1].text)
        num_followed = int(span[2].text)

        creater = [
            creater_link,
            name,
            num_created,
            num_backed,
            num_followed
        ]

        return project, reward, creater, timeline, comment
    finally:
        # 7. Quit the driver on success and on failure alike, so a page that
        #    does not match the expected layout cannot leak a Chrome process.
        driver.quit()

In [84]:
# Smoke test: scrape a single project. This launches Chrome and hits the live site.
# The two integers printed below come from extract_data itself: the number of
# contribution cards and the number of comment blocks it found.
project, reward, creater, timeline, comment = extract_data('https://www.kisskissbankbank.com/en/projects/zigzag-le-mag')
project

1254
12


['https://www.kisskissbankbank.com/en/projects/zigzag-le-mag',
 'Zigzag le Mag !',
 'Successful',
 '133',
 '€66,658',
 'Out of €50,000',
 '01/07/2021',
 '1,227',
 'Indie,French manufacturing,Independent media',
 'JPG',
 9,
 19.0,
 5000.0,
 1,
 105,
 1254,
 'https://www.kisskissbankbank.com/en/users/paris-zigzag--2',
 'fr']

In [75]:
# NOTE(review): the saved output below lists comments for the project
# 'en-immersion-engagee', not for the 'zigzag-le-mag' call in the cell above, and
# this cell's execution count is lower than that cell's - the output is stale.
# Re-run this cell after the one above to refresh it.
comment

[['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  "Sur les traces de l'ours, un cadeau pour les 30 ans de mon fils!"],
 ['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  'Quel beau cadeau ! 🎁Merci beaucoup Nathalie pour votre soutien ✊'],
 ['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  'Quand vais-je pouvoir donner des dates à mon fils pour cette immersion ? Vous prendrez contact avec lui ?'],
 ['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  "Merci pour cette très bonne initiative, une idée originale et responsable, ce sont les initiatives de ce genre qui ont du sens dans le monde d'aujourd'hui !"],
 ['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  "Merci beaucoup Benjamin ✊🌳. Meilleur message de ma journée !Très belles fêtes de fin d'année 🎄"],
 ['https://www.kisskissbankbank.com/en/projects/en-immersion-engagee',
  'Très bonne idée,  très bon principe pour voyager, on 

In [65]:
def collect_data(start = 2, end = 3):
    """Scrape every project linked from a range of discover-listing pages.

    For each listing page in ``range(start, end)`` the page is opened in its own
    Chrome session to collect the project links, then ``extract_data`` is called
    once per link (each call opens a further Chrome session).

    Parameters
    ----------
    start : int, default 2
        First listing page to scrape (inclusive).
    end : int, default 3
        Listing page at which to stop (exclusive).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(projects, rewards, creaters, timelines, comments)`` with the columns
        given by ``project_features``, ``reward_features``, ``creater_features``,
        ``timeline_features`` and ``comment_features`` respectively.

    Raises
    ------
    Exception
        Any error raised while loading a listing page or by ``extract_data``
        propagates to the caller; rows gathered so far are not returned.
    """
    project_rows = []
    reward_rows = []
    creater_rows = []
    timeline_rows = []
    comment_rows = []

    for i in range(start, end):

        url_i = create_url(i)

        # Go to the page number i
        chrome_path = './chromedriver.exe'
        driver_i = webdriver.Chrome(chrome_path)
        try:
            driver_i.get(url_i)
            # Fixed wait so the listing (and its cookie banner) has time to render.
            time.sleep(5)

            # Accept the cookie
            cookie_question_i = driver_i.find_element_by_xpath("//*[@id=\"App-react-component\"]/div[1]/div/div/div/div/button[2]")
            cookie_question_i.click()

            # Load the html page
            bs_i = BeautifulSoup(driver_i.page_source, 'html.parser')
        finally:
            # Quit the driver even if loading or the cookie click failed, so a
            # bad page does not leave a Chrome process running.
            driver_i.quit()

        # Extract data
        pro_links = bs_i.findAll('a', 'styles__StyledCrowdfundingCard-sc-1dxuhb7-0 dOcwdr k-CrowdfundingCard k-Card k-Card--light k-Card--withoutBoxShadowOnHover k-CrowdfundingCard--titlesMinHeight')
        for p in tqdm(pro_links):
            pro_link = p.get('href')
            project, reward, creater, timeline, comment = extract_data(pro_link)
            # One row per project / creator; many rows per project for the rest.
            project_rows.append(project)
            creater_rows.append(creater)
            reward_rows.extend(reward)
            timeline_rows.extend(timeline)
            comment_rows.extend(comment)

    # Build each table once from the accumulated rows.
    projects = pd.DataFrame(project_rows, columns = project_features)
    rewards = pd.DataFrame(reward_rows, columns = reward_features)
    creaters = pd.DataFrame(creater_rows, columns = creater_features)
    timelines = pd.DataFrame(timeline_rows, columns = timeline_features)
    comments = pd.DataFrame(comment_rows, columns = comment_features)
    return projects, rewards, creaters, timelines, comments
        

### Tests

In [None]:
# Quick check of the URL builder: the displayed string should end in 'page=3'.
create_url(3)

In [None]:
# Manual check of the single-project scraper. This launches Chrome and scrapes the
# live page, then displays the (project, reward, creater, timeline, comment) tuple.
extract_data('https://www.kisskissbankbank.com/en/projects/en-immersion-engagee')

In [77]:
# Scrape listing page 2 only (range(2, 3)). The saved run below took about
# 15 minutes for 9 projects; the integer pairs between progress-bar updates are
# the contribution-card and comment-block counts printed by extract_data.
projects, rewards, creaters, timelines, comments = collect_data(2,3)
projects

  0%|          | 0/9 [00:00<?, ?it/s]

161
22


 11%|█         | 1/9 [01:46<14:15, 107.00s/it]

345
12


 22%|██▏       | 2/9 [05:02<15:34, 133.46s/it]

58
11


 33%|███▎      | 3/9 [06:01<11:06, 111.10s/it]

36
22


 44%|████▍     | 4/9 [06:47<07:38, 91.74s/it] 

121
13


 56%|█████▌    | 5/9 [08:21<06:09, 92.50s/it]

89
12


 67%|██████▋   | 6/9 [09:35<04:20, 86.92s/it]

360
12


 78%|███████▊  | 7/9 [12:49<03:58, 119.01s/it]

80
10


 89%|████████▉ | 8/9 [13:59<01:44, 104.27s/it]

140
16


100%|██████████| 9/9 [15:35<00:00, 103.97s/it]


Unnamed: 0,pro_link,title,status,percentage_fund,funded,target,end_date,backers,subcategory,thumbnail_type,number_rewards,min_price,max_price,num_news,num_comments,num_contributions,creater_link,language
0,https://www.kisskissbankbank.com/en/projects/3...,"3.8 : 1er album de CVN, le groupe de Virginie ...",Successful,101,"€9,980","Out of €9,900",01/08/2021,145,Indie,jpg,7,15.0,2500.0,15,66,161,https://www.kisskissbankbank.com/en/users/virg...,fr
1,https://www.kisskissbankbank.com/en/projects/o...,Omni lance Le GlobeTrotter,Successful,314,"€47,042","Out of €15,000",01/08/2021,306,"Senior,Health and Handicap",gif,8,10.0,5000.0,5,36,345,https://www.kisskissbankbank.com/en/users/char...,fr
2,https://www.kisskissbankbank.com/en/projects/u...,UN FOUR POUR NOTRE STUDIO,Successful,230,"€3,230","Out of €1,404",01/08/2021,57,"Feminism,Indie,Education",jpg,7,20.0,250.0,5,19,58,https://www.kisskissbankbank.com/en/users/just...,en
3,https://www.kisskissbankbank.com/en/projects/u...,Une maison d'hôtes écologique dans les Pouilles,Successful,100,"€3,611","Out of €3,600",01/08/2021,34,"Organic,Local Development,Change your life",jpeg,5,5.0,150.0,9,11,36,https://www.kisskissbankbank.com/en/users/arie...,hr
4,https://www.kisskissbankbank.com/en/projects/u...,Un riad au bout des doigts,Successful,173,"€17,334","Out of €10,000",01/08/2021,117,Change your life,jpeg,11,5.0,2000.0,1,25,121,https://www.kisskissbankbank.com/en/users/mari...,fr
5,https://www.kisskissbankbank.com/en/projects/l...,Luso community project - Zambia,Successful,137,"€15,000","Out of €10,950",01/08/2021,88,"Local Development,International solidarity,Edu...",png,4,20.0,1200.0,0,12,89,https://www.kisskissbankbank.com/en/users/arna...,fr
6,https://www.kisskissbankbank.com/en/projects/z...,Zigzag le Mag !,Successful,133,"€66,658","Out of €50,000",01/07/2021,1227,"Indie,French manufacturing,Independent media",JPG,9,19.0,5000.0,1,105,1254,https://www.kisskissbankbank.com/en/users/pari...,tl
7,https://www.kisskissbankbank.com/en/projects/d...,Défendons le Pitch me,Successful,116,"€3,490","Out of €3,000",01/07/2021,78,"Anti racism,Indie,Education",JPEG,10,5.0,600.0,0,10,80,https://www.kisskissbankbank.com/en/users/ami-...,fr
8,https://www.kisskissbankbank.com/en/projects/l...,La box des Artisans d'Art,Successful,154,"€18,456","Out of €12,000",01/07/2021,138,"Local Development,French manufacturing",gif,9,10.0,4000.0,0,12,140,https://www.kisskissbankbank.com/en/users/sach...,fr


In [78]:
# Reward table from the collect_data run above (columns: reward_features).
rewards

Unnamed: 0,pro_link,price,num_backers,has_media
0,https://www.kisskissbankbank.com/en/projects/3...,15.0,54,False
1,https://www.kisskissbankbank.com/en/projects/3...,25.0,27,False
2,https://www.kisskissbankbank.com/en/projects/3...,40.0,11,False
3,https://www.kisskissbankbank.com/en/projects/3...,45.0,10,False
4,https://www.kisskissbankbank.com/en/projects/3...,75.0,16,False
...,...,...,...,...
65,https://www.kisskissbankbank.com/en/projects/l...,180.0,0,True
66,https://www.kisskissbankbank.com/en/projects/l...,360.0,4,True
67,https://www.kisskissbankbank.com/en/projects/l...,500.0,2,True
68,https://www.kisskissbankbank.com/en/projects/l...,2000.0,0,True


In [79]:
# Creator table from the collect_data run above (columns: creater_features).
creaters

Unnamed: 0,creater_link,name,num_created,num_backed,num_followed
0,https://www.kisskissbankbank.com/en/users/virg...,CVN,1,0,0
1,https://www.kisskissbankbank.com/en/users/char...,Omni,1,0,0
2,https://www.kisskissbankbank.com/en/users/just...,Justine Court & Agathe Dupérou,1,0,0
3,https://www.kisskissbankbank.com/en/users/arie...,Arielle,1,1,0
4,https://www.kisskissbankbank.com/en/users/mari...,Marion-Orlans,1,1,1
5,https://www.kisskissbankbank.com/en/users/arna...,Luso Project,1,0,0
6,https://www.kisskissbankbank.com/en/users/pari...,Paris Zigzag,6,1,0
7,https://www.kisskissbankbank.com/en/users/ami-...,Ami-e-s du Pitch me,2,1,0
8,https://www.kisskissbankbank.com/en/users/sach...,La box des Artisans d'Art,1,0,1


In [80]:
# Contribution table from the collect_data run above (columns: timeline_features).
timelines

Unnamed: 0,pro_link,date,amount
0,https://www.kisskissbankbank.com/en/projects/3...,"January 8, 2021",50
1,https://www.kisskissbankbank.com/en/projects/3...,"January 8, 2021",80
2,https://www.kisskissbankbank.com/en/projects/3...,"January 8, 2021",50
3,https://www.kisskissbankbank.com/en/projects/3...,"January 8, 2021",28
4,https://www.kisskissbankbank.com/en/projects/3...,"January 8, 2021",1005
...,...,...,...
1211,https://www.kisskissbankbank.com/en/projects/l...,"November 26, 2020",204
1212,https://www.kisskissbankbank.com/en/projects/l...,"November 26, 2020",20
1213,https://www.kisskissbankbank.com/en/projects/l...,"November 26, 2020",204
1214,https://www.kisskissbankbank.com/en/projects/l...,"November 26, 2020",204


In [81]:
# Comment table from the collect_data run above (columns: comment_features).
comments

Unnamed: 0,pro_link,comment_txt
0,https://www.kisskissbankbank.com/en/projects/3...,J’aime toujours les comptes ronds 😎
1,https://www.kisskissbankbank.com/en/projects/3...,Est-ce que tu aimes aussi les zèbres pour leur...
2,https://www.kisskissbankbank.com/en/projects/3...,J espère une belle Reussite pour ce projet
3,https://www.kisskissbankbank.com/en/projects/3...,Merci beaucoup Laurent
4,https://www.kisskissbankbank.com/en/projects/3...,"Big succès à vous, hâte d'entendre 🙏🙏🙏🙏 bises"
...,...,...
125,https://www.kisskissbankbank.com/en/projects/l...,Merci Simon et Elodie !
126,https://www.kisskissbankbank.com/en/projects/l...,"Superbe initiative, on croise les doigts pour ..."
127,https://www.kisskissbankbank.com/en/projects/l...,"Merci pour ce beau soutien, Valérie et Philippe !"
128,https://www.kisskissbankbank.com/en/projects/l...,Très emballé par votre projet ! Nous sommes ra...
