In [1]:
import urllib.request
import urllib.parse
import re
import bs4
import csv
import datetime
import string
import json
import os
from decimal import Decimal

In [149]:
def filter_text(text):
    """Strip markup and escape noise from ``text`` and normalise whitespace.

    The clean-up runs in this order:

    1. remove everything that looks like an HTML tag (``<...>``);
    2. drop every character that is not in ``string.printable``;
    3. turn a backslash followed by an apostrophe into a plain apostrophe;
    4. replace newline, carriage-return and tab characters with a space;
    5. replace literal backslash-x-plus-two-characters sequences (as they
       appear in the ``str()`` of a bytes object) with a space;
    6. collapse whitespace runs to single spaces and trim both ends.

    Returns the cleaned string.
    """
    allowed = set(string.printable)

    cleaned = re.sub(r'<[^>]+>', '', text)
    cleaned = ''.join(ch for ch in cleaned if ch in allowed)
    # The second replace maps a double quote onto itself (a no-op); it is
    # kept so the function stays a faithful rewrite of the original.
    cleaned = cleaned.replace('\\\'', '\'').replace('\"', '"')
    cleaned = re.sub(r'[\n\r\t]', ' ', cleaned)
    cleaned = re.sub(r'\\x[a-z0-9]{2}', ' ', cleaned)

    return ' '.join(cleaned.split())

In [150]:
class Project:
    """Scraper for a single Kickstarter project page.

    ``Project(url)`` downloads the project's ``/description`` page and stores
    the page-level fields (``url``, ``abstract``, ``abstractLength``,
    ``category``, ``subcategory``, ``data``).  ``find_features()`` then parses
    the remaining attributes (title, creator, money, dates, rewards and
    description statistics).

    All parsing is regex based and runs on ``str(<bytes>)`` -- i.e. the *repr*
    of the downloaded bytes -- so newlines and non-ASCII bytes appear as
    literal backslash escapes and the whole page is one line.  The patterns
    below rely on that, so the response must not be decoded.

    Any pattern that does not match raises ``IndexError`` (from the ``[0]``
    lookups); callers are expected to catch it.
    """

    # Fixed conversion factors into USD used for goal / pledged amounts.
    GBP_TO_USD = 1.249
    EUR_TO_USD = 1.064
    AUD_TO_USD = 0.7445

    # Pattern for the approximate USD amount shown on a reward tier.
    _REWARD_AMOUNT = r'About <span>\$(\d+\.?,?\d*)</span>'

    def __init__(self, url):
        """Download ``url`` (query string stripped, ``/description`` appended).

        Sets ``url``, ``abstract``, ``abstractLength``, ``data`` (the inner
        HTML of ``<main role="main">``), ``category``, ``subcategory`` and
        initialises ``pcreator`` to ``[0, 0, 0]``.

        Raises ``IndexError`` when an expected pattern is missing and whatever
        ``urllib`` raises on network errors.
        """
        url = url.split('?')[0] + '/description'
        req = urllib.request.Request(url)
        # Close the connection as soon as the body is read; the crawler opens
        # thousands of pages and must not leak sockets.
        with urllib.request.urlopen(req) as resp:
            respData = resp.read()
        # repr of the bytes on purpose -- see the class docstring.
        page = str(respData)

        self.url = url
        self.abstract = filter_text(
            re.findall(r'<meta property="og:description" content="(.*?)"/>', page)[0])
        self.abstractLength = len(self.abstract)
        self.data = re.findall(r'<main role="main">(.*?)</main>', page)[0]

        ctgry, subc = re.findall(
            r'mr3 nowrap type-12" href="/discover/categories/(.*?)/(.*?)\?ref', self.data)[0]
        self.category = ctgry.replace('%20', " ").replace('&amp;', "&")
        self.subcategory = subc.replace('%20', " ").replace('&amp;', "&")
        # self.pcreator: 0 -> connected to Facebook, 1 -> number of backed
        # projects, 2 -> number of created projects
        self.pcreator = [0, 0, 0]

    def find_features(self):
        """Parse the project features out of ``self.data``.

        Sets ``title``, ``titleLength``, ``backers``, ``goal`` and ``pladged``
        (floats, converted to USD), ``success`` (``'1'``, ``'0'`` or ``'?'``
        for campaigns with no "N days)" text), ``num_of_days``, ``year``,
        ``month``, ``datetime``, ``num_of_rewards``, ``min_reward``,
        ``max_reward``, and -- through ``creator()`` and ``description()`` --
        the creator and description attributes.

        Raises ``Exception('Bad currency!')`` for an unrecognised currency
        marker and ``IndexError`` when an expected pattern is missing.
        """
        sect = re.findall(r'<section class="NS(.*?)</section>', self.data)[0]
        heading = re.findall(r'<section class="NS(.*?)</h2>', self.data)[0]
        self.title = filter_text(re.findall(r'<a .*>(.*?)</a>', heading)[0])
        self.titleLength = len(self.title)

        # creator features (one extra HTTP request)
        self.creator(re.findall(r'="About the creator" href="(.*?)"', sect)[0])

        # project features: try the visible "N backer(s)" text first, then the
        # data attribute; default to 0 when neither is present.
        backers = re.findall(r'(\d+) backer', sect)
        if len(backers) == 0:
            backers = re.findall(r'data-backers-count=\"(\d+)\"', sect)
        self.backers = int(float(backers[0])) if backers else 0

        body = self.data[self.data.find(r'NS_projects__content'):]

        self._parse_money(sect, body)

        if self.pladged - self.goal >= 0:
            self.success = '1'
        else:
            self.success = '0'

        days = re.findall(r'(\d+) days\)', body)
        new = len(days) == 0      # for "first day" scraping
        if new:
            # No "(N days)" text was found, so the outcome is recorded as unknown.
            self.success = '?'
            self._parse_dates_new()
        else:
            self.num_of_days = int(float(days[0]))
            self.year = re.findall(r' (\d{4})</time>', body)[-1]  # end year
            self.month = re.findall(
                r'data-format="ll" datetime="(.*?)</time>', body)[0].split(" ")[-3][-3:]
            self.datetime = re.findall(r'data-format="ll" datetime="(.*?)">', body)[0][:-6]

        rewards = re.findall(r'<li class="hover-group (.*?)</li>', body)
        # In the "first day" layout the first matched item is skipped and not
        # counted as a reward tier.
        first = 1 if new else 0
        self.num_of_rewards = len(rewards) - first
        self.min_reward = self._reward_amount(rewards[first])
        self.max_reward = self._reward_amount(rewards[-1])

        # description features
        self.description(body)

    def _parse_money(self, sect, body):
        """Set ``self.goal`` and ``self.pladged`` as floats converted to USD."""
        unit_goal = re.findall(r'(.{4})(\d+[\.|,]?\d+)</span> goal', body)
        if len(unit_goal) == 0:
            unit, goal = re.findall(
                r'(.{4})(\d+[\.|,]?\d+[\.|,]?\d+)</span> <span class="mobile-hide">goal',
                sect)[0]
        else:
            unit, goal = unit_goal[0]
        goal = goal.replace(',', '')
        pledged = re.findall(
            r'(\d+[\.|,]?\d+[\.|,]?\d+)</[a-z]{4}>', self.data)[0].replace(',', '')

        # ``unit`` is the four characters in front of the goal amount.
        if unit[0] == '2':
            # Presumably GBP: the pound sign arrives as the escaped bytes
            # \xc2\xa3, so the capture starts at the '2' and the trailing '3'
            # of \xa3 is glued onto each number -- hence the [1:].
            # TODO confirm against a live GBP page.
            self.goal = float(goal[1:]) * self.GBP_TO_USD
            self.pladged = float(pledged[1:]) * self.GBP_TO_USD
        elif unit == "\\xac":
            # Presumably EUR (last escaped byte of the euro sign).
            self.goal = float(goal) * self.EUR_TO_USD
            self.pladged = float(pledged) * self.EUR_TO_USD
        elif unit[-1] == '$':
            self.goal = float(goal)
            self.pladged = float(pledged)
        elif unit == 'CA$ ' or unit == 'AU$ ':
            # NOTE(review): CA$ amounts are converted with the AUD rate; a
            # dedicated CAD rate is probably wanted here.
            self.goal = float(goal) * self.AUD_TO_USD
            self.pladged = float(pledged) * self.AUD_TO_USD
        else:
            print(unit)
            raise Exception('Bad currency!')

    def _parse_dates_new(self):
        """Set year / month / num_of_days / datetime for the "first day" layout."""
        date = re.findall(
            r'data-format="llll z" datetime="(\d+\-\d+\-\d+)T', self.data)[0]
        # The pattern guarantees exactly three dash-separated numeric parts.
        year, month, day = date.split("-")
        self.year = year
        self.month = re.findall(
            r'data-format="llll z" datetime="(.*?)</time>',
            self.data)[0].split(",")[-1][1:4]
        deadline = datetime.date(int(year), int(month), int(day))
        # Days between today and the parsed date.
        self.num_of_days = (deadline - datetime.date.today()).days
        self.datetime = re.findall(
            r'data-format="llll z" datetime="(.*?)">', self.data)[0][:-6]

    def _reward_amount(self, reward_html):
        """Return the "About $N" amount of one reward item as a float."""
        amount = re.findall(self._REWARD_AMOUNT, reward_html)[0]
        return float(amount.replace(',', ''))

    def creator(self, url):
        """Download the creator page at ``url`` (site-relative) and parse it.

        Fills ``self.pcreator`` (``[facebook connected, backed, created]``)
        and sets ``cdata`` / ``cdataLength`` from the first
        ``<div class="readability">`` block (empty string / 0 when absent).
        """
        req = urllib.request.Request('https://www.kickstarter.com' + url)
        # Close the connection as soon as the body has been read.
        with urllib.request.urlopen(req) as resp:
            respData = resp.read()
        # repr of the bytes on purpose -- see the class docstring.
        page = str(respData)

        # The page says "Not connected" when no Facebook account is linked.
        if len(re.findall(r'Not connected', page)) == 0:
            self.pcreator[0] = 1
        else:
            self.pcreator[0] = 0

        backed = re.findall(r'(\d+) backed', page)
        self.pcreator[1] = int(float(backed[0])) if backed else 0

        created = re.findall(r'(\d+) created', page)
        self.pcreator[2] = int(float(created[0])) if created else 0

        cdata = re.findall(r'<div class="readability">(.*?)</div>', page)
        if len(cdata) > 0:
            # NOTE(review): the fixed offset of 11 is undocumented --
            # presumably wrapper markup/escapes inside the div; confirm.
            self.cdataLength = len(cdata[0]) - 11
            self.cdata = filter_text(' '.join(cdata)).replace('\\n', '')
        else:
            self.cdataLength = 0
            self.cdata = ""

    def description(self, body):
        """Parse the long description contained in ``body``.

        Sets ``num_characters`` (summed length of the ``<p>`` elements as
        markup), ``text`` (cleaned paragraph text), ``num_of_pictures``
        (``<figure>`` count), ``video`` (``<video>`` count in the whole page),
        ``hasVideo`` (0/1) and ``faq`` (number of ``<li class="faq"`` items).
        """
        desc = re.findall(
            r'js-full-description responsive-media(.*?)project-faqs', body)[0]
        soup = bs4.BeautifulSoup(desc, "lxml")
        paragraphs = soup.findAll('p')
        self.num_characters = sum([len(str(p)) for p in paragraphs])
        paragraphs = [p.prettify(formatter=None) for p in paragraphs]
        # The first five characters of the cleaned text are dropped and
        # literal "\n" escapes are removed.
        self.text = filter_text(' '.join(paragraphs))[5:].replace('\\n', '')
        self.num_of_pictures = len(soup.findAll('figure'))

        soup2 = bs4.BeautifulSoup(self.data, "lxml")
        self.video = len(soup2.findAll('video'))
        self.hasVideo = 1 if self.video > 0 else 0
        self.faq = len(re.findall(r'<li class="faq"(.*?)>', body))

In [145]:
# Crawl Kickstarter's "discover" listing, scrape every listed project that has
# no "project-percent-pledged" block, and write one JSON file per project to
# data/<year>/<month>/<datetime>.json.
t = datetime.datetime.now()
j = 0  # number of projects that failed to scrape

categories = list(range(1, 360))

for index_id in range(0, len(categories)):
    print("Category: ", categories[index_id])
    for i in range(1, 201):
        try:
            if i % 50 == 0:
                print("Page: ", i)
            url = 'https://www.kickstarter.com/discover/advanced?'
            values = {'category_id': categories[index_id], 'woe_id': '0', 'sort': 'end_date',
                      'seed': '2431954', 'page': i}
            data = urllib.parse.urlencode(values)
            reqUrl = url + str(data)
            print(reqUrl)
            req = urllib.request.Request(reqUrl)
            # Close the listing response as soon as it has been read.
            with urllib.request.urlopen(req) as resp:
                respData = resp.read()
            # str(bytes) on purpose: the regexes below work on the repr.
            projects = re.findall(r'<li class="project col col-3 mb3 mb7-sm">(.*?)</li>', str(respData))
            for pr in projects:
                ahrefs = re.findall(r'href="(.*?)" target=""', str(pr))
                uns = re.findall(r'<div class="project-percent-pledged"(.*?)</div>', str(pr))
                # Only scrape cards that have a link and no percent-pledged block.
                if len(ahrefs) > 0 and len(uns) == 0:
                    try:
                        wp = Project("https://www.kickstarter.com" + ahrefs[0])
                        wp.find_features()
                        jsn = {'Url': wp.url,
                               'Title': wp.title,
                               'Campaign year': wp.year,
                               'Campaign month': wp.month,
                               'Category': wp.category,
                               'Subcategory': wp.subcategory,
                               'Author':
                               {
                                   'Length of description': wp.cdataLength,
                                   'Description': wp.cdata,
                                   'Facebook connection': wp.pcreator[0],
                                   'Number of backed projects': wp.pcreator[1],
                                   'Number of created projects': wp.pcreator[2]
                               },
                               'Title length': wp.titleLength,
                               'Goal': wp.goal,
                               'Duration': wp.num_of_days,
                               'Number of pledge levels': wp.num_of_rewards,
                               'Minimum pledge tiers': wp.min_reward,
                               'Maximum pledge tiers': wp.max_reward,
                               'Length of project description': wp.num_characters,
                               'Project description': wp.text,
                               'Abstract': wp.abstract,
                               'Length of abstract': wp.abstractLength,
                               'Number of images': wp.num_of_pictures,
                               'Number of Faq items': wp.faq,
                               'Number of videos': wp.video,
                               'Has a video': wp.hasVideo,
                               'Success': wp.success
                               }
                        dirc = "data/" + str(wp.year) + "/" + str(wp.month)
                        os.makedirs(dirc, exist_ok=True)
                        # NOTE(review): the file name is only the project's
                        # datetime string, so two projects sharing it
                        # overwrite each other.
                        with open(dirc + "/" + str(wp.datetime) + '.json', 'w') as fp:
                            json.dump(jsn, fp, indent=4)
                    except Exception as e:
                        # Best effort: count the failure and move on.  Errors
                        # are only printed on every tenth page to limit noise.
                        if i % 10 == 0:
                            print(e)
                        j += 1
            # A short page means the listing for this category is exhausted.
            if len(projects) < 10:
                break
        except Exception as e:
            print(e)
            break
    # NOTE(review): this unconditional break stops after the first category,
    # so the remaining entries of `categories` are never crawled.  Kept as in
    # the original; remove it to crawl every category.
    break

print("Errors: ", j)
tK = datetime.datetime.now()
# total_seconds() keeps whole days; timedelta.seconds alone would wrap after 24 hours.
print("Time: ", int(abs(tK - t).total_seconds()))

Category:  1 , page:  15
https://www.kickstarter.com/discover/advanced?seed=2431954&page=15&sort=end_date&category_id=1&woe_id=0
CA$ 
Bad currency!
DKK 
Bad currency!
CA$ 
Bad currency!
Category:  1 , page:  16
https://www.kickstarter.com/discover/advanced?seed=2431954&page=16&sort=end_date&category_id=1&woe_id=0
CA$ 
Bad currency!
SEK 
Bad currency!
CA$ 
Bad currency!
Category:  1 , page:  17
https://www.kickstarter.com/discover/advanced?seed=2431954&page=17&sort=end_date&category_id=1&woe_id=0
HK$ 
Bad currency!
list index out of range
SEK 
Bad currency!
Category:  1 , page:  18
https://www.kickstarter.com/discover/advanced?seed=2431954&page=18&sort=end_date&category_id=1&woe_id=0
AU$ 
Bad currency!
CA$ 
Bad currency!
Category:  1 , page:  19
https://www.kickstarter.com/discover/advanced?seed=2431954&page=19&sort=end_date&category_id=1&woe_id=0


KeyboardInterrupt: 