In [64]:
import urllib.request
import urllib.parse
import re
import bs4
import csv
import datetime
import string
import json
import os
from decimal import Decimal

In [2]:
def filter_text(text):
        """Remove html tags"""
        tag_re = re.compile(r'<[^>]+>')
        printable = set(string.printable)
        wspace = re.compile(r'[\n\r\t]')
        xprob = re.compile(r'\\x[a-z0-9]')
        return ' '.join(xprob.sub('', wspace.sub(' ', (''.join(filter(lambda x: x in printable, 
                                                                      tag_re.sub('', text))))
                                                 .replace('\\\'', '\''))).split())

In [72]:
class Project:
    
    def __init__(self, url, category_id):
        """Open url"""
        url = url.split('?')[0] + '/description'
        req = urllib.request.Request(url)
        resp = urllib.request.urlopen(req)
        respData = resp.read()
        self.url = url
        self.abstract = filter_text(re.findall(r'<meta property="og:description" content="(.*?)"/>',
                                               str(respData))[0])
        self.data = re.findall(r'<main role="main">(.*?)</main>', str(respData))[0]

        if len(re.findall(r'money (usd|eur|gbp|aud|cad) no-code', self.data)) == 0:
            raise Exception('Bad currency!')
        ctgry, subc = re.findall(r'mr3 nowrap type-12" href="/discover/categories/(.*?)/(.*?)\?ref', self.data)[0]
        self.category = ctgry.replace('%20', " ").replace('&amp;', "&")
        self.subcategory = subc.replace('%20', " ").replace('&amp;', "&")
        # self.pcreator: 0 -> on fb, 1 -> num of of backed projects
        # 2 -> num of crested projects
        self.pcreator = [0, 0, 0]

            
    def find_features(self):
        """Project features"""
        sect = re.findall(r'<section class="NS(.*?)</section>', self.data)[0]
        self.title = filter_text(re.findall(r'<a .*>(.*?)</a>', re.findall(r'<section class="NS(.*?)</h2>', 
                                                                           self.data)[0])[0])
        self.titleLength = len(self.title)
        
        #creator features
        self.creator(re.findall(r'="About the creator" href="(.*?)"', sect)[0])
        
        #project features
        backers = int(float(re.findall(r'(\d+) backer', str(sect))))
        if len(backers) == 0:
            self.backers = 0
            backers = re.findall(r'data-backers-count=\"(\d+)\"', str(sect))
            if len(backers) != 0:
                self.backers = int(float(backers[0]))
        else:
            self.backers = int(float(backers[0]))
        body = self.data[self.data.find(r'NS_projects__content'):]
        #self.updates = re.findall(r'<span class="count">(\d+)</span>', body)[0]
        #self.comments = re.findall(r'data-comments-count="(\d+)"', body)[0]
        #self.location = re.findall(r'discover/places/(.*?)\?ref=city', self.data)[0]

        GDB, EUR, AUD = 1.439095, 1.131835, 0.76625
        goal = re.findall(r'(\d+[\.|,]?\d+)</span> goal', body)
        if len(goal) == 0:
            self.goal = re.findall(r'(\d+[\.|,]?\d+[\.|,]?\d+)</span> <span class="mobile-hide">goal', 
                                   sect)[0].replace(',','')
        else:
            self.goal = goal[0].replace(',','')
        self.pladged = re.findall(r'(\d+[\.|,]?\d+[\.|,]?\d+)</[a-z]{4}>', self.data)[0].replace(',','')
        if len(re.findall(r'money gbp', self.data)) > 0:
            self.goal = float(self.goal[1:])*GDB
            self.pladged = float(self.pladged[1:])*GDB
        elif len(re.findall(r'money eur', self.data)) > 0:
            self.goal = float(self.goal)*EUR
            self.pladged = float(self.pladged)*EUR
        elif len(re.findall(r'money (aud|cad)', self.data)) > 0:
            self.goal = float(self.goal)*AUD
            self.pladged = float(self.pladged)*AUD
        else:
            self.goal, self.pladged = float(self.goal), float(self.pladged)
            
        success = self.pladged - self.goal
        if success >= 0:
            self.success = '1'
        else:
            self.success = '0'
            
        new = 0
        self.num_of_days = re.findall(r'(\d+) days\)', body)
        if len(self.num_of_days) == 0:      # for "first day" scraping
            self.success = '?'
            new = 1
            self.num_of_days = re.findall(r'<div class="js-num num f1 bold">(\d+)</div>', self.data)
        else:
            self.num_of_days = self.num_of_days[0]
        
        if new:     # for "first day" scraping
            date = re.findall(r'data-format="llll z" datetime="(\d+\-\d+\-\d+)T', self.data)[0]
            self.year = date.split("-")[0]
            month = date.split("-")[1]
            self.month = re.findall(r'data-format="llll z" datetime="(.*?)</time>', self.data)[0].split(",")[-1][1:4]
            date = datetime.date(int(self.year), int(month), int(date.split("-")[2]))
            now = datetime.date.today()
            self.num_of_days = (date - now).days
            self.datetime  = re.findall(r'data-format="llll z" datetime="(.*?)">', self.data)[0][:-6]
        else:
            self.year = re.findall(r' (\d{4})</time>', body)[-1] #end year
            self.month = re.findall(r'data-format="ll" datetime="(.*?)</time>', body)[0].split(" ")[-3][-3:]
            self.datetime = re.findall(r'data-format="ll" datetime="(.*?)">', body)[0][:-6]
            
        rewards = re.findall(r'<li class="hover-group (.*?)</li>', body)
        if new:
            self.num_of_rewards = len(rewards) - 1
            self.min_reward = float(re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[1])[0].replace(',',''))
            self.max_reward = float(re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[-1])[0].replace(',','')
                                    .replace('\"',''))
        else:
            self.num_of_rewards = len(rewards)
            self.min_reward = float(re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[0])[0].replace(',',''))
            self.max_reward = float(re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[-1])[0].replace(',','')
                                    .replace('\"',''))
        
        #description features
        self.description(body)
        
        
        
    def creator(self, url):
        """Creator features"""
        req = urllib.request.Request('https://www.kickstarter.com' + url)
        resp = urllib.request.urlopen(req)
        respData = resp.read()
        
        connected = len(re.findall(r'Not connected', str(respData)))
        if connected == 0:
            self.pcreator[0] = 1
        else:
            self.pcreator[0] = 0
            
        n = re.findall(r'(\d+) backed', str(respData))
        if len(n) == 0:
            self.pcreator[1] = 0
        else:
            self.pcreator[1] = n[0]
            
        n = re.findall(r'(\d+) created', str(respData))
        if len(n) == 0:
            self.pcreator[2] = 0
        else:
            self.pcreator[2] = n[0]
            
        cdata = re.findall(r'<div class="readability">(.*?)</div>', str(respData))
        if len(cdata) > 0:
            self.cdataLength = len(cdata[0]) - 11
            self.cdata = filter_text(' '.join(cdata)).replace('\\n', '')
        else:
            self.cdataLength = 0 #!#ascii
            self.cdata = ""            
        
    
    def description(self, body):
        """Description features"""
        desc = re.findall(r'js-full-description responsive-media(.*?)project-faqs', body)[0]
        soup = bs4.BeautifulSoup(desc, "lxml")
        paragraphs = soup.findAll('p')
        self.num_characters = str(sum([len(str(p)) for p in paragraphs]))
        paragraphs = [p.prettify(formatter=None) for p in paragraphs]
        self.text = filter_text(' '.join(paragraphs))[5:].replace('\\n', '')
        self.num_of_pictures = str(len(soup.findAll('figure')))
        soup2 = bs4.BeautifulSoup(self.data, "lxml")
        self.video = str(len(soup2.findAll('video')))
        self.hasVideo = 0  #!#
        if(int(self.video) > 0):
            self.hasVideo = 1
        self.faq = len(re.findall(r'<li class="faq"(.*?)>', body))

In [178]:
def read():    
    with open('Kickstarter.csv', 'wunicode(') as f:
        writer = csv.writer(f)#, delimiter='\t')
        writer.writerow(['Url', 'Title', 'Campaign year', 'Campaign month', 'Category', 'Subcategory',
                          'Facebook connection', 'Number of backed projects', 'Number of created projects', 
                          'Length of creator description', 'Creator description', 'Title length',
                          'Goal', 'Duration', 'Number of pledge levels', 'Minimum pledge tiers',
                          'Maximum pledge tiers', 'Length of project description', 'Project description',
                          'Abstract' ,'Number of images', 'Number of Faq items'
                          'Number of videos', 'Has a video', 'Success'])
        i = 0
        j = 0
        with open('urls/new.txt', 'r') as furl:
            for line in furl:
                i += 1
                if i % 2 == 0:
                    tmp = line.split(',')
                    url, category_id = tmp
                    try:
                        wp = Project(url, category_id.replace('\n', ''))
                        wp.find_features()
                        project = [wp.url, wp.title, wp.year, wp.month, wp.category, wp.subcategory,
                                   wp.pcreator[0], wp.pcreator[1], wp.pcreator[2], wp.cdataLength,
                                   wp.cdata, wp.titleLength, "{0:.2f}".format(wp.goal),wp.num_of_days,
                                   wp.num_of_rewards, wp.min_reward, wp.max_reward.replace('\"',''), 
                                   wp.num_characters, wp.text, wp.abstract, wp.num_of_pictures,
                                   wp.faq, wp.video, wp.hasVideo, wp.success]
                        writer.writerow(project)
                    except Exception as inst:
                        j += 1
                        continue
                if i%200 == 0:
                    print(i)
        print("Errors: ", j)

                    
t = datetime.datetime.now()
read()
tK = datetime.datetime.now()
print("Time: ", abs(t-tK).seconds/60)
    

ValueError: invalid mode: 'wunicode('

In [73]:
urls = ['https://www.kickstarter.com/projects/martinmonk/alpha-girl-alphamadchen-short-film?ref=city',
        'https://www.kickstarter.com/projects/e3dbigbox/the-e3d-bigbox-3d-printer?ref=category_ending_soon',
        'https://www.kickstarter.com/projects/1713137134/friday-feature-documentary?ref=city',
        'https://www.kickstarter.com/projects/martinmonk/alpha-girl-alphamadchen-short-film?ref=city',
        'https://www.kickstarter.com/projects/347698129/dialect-a-game-about-language-and-how-it-dies?ref=home_potd']
t = datetime.datetime.now()

for url in urls:
    wp = Project(url, 331)
    wp.find_features()
    jsn = {'Url': wp.url,
           'Title' : wp.title,
           'Campaign year': wp.year,
           'Campaign month': wp.month,
           'Category': wp.category, 
           'Subcategory':  wp.subcategory,
           'Facebook connection': wp.pcreator[0], 
           'Number of backed projects': wp.pcreator[1], 
           'Number of created projects': wp.pcreator[2], 
           'Length of creator description': wp.cdataLength, 
           'Creator description': wp.cdata, 
           'Title length': wp.titleLength,
           'Goal': "{0:.2f}".format(wp.goal), 
           'Duration': wp.num_of_days, 
           'Number of pledge levels': wp.num_of_rewards, 
           'Minimum pledge tiers': wp.min_reward,
           'Maximum pledge tiers': wp.max_reward, 
           'Length of project description': wp.num_characters, 
           'Project description': wp.text,
           'Abstract': wp.abstract,
           'Number of images': wp.num_of_pictures, 
           'Number of Faq items': wp.faq,
           'Number of videos': wp.video, 
           'Has a video': wp.hasVideo, 
           'Success': wp.success
          }
    
    dirc = "data/" + str(wp.year) + "/" + str(wp.month)
    if not os.path.exists(dirc):
        os.makedirs(dirc)
    with open(dirc + "/" + str(wp.datetime) + '.json', 'w') as fp:
        json.dump(jsn, fp, indent=4)

tK = datetime.datetime.now()
print("Time: ", abs(t-tK).seconds/60)

URLError: <urlopen error [Errno -2] Name or service not known>