In [50]:
import urllib.request
import urllib.parse
import re
import bs4
import csv
import datetime
import string

In [112]:
# Replacement table used by replace_chars() and replace(): each key is a
# two-character sequence, each value is the ASCII text substituted for it.
# NOTE(review): the keys look like UTF-8 byte pairs written as Python 2 byte
# strings. In Python 3 every '\xNN' escape in a str literal is one code point,
# so e.g. '\xc2\x82' is U+00C2 followed by U+0082 and only matches text that
# still contains that exact two-character sequence -- confirm this is intended.
chars = {
    '\xc2\x82' : ',',        # High code comma
    '\xc2\x84' : ',,',       # High code double comma
    '\xc2\x85' : '...',      # Triple dot
    '\xc2\x88' : '^',        # High carat
    '\xc2\x91' : '\x27',     # Forward single quote
    '\xc2\x92' : '\x27',     # Reverse single quote
    '\xc2\x93' : '\x22',     # Forward double quote
    '\xc2\x94' : '\x22',     # Reverse double quote
    '\xc2\x95' : ' ',        # (no label in the original) replaced by a space
    '\xc2\x96' : '-',        # High hyphen
    '\xc2\x97' : '--',       # Double hyphen
    '\xc2\x99' : ' ',        # (no label in the original) replaced by a space
    '\xc2\xa0' : ' ',        # (no label in the original) replaced by a space
    '\xc2\xa6' : '|',        # Split vertical bar
    '\xc2\xab' : '<<',       # Double less than
    '\xc2\xbb' : '>>',       # Double greater than
    '\xc2\xbc' : '1/4',      # one quarter
    '\xc2\xbd' : '1/2',      # one half
    '\xc2\xbe' : '3/4',      # three quarters
    '\xca\xbf' : '\x27',     # c-single quote
    '\xcc\xa8' : '',         # modifier - under curve (dropped)
    '\xcc\xb1' : '',         # modifier - under line (dropped)
}
def replace_chars(match):
    """Substitution callback for re.sub: map the matched text through ``chars``.

    Args:
        match: regex match object whose full match (group 0) is a key of ``chars``.

    Returns:
        The replacement string stored in ``chars`` for the matched text.

    Raises:
        KeyError: the matched text is not a key of ``chars``.
    """
    return chars[match.group(0)]

def replace(text):
    """Return ``text`` with every occurrence of a ``chars`` key replaced.

    The keys of ``chars`` are joined into one regex alternation and each hit
    is substituted through ``replace_chars``.

    Args:
        text: the string to clean.

    Returns:
        A new string with the mapped sequences substituted.
    """
    # Escape every key so it is matched literally: a key containing a regex
    # metacharacter (., |, (, ...) must not alter the pattern's structure.
    pattern = '(' + '|'.join(re.escape(key) for key in chars) + ')'
    return re.sub(pattern, replace_chars, text)

In [125]:
def remove_tags(text):
    """Strip HTML tags from ``text`` and drop characters outside string.printable.

    Args:
        text: markup to clean.

    Returns:
        ``text`` without any ``<...>`` spans, keeping only the characters
        found in ``string.printable``.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    allowed = frozenset(string.printable)
    return ''.join(ch for ch in without_tags if ch in allowed)

In [127]:
class Project:
    """Scrape one Kickstarter project page into flat feature attributes.

    ``__init__`` downloads the page; ``find_features`` fills the attributes:
    title, titleLength, backers, updates, comments, location, goal, pladged,
    success, num_of_days, year, month, num_of_rewards, min_reward, max_reward,
    plus the attributes set by ``creator`` (pcreator, cdata) and by
    ``description`` (num_characters, text, num_of_pictures, video, hasVideo).

    All parsing is done with regexes over ``str(<response bytes>)``. In
    Python 3 that is the bytes repr ("b'...'"), in which newlines appear as the
    two characters backslash-n, so the patterns below (none use re.DOTALL) can
    span what were separate lines. Decoding the bytes instead would change what
    every pattern matches, so the repr form is kept deliberately.
    """

    @staticmethod
    def _fetch(url):
        """Download ``url`` and return ``str()`` of the raw response bytes."""
        req = urllib.request.Request(url)
        # Context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as resp:
            return str(resp.read())

    def __init__(self, url, category_id):
        """Download the project's description page and keep its <main> markup.

        Args:
            url: project URL; any query string is dropped and '/description'
                is appended before the request.
            category_id: stored unchanged as ``self.category``.

        Raises:
            IndexError: the page contains no ``<main role="main">`` element.
            Exception: ('Bad currency!') no usd/eur/gbp/aud/cad money marker
                is present in the page.
        """
        url = url.split('?')[0] + '/description'
        page = self._fetch(url)
        self.data = re.findall(r'<main role="main">(.*?)</main>', page)[0]
        if not re.search(r'money (usd|eur|gbp|aud|cad) no-code', self.data):
            raise Exception('Bad currency!')
        # [facebook-connected flag, projects backed, projects created];
        # filled in by creator().
        self.pcreator = [0, 0, 0]
        self.category = category_id

    def find_features(self):
        """Extract all project features from the downloaded page.

        Calls ``creator`` (a second HTTP request) and ``description``.

        Raises:
            IndexError: an expected fragment is missing from the page.
            ValueError: a captured amount cannot be converted to float.
        """
        sect = re.findall(r'<section class="NS(.*?)</section>', self.data)[0]
        heading = re.findall(r'<section class="NS(.*?)</h2>', self.data)[0]
        self.title = re.findall(r'<a .*>(.*?)</a>', heading)[0]
        self.titleLength = len(self.title)

        # creator features
        self.creator(re.findall(r'="About the creator" href="(.*?)"', sect)[0])

        # project features
        # Two markups are tried for the backer count; 0 (int) when neither matches.
        backers = (re.findall(r'(\d+) backer', sect)
                   or re.findall(r'data-backers-count=\"(\d+)\"', sect))
        self.backers = backers[0] if backers else 0

        body = self.data[self.data.find(r'NS_projects__content'):]
        self.updates = re.findall(r'<span class="count">(\d+)</span>', body)[0]
        self.comments = re.findall(r'data-comments-count="(\d+)"', body)[0]
        self.location = re.findall(r'discover/places/(.*?)\?ref=city', self.data)[0]

        # Hard-coded conversion factors for non-USD amounts.
        # NOTE(review): presumably rates to USD at scrape time -- confirm.
        GBP, EUR, AUD = 1.439095, 1.131835, 0.76625
        goal = re.findall(r'(\d+[\.|,]?\d+)</span> goal', body)
        if not goal:
            goal = re.findall(
                r'(\d+[\.|,]?\d+[\.|,]?\d+)</span> <span class="mobile-hide">goal', sect)
        goal_text = goal[0].replace(',', '')
        # Pledged amount: the first number of three or more digits in the page
        # that sits directly before a closing tag with a four-letter name.
        pledged_text = re.findall(
            r'(\d+[\.|,]?\d+[\.|,]?\d+)</[a-z]{4}>', self.data)[0].replace(',', '')

        if re.search(r'money gbp', self.data):
            rate = GBP
        elif re.search(r'money eur', self.data):
            rate = EUR
        elif re.search(r'money (aud|cad)', self.data):
            # NOTE(review): CAD amounts are converted with the AUD rate -- confirm.
            rate = AUD
        else:
            rate = 1.0
        # The captures are bare digits (no currency symbol), so they are
        # converted whole for every currency, GBP included.
        self.goal = float(goal_text) * rate
        self.pladged = float(pledged_text) * rate

        self.success = '1' if self.pladged - self.goal >= 0 else '0'

        duration = re.findall(r'(\d+) days\)', body)
        # No "(N days)" text: the original author treats this as a page
        # scraped on its first day, whose outcome is still unknown.
        new = not duration
        if new:
            self.success = '?'
            date_text = re.findall(
                r'data-format="llll z" datetime="(\d+\-\d+\-\d+)T', self.data)[0]
            year, month, day = date_text.split("-")
            self.year = year
            self.month = month
            date = datetime.date(int(year), int(month), int(day))
            # Days between today and the date found on the page (int).
            self.num_of_days = (date - datetime.date.today()).days
        else:
            self.num_of_days = duration[0]
            self.year = re.findall(r' (\d{4})</time>', body)[-1]  # end year
            # Last three characters of the third-from-last space-separated
            # token -- presumably a month abbreviation; verify against a page.
            self.month = re.findall(
                r'data-format="ll" datetime="(.*?)</time>', body)[0].split(" ")[-3][-3:]

        rewards = re.findall(r'<li class="hover-group (.*?)</li>', body)
        # On "new" pages the first hover-group item is skipped and not counted.
        first = 1 if new else 0
        self.num_of_rewards = str(len(rewards) - first)
        self.min_reward = re.findall(
            r'(\d+\.?,?\d*) USD</span>', rewards[first])[0].replace(',', '')
        self.max_reward = re.findall(
            r'(\d+\.?,?\d*) USD</span>', rewards[-1])[0].replace(',', '')

        # description features
        self.description(body)

    def creator(self, url):
        """Fetch the creator page and fill ``self.pcreator`` and ``self.cdata``.

        Args:
            url: site-relative path of the creator page; it is appended to
                'https://www.kickstarter.com'.
        """
        page = self._fetch('https://www.kickstarter.com' + url)

        # '1' unless the page contains the text 'Not connected'.
        self.pcreator[0] = '0' if 'Not connected' in page else '1'

        backed = re.findall(r'(\d+) backed', page)
        self.pcreator[1] = backed[0] if backed else '0'

        created = re.findall(r'(\d+) created', page)
        self.pcreator[2] = created[0] if created else '0'

        bio = re.findall(r'<div class="readability">(.*?)</div>', page)
        # Length of the first "readability" block minus 11.
        # NOTE(review): the 11 presumably discounts fixed wrapper markup -- confirm.
        self.cdata = str(len(bio[0]) - 11) if bio else '0'

    def description(self, body):
        """Derive text, picture and video features from the project body.

        Args:
            body: page markup starting at 'NS_projects__content'.

        Raises:
            IndexError: the full-description fragment is not found in ``body``.
        """
        desc = re.findall(
            r'js-full-description responsive-media(.*?)project-faqs', body)[0]
        soup = bs4.BeautifulSoup(desc, "lxml")
        paragraphs = soup.findAll('p')
        # Character count includes the <p> markup itself (len of str(tag)).
        self.num_characters = str(sum(len(str(p)) for p in paragraphs))
        self.text = remove_tags(
            ' '.join(p.prettify(formatter=None) for p in paragraphs))
        # NOTE(review): looks like leftover debugging output; kept because it
        # is part of the current observable behaviour.
        print(self.text)
        self.num_of_pictures = str(len(soup.findAll('figure')))
        # Videos are counted over the whole <main> markup, not only the description.
        soup2 = bs4.BeautifulSoup(self.data, "lxml")
        self.video = str(len(soup2.findAll('video')))
        self.hasVideo = 1 if int(self.video) > 0 else 0

In [None]:
def read():
    """Scrape every listed project and write its features to 'Kickstarter.csv'.

    Reads 'urls/new.txt', whose lines have the form ``<url>,<category id>``;
    only even-numbered lines are processed. Each one is turned into a
    ``Project``, its features are extracted and written as one CSV row.
    A line that fails for any reason is reported and counted, not raised, so
    one bad page does not abort the whole run. Progress is printed every 200
    lines and the error count at the end.
    """
    header = ['CampaignYear', 'CampaignMonth', 'Url', 'Subcategory',
              'FbConnection', 'BackedProj', 'CreatedProj', 'CDescLength',
              'Location', 'BackersNum', 'UpdatesNum', 'CommentsNum', 'TitleLength',
              'Goal', 'Plaged', 'Duration', 'RewardsNum', 'MinPledgeTiers',
              'MaxPledgeTiers', 'CharactersNum', 'PucturesNum',
              'VideosNum', 'HasVideo', 'Success']
    errors = 0
    # newline='' is what the csv module expects of files it writes to.
    with open('Kickstarter.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # One header cell per row value, including the month written after the year.
        writer.writerow(header)
        with open('urls/new.txt', 'r') as furl:
            for i, line in enumerate(furl, start=1):
                if i % 2 == 0:
                    try:
                        # The category id is the last comma-separated field.
                        url, category_id = line.rsplit(',', 1)
                        wp = Project(url, category_id.replace('\n', ''))
                        wp.find_features()
                        writer.writerow([
                            wp.year, wp.month, url, wp.category,
                            str(wp.pcreator[0]), wp.pcreator[1], wp.pcreator[2], wp.cdata,
                            wp.location, wp.backers, wp.updates, wp.comments, wp.titleLength,
                            "{0:.2f}".format(wp.goal), "{0:.2f}".format(wp.pladged), wp.num_of_days,
                            wp.num_of_rewards, wp.min_reward, wp.max_reward.replace('\"', ''),
                            wp.num_characters, wp.num_of_pictures, wp.video, wp.hasVideo, wp.success,
                        ])
                    except Exception as inst:
                        # Best-effort scrape: keep going, but say what failed.
                        errors += 1
                        print("Error on line", i, "-", repr(inst))
                if i % 200 == 0:
                    print(i)
    print("Errors: ", errors)

                    
# Run the full scrape and report the elapsed wall-clock time in minutes.
t = datetime.datetime.now()
read()
tK = datetime.datetime.now()
# total_seconds() covers the whole interval; timedelta.seconds is only the
# seconds component and would drop whole days and fractions of a second.
print("Time: ", (tK - t).total_seconds() / 60)
    

In [128]:
# Sample project pages for a manual smoke test; only the first is active,
# the others are kept commented out for quick switching.
urls = [
    'https://www.kickstarter.com/projects/rkl/eir-1-sci-fi-all-ages-one-shot-comic?ref=home_popular',
    # 'https://www.kickstarter.com/projects/e3dbigbox/the-e3d-bigbox-3d-printer?ref=category_ending_soon',
    # 'https://www.kickstarter.com/projects/1713137134/friday-feature-documentary?ref=city',
    # 'https://www.kickstarter.com/projects/martinmonk/alpha-girl-alphamadchen-short-film?ref=city',
    # 'https://www.kickstarter.com/projects/347698129/dialect-a-game-about-language-and-how-it-dies?ref=home_potd',
]

# Scrape each page and print its features on one line, space-separated.
for url in urls:
    wp = Project(url, 331)
    wp.find_features()
    connected, backed, created = wp.pcreator
    fields = [
        url, wp.title, wp.year, wp.month, wp.category,
        str(connected), backed, created, wp.cdata,
        wp.location, wp.backers, wp.updates, wp.comments, wp.titleLength,
        "{0:.2f}".format(wp.goal), "{0:.2f}".format(wp.pladged), wp.num_of_days,
        wp.num_of_rewards, wp.min_reward, wp.max_reward.replace('\"',''),
        wp.num_characters, wp.num_of_pictures, wp.video, wp.hasVideo, wp.success,
    ]
    print(*fields)

b'\n ">\\n\n\n \n EIR is a sci fi all-ages one-shot comic that\\\'s about exploration, finding weird alien creatures, deciding how to observe and connect, as well as accepting rage, and defeating grief.\n\n \n Imagine Raina Telgemeier adapted some Robert E. Heinlein.\n\n \n This 24 page comic tells the tale of Sasha, who wakes one ordinary morning to find a sentient space helmet promising her adventure and a good time. She takes it up on the offer and flees into space rather than face her pestering family downstairs. What ensues can only be described as cosmically epic.\n\n \n It\\\'s also a deeply personal comic, in ways you hopefully won\\\'t see coming.\n\n \n \n  Alfie Gallagher\n \n on art brings an effortless fantasy to this scenario that still feels all too real, and\n \n  Triona Farrell\n \n on colours ensures everything has tone and timbre - you feel the scene changes, the cool breezes, the hectic danger.\n \n  Ryan Ferrier\n \n on letters drops his usual science of placement 