In [1]:
import urllib.request
import urllib.parse
import re
import bs4
import csv
import datetime

In [2]:
def remove_tags(text):
        """Remove html tags"""
        tag_re = re.compile(r'<[^>]+>')
        return tag_re.sub('', text)

In [3]:
class Project:
    
    def __init__(self, url, category_id):
        """Open url"""
        url = url.split('?')[0] + '/description'
        req = urllib.request.Request(url)
        resp = urllib.request.urlopen(req)
        respData = resp.read()
        self.data = re.findall(r'<main role="main">(.*?)</main>', str(respData))[0]
        if len(re.findall(r'money (usd|eur|gbp|aud|cad) no-code', self.data)) == 0:
            raise Exception('Bad currency!')
        self.pcreator = [0, 0, 0]
        self.category = category_id
        
    def find_features(self):
        """Project features"""
        sect = re.findall(r'<section class="NS(.*?)</section>', self.data)[0]
        self.title = re.findall(r'<a .*>(.*?)</a>', re.findall(r'<section class="NS(.*?)</h2>', self.data)[0])[0]
        self.titleLength = len(self.title)
        
        #creator features
        self.creator(re.findall(r'="About the creator" href="(.*?)"', sect)[0])
        
        #project features
        backers = re.findall(r'(\d+) backer', str(sect))
        if len(backers) == 0:
            self.backers = 0
            backers = re.findall(r'data-backers-count=\"(\d+)\"', str(sect))
            if len(backers) != 0:
                self.backers = backers[0]
        else:
            self.backers = backers[0]
        body = self.data[self.data.find(r'NS_projects__content'):]
        self.updates = re.findall(r'<span class="count">(\d+)</span>', body)[0]
        self.comments = re.findall(r'data-comments-count="(\d+)"', body)[0]
        self.location = re.findall(r'discover/places/(.*?)\?ref=city', self.data)[0]

        GDB, EUR, AUD = 1.439095, 1.131835, 0.76625
        goal = re.findall(r'(\d+[\.|,]?\d+)</span> goal', body)
        if len(goal) == 0:
            self.goal = re.findall(r'(\d+[\.|,]?\d+[\.|,]?\d+)</span> <span class="mobile-hide">goal', sect)[0].replace(',','')
        else:
            self.goal = goal[0].replace(',','')
        self.pladged = re.findall(r'(\d+[\.|,]?\d+[\.|,]?\d+)</[a-z]{4}>', self.data)[0].replace(',','')
        if len(re.findall(r'money gbp', self.data)) > 0:
            self.goal = float(self.goal[1:])*GDB
            self.pladged = float(self.pladged[1:])*GDB
        elif len(re.findall(r'money eur', self.data)) > 0:
            self.goal = float(self.goal)*EUR
            self.pladged = float(self.pladged)*EUR
        elif len(re.findall(r'money (aud|cad)', self.data)) > 0:
            self.goal = float(self.goal)*AUD
            self.pladged = float(self.pladged)*AUD
        else:
            self.goal, self.pladged = float(self.goal), float(self.pladged)
            
        success = self.pladged - self.goal
        if success >= 0:
            self.success = '1'
        else:
            self.success = '0'
        self.num_of_days = re.findall(r'(\d+) days\)', body)[0]
        self.year = re.findall(r' (\d{4})</time>', body)[-1] #end year
        self.month = re.findall(r'data-format="ll" datetime="(.*?)</time>', body)[0].split(" ")[-3][-3:]
        rewards = re.findall(r'<li class="hover-group pledge--(.*?)</li>', body)
        self.num_of_rewards = str(len(rewards))
        self.min_reward = re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[0])[0].replace(',','')
        self.max_reward = re.findall(r'(\d+\.?,?\d*) USD</span>', rewards[-1])[0].replace(',','')
        
        #description features
        self.description(body)
        
        
        
    def creator(self, url):
        """Creator features"""
        req = urllib.request.Request('https://www.kickstarter.com' + url)
        resp = urllib.request.urlopen(req)
        respData = resp.read()
        
        connected = re.findall(r'<div class="facebook py1 border-bottom h5">(.*?)</div>', str(respData))[0]
        if len(connected) > 100:
            self.pcreator[0] = '1'
        else:
            self.pcreator[0] = '0'
            
        n = re.findall(r'(\d+) backed', str(respData))
        if len(n) == 0:
            self.pcreator[1] = '0'
        else:
            self.pcreator[1] = str(n[0])
            
        n = re.findall(r'(\d+) created', str(respData))
        if len(n) == 0:
            self.pcreator[2] = '0'
        else:
            self.pcreator[2] = n[0]
        cdata = re.findall(r'<div class="readability">(.*?)</div>', str(respData))
        self.cdata = '0' #!#
        if len(cdata) > 0:
            self.cdata = str(len(cdata[0]) - 11) 
            
        
    
    def description(self, body):
        """Description features"""
        desc = re.findall(r'<div class="formatted-lists full-description js-full-description responsive-media">(.*?)faq-ask-box', body)[0]
        soup = bs4.BeautifulSoup(desc, "lxml")
        paragraphs = soup.findAll('p')
        self.num_characters = str(sum([len(str(p)) for p in paragraphs]))
        self.text = ' '.join(str(paragraphs))
        self.num_of_pictures = str(len(soup.findAll('figure')))
        soup2 = bs4.BeautifulSoup(self.data, "lxml")
        self.video = str(len(soup2.findAll('video')))
        self.hasVideo = 0  #!#
        if(int(self.video) > 0):
            self.hasVideo = 1



In [5]:
def read():    
    with open('Kickstarter.csv', 'w') as f:
        writer = csv.writer(f)#, delimiter='\t')
        writer.writerow(['CampaignYear', 'Url', 'Subcategory',
                          'FbConnection', 'BackedProj', 'CreatedProj', 'CDescLength',
                          'Location', 'BackersNum', 'UpdatesNum', 'CommentsNum', 'TitleLength',
                          'Goal', 'Plaged', 'Duration', 'RewardsNum', 'MinPledgeTiers',
                          'MaxPledgeTiers', 'CharactersNum', 'PucturesNum',
                          'VideosNum', 'HasVideo', 'Success'])
        i = 0
        j = 0
        with open('urls/new.txt', 'r') as furl:
            for line in furl:
                i += 1
                if i % 2 == 0:
                    tmp = line.split(',')
                    url, category_id = tmp
                    try:
                        wp = Project(url, category_id.replace('\n', ''))
                        wp.find_features()
                        project = [wp.year, wp.month, url, wp.category, 
                                  str(wp.pcreator[0]), wp.pcreator[1], wp.pcreator[2], wp.cdata,
                                  wp.location, wp.backers, wp.updates, wp.comments, wp.titleLength,
                                  "{0:.2f}".format(wp.goal), "{0:.2f}".format(wp.pladged), wp.num_of_days,
                                  wp.num_of_rewards, wp.min_reward, wp.max_reward.replace('\"',''), 
                                  wp.num_characters, wp.num_of_pictures, wp.video, wp.hasVideo, wp.success]
                        writer.writerow(project)
                    except Exception as inst:
                        j += 1
                        continue
                if i%200 == 0:
                    print(i)
        print("Errors: ", j)

                    
t = datetime.datetime.now()
read()
tK = datetime.datetime.now()
print("Time: ", abs(t-tK).seconds/60)
    

200
400
600
800
1000
Errors:  17
Time:  17.083333333333332
