In [1]:
import os
import urllib
import urllib.parse

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Path to the Kickstarter JSON file, relative to the current working directory.
fname = os.path.join('Data', 'kickstarter_data.json')

# `orient` is passed by keyword: pandas 2.0+ accepts only the path
# positionally in read_json, so the positional form raises TypeError there.
# The keyword form behaves the same on older releases.
df = pd.read_json(fname, orient='records')

In [3]:
# Peek at the first five rows to see which columns were loaded.
df.head(n=5)

Unnamed: 0,category_url,goal_and_pledged_backers,project_description,project_name,risks,story,url
0,https://www.kickstarter.com/discover/categories/publishing/children's%20books?ref=category,"$13,037\npledged of $12,000 goal\n171\nbackers","A delightful, beautifully illustrated, children’s book that you and your family will treasure.","The Christmas Stairs: a story of love, joy, hope","Risks and challenges\nThe Christmas Stairs is ready for print through Credo House Publishers, which i...","Yes, it's Christmas in August! And you are among the first to hear about The Christmas Stairs, a bra...",https://www.kickstarter.com/projects/thechristmasstory/the-christmas-stairs-a-story-of-love-joy-hope
1,https://www.kickstarter.com/discover/categories/fashion/apparel?ref=project_category_badge,$36\npledged of $392 goal\n3\nbackers,,,Risks and challenges\nPins can develop flaws during the manufacturing process. I have dedicated some ...,with the help of an amazingly Talented Lady @thingymabobsboutique \nmy Wendy Darling inspired Pin was...,https://www.kickstarter.com/projects/pincessemmifer/wendy-darling-fantasy-pin
2,https://www.kickstarter.com/discover/categories/art?ref=category,£325\npledged of £325 goal\n19\nbackers,A collection of Pokemon inspired enamel pins.,Cutiemon - Pokemon inspired enamel pins,Risks and challenges\nI've done extensive research to ensure I'm working with a manufacturer who has ...,"Hai!\nWelcome to my kickstarter, I’m Rosanna, and I’m a small girl with a vivid imagination hiding ou...",https://www.kickstarter.com/projects/rosannalouise/cutiemon-pokemon-inspired-enamel-pins
3,https://www.kickstarter.com/discover/categories/design/product%20design?ref=project_category_badge,"$5,005\npledged of $8,500 goal\n100\nbackers",,,"Risks and challenges\nThe entire planner is created, and the file is submitted to the manufacturer. W...","Update: \nIf we reach our funding goal, all backers will receive a digital download to an entire 2020...",https://www.kickstarter.com/projects/totteoki/manuscript-planner-2020
4,https://www.kickstarter.com/discover/categories/fashion/jewelry?ref=category,"£1,487\npledged of £500 goal\n27\nbackers","A spiritual pendant design to embrace the owners strength and courage following personal loss, Kidemó...",Amoreantos: Sterling Silver Pendant - Kidemonas (guardian),Risks and challenges\nWe hope that you will help us get this Kidemónas prototype into production in o...,A new Amoreantos' pendant design called Kidemónas meaning ‘guardian’ is now available here on Kicksta...,https://www.kickstarter.com/projects/1836446643/new-kick-ass-sterling-silver-pendant-by-amoreantos


In [4]:
# First attempt at pulling the category out of the URL: the greedy group grabs
# everything after "/discover/categories/" up to the last "/" or "?" that is
# still followed by at least one character, so sub-categories stay attached.
cat_pat = r'https://www.kickstarter.com/discover/categories/(.+)[/?].+'
(
    df['category_url']
    .str.extract(cat_pat)
    .drop_duplicates()
)

Unnamed: 0,0
0,publishing/children's%20books
1,fashion/apparel
2,art
3,design/product%20design
4,fashion/jewelry
...,...
1748,technology/sound
1822,food/food%20trucks
1929,film%20&%20video/movie%20theaters
2018,publishing/letterpress


In [5]:
# Sanity check: list every category URL the pattern fails to match
# (the extracted group is NaN for those rows).
cat_pat = r'https://www.kickstarter.com/discover/categories/(.+)[/\?].+'
mask = (
    df['category_url']
    .str.extract(cat_pat)
    .iloc[:, 0]
    .isna()
)
df.loc[mask, 'category_url']

Series([], Name: category_url, dtype: object)

In [6]:
# Restrict the captured group to word characters, spaces and "&" so it stops
# at the first "/" or "?", i.e. only the top-level category is kept.
# URLs are percent-decoded first so that e.g. "%20" becomes a space.
cat_pat = r'https://www.kickstarter.com/discover/categories/([\w \&]+)[/\?].+'
print(
    df['category_url']
    .map(urllib.parse.unquote)
    .str.extract(cat_pat)
    .iloc[:, 0]
    .drop_duplicates()
    .to_string()
)

0        publishing
1           fashion
2               art
3            design
5        technology
7            comics
9             games
10            music
11             food
18     film & video
25            dance
29       journalism
39           crafts
82          theater
173     photography


In [7]:
# Store the top-level category as its own column. `cat_pat` is the pattern
# defined in the preceding cell; URLs are percent-decoded before matching.
df.loc[:, 'category'] = (
    df['category_url']
    .map(urllib.parse.unquote)
    .str.extract(cat_pat)
    .iloc[:, 0]
)

In [8]:
# Number of projects per top-level category, most frequent first.
df['category'].value_counts()

games           363
art             299
design          240
film & video    230
publishing      216
technology      203
fashion         202
music           189
comics          163
food            114
crafts           66
photography      37
theater          32
journalism       21
dance             9
Name: category, dtype: int64

In [9]:
# The combined field looks like "<pledged>\npledged of <goal> goal\n<n>\nbackers";
# the two named groups pull out the raw pledged and goal strings.
pat = r'(?P<pledged>.+)\npledged of (?P<goal>.+) goal\n[,\d]+\nbackers?'
df[['pledged', 'goal']] = df['goal_and_pledged_backers'].str.extract(pat)

In [10]:
"""
Convert pledged and goals into respective currencies and amount
Output to csv to check in excel
"""
df = df.assign(
    pledged_currency=lambda x: x.pledged.str.extract('([^\d]+)'),
    pledged_amount=lambda x: x.pledged.map(lambda x: ''.join([i for i in x if i.isdigit()])).astype(int),
    goal_currency=lambda x: x.goal.str.extract('([^\d]+)'),
    goal_amount=lambda x: x.goal.map(lambda x: ''.join([i for i in x if i.isdigit()])).astype(int),
    success=lambda x: np.where(x.pledged_amount >= x.goal_amount, 1, 0),
)