## Imports and reading the data

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [32]:
raw_df = pd.read_csv('../data/all_project_week_19_32.csv')
print(raw_df.shape)
cols = ['project_slug', 'Category', 'Location', 'Goal_USD', 'Pledge_USD', 'Number_Backers' , 'Creator_nb_projects','Project_Community_top_countries','Project_description', 'Deadline', 'Launched_at' ]
raw_df = raw_df[cols]
raw_df = raw_df.dropna()
print(raw_df.shape)


(82088, 29)
(60462, 11)


## Filtering projects that are still running when the data collection happened (2019 aug. 11)

In [33]:
# getting rid on unix timestamps
raw_df['end_date'] = pd.to_datetime(raw_df['Deadline'], unit='s', origin='unix')
raw_df['launch_date'] = pd.to_datetime(raw_df['Launched_at'], unit='s', origin='unix')

# Exploring how the launch- and end-dates look like in the dataset
print('earliest launch date: ', raw_df['launch_date'].min())
print('earliest deadline: ', raw_df['end_date'].min())
print('latest launch date: ', raw_df['launch_date'].max())
print('latest deadline: ', raw_df['end_date'].max())


earliest launch date:  2009-05-06 16:20:07
earliest deadline:  2009-06-24 22:26:00
latest launch date:  2019-07-30 10:03:18
latest deadline:  2019-09-28 08:11:41


In [34]:
# filtering on date of collection
cutoff_date = pd.Timestamp(year=2019, month=8, day=11, hour=0)
raw_df = raw_df[ raw_df['end_date'] < cutoff_date ]
raw_df = raw_df.drop(columns=['Deadline', 'Launched_at', 'end_date'])
raw_df.columns

Index(['project_slug', 'Category', 'Location', 'Goal_USD', 'Pledge_USD',
       'Number_Backers', 'Creator_nb_projects',
       'Project_Community_top_countries', 'Project_description',
       'launch_date'],
      dtype='object')

## Creating target variable and adjusting other attributes

### Gettint the top countries by backers and location

In [35]:
# getting the top country and location to use as exo variable for STM
raw_df['top_country'] = raw_df.apply( lambda row: re.search(r'(?<=\|)(.*?)(?=\:)', row['Project_Community_top_countries']).group().lower() if re.search(r'(?<=\|)(.*?)(?=\:)', row['Project_Community_top_countries']) is not None else '' , axis=1)
raw_df['location'] = raw_df.apply( lambda row: re.search(r'(?<=\|).*', row['Location']).group().lower().strip() if re.search(r'(?<=\|).*', row['Location']) is not None else '' , axis=1)
raw_df.head()

Unnamed: 0,project_slug,Category,Location,Goal_USD,Pledge_USD,Number_Backers,Creator_nb_projects,Project_Community_top_countries,Project_description,launch_date,top_country,location
0,the-because-black-life-conference-2018,Art,Minneapolis | US,10000.0,10150.0,72,1.0,|United States:57 backers|Australia:1 backer|S...,The Because Black Life Conference 2018 - This ...,2018-05-14 15:22:13,united states,us
1,petography-illustrations-of-your-pet,Illustration,Victoria | CA,156.13997,645.084479,34,5.0,|United States:23 backers|Canada:2 backers|Uni...,Risks and challenges Having done print focuse...,2018-05-14 14:57:39,united states,ca
2,dessert-cuties-enamel-pins,Art,Boston | US,800.0,815.0,33,2.0,|United States:24 backers|Australia:1 backer,Who doesn't love cute enamel pins? I madeÂ sev...,2018-05-14 10:48:26,united states,us
3,adorable-deadpool-chibi-sticker,Illustration,Kettering | US,400.0,870.0,61,12.0,|United States:49 backers|United Kingdom:5 bac...,My name is Melanie (Melbaka) and welcome to my...,2018-05-13 21:02:31,united states,us
4,japanese-mythos-enamel-pins,Art,Minneapolis | US,1050.0,11069.0,293,5.0,|United States:239 backers|Canada:11 backers|U...,â¦ï¸â¦ï¸â¦ï¸ Information â¦ï¸â¦ï¸â¦...,2018-05-13 17:03:50,united states,us


### Calculating the percentage of amount actually pledged

In [36]:
raw_df['pledged_percentage'] = raw_df.apply(  lambda row: float(row['Pledge_USD'])/float(row['Goal_USD'])  , axis=1)
raw_df['pledged_binary'] = raw_df.apply(  lambda row:  1 if row['pledged_percentage'] >= 1 else 0, axis=1)
raw_df.head()

Unnamed: 0,project_slug,Category,Location,Goal_USD,Pledge_USD,Number_Backers,Creator_nb_projects,Project_Community_top_countries,Project_description,launch_date,top_country,location,pledged_percentage,pledged_binary
0,the-because-black-life-conference-2018,Art,Minneapolis | US,10000.0,10150.0,72,1.0,|United States:57 backers|Australia:1 backer|S...,The Because Black Life Conference 2018 - This ...,2018-05-14 15:22:13,united states,us,1.015,1
1,petography-illustrations-of-your-pet,Illustration,Victoria | CA,156.13997,645.084479,34,5.0,|United States:23 backers|Canada:2 backers|Uni...,Risks and challenges Having done print focuse...,2018-05-14 14:57:39,united states,ca,4.13145,1
2,dessert-cuties-enamel-pins,Art,Boston | US,800.0,815.0,33,2.0,|United States:24 backers|Australia:1 backer,Who doesn't love cute enamel pins? I madeÂ sev...,2018-05-14 10:48:26,united states,us,1.01875,1
3,adorable-deadpool-chibi-sticker,Illustration,Kettering | US,400.0,870.0,61,12.0,|United States:49 backers|United Kingdom:5 bac...,My name is Melanie (Melbaka) and welcome to my...,2018-05-13 21:02:31,united states,us,2.175,1
4,japanese-mythos-enamel-pins,Art,Minneapolis | US,1050.0,11069.0,293,5.0,|United States:239 backers|Canada:11 backers|U...,â¦ï¸â¦ï¸â¦ï¸ Information â¦ï¸â¦ï¸â¦...,2018-05-13 17:03:50,united states,us,10.541905,1


In [37]:
# getting to the final dataset
cols = ['project_slug', 'Category', 'location', 'Number_Backers', 'Creator_nb_projects', 'Project_description', 'top_country', 'pledged_percentage', 'pledged_binary', 'Goal_USD', 'Pledge_USD', 'launch_date']
raw_df = raw_df[cols]
raw_df.head()

Unnamed: 0,project_slug,Category,location,Number_Backers,Creator_nb_projects,Project_description,top_country,pledged_percentage,pledged_binary,Goal_USD,Pledge_USD,launch_date
0,the-because-black-life-conference-2018,Art,us,72,1.0,The Because Black Life Conference 2018 - This ...,united states,1.015,1,10000.0,10150.0,2018-05-14 15:22:13
1,petography-illustrations-of-your-pet,Illustration,ca,34,5.0,Risks and challenges Having done print focuse...,united states,4.13145,1,156.13997,645.084479,2018-05-14 14:57:39
2,dessert-cuties-enamel-pins,Art,us,33,2.0,Who doesn't love cute enamel pins? I madeÂ sev...,united states,1.01875,1,800.0,815.0,2018-05-14 10:48:26
3,adorable-deadpool-chibi-sticker,Illustration,us,61,12.0,My name is Melanie (Melbaka) and welcome to my...,united states,2.175,1,400.0,870.0,2018-05-13 21:02:31
4,japanese-mythos-enamel-pins,Art,us,293,5.0,â¦ï¸â¦ï¸â¦ï¸ Information â¦ï¸â¦ï¸â¦...,united states,10.541905,1,1050.0,11069.0,2018-05-13 17:03:50


## Slicing dataset up on meta-categories

In [38]:
# Defining meta-categories according to kickstarter standards
art_meta = ['Art', 'Ceramics', 'Conceptual Art', 'Digital Art', 'Illustration', 'Installations', 'Mixed Media', 'Painting', 'Performance Art', 'Public Art', 'Sculpture', 'Social Practice', 'Textiles', 'Video Art']
comics_meta = ['Comics', 'Comic Books', 'Graphic Novels', 'Webcomics']
crafts_meta = ['Crafts', 'Candles', 'Crochet', 'DIY', 'Embroidery', 'Glass', 'Knitting', 'Pottery', 'Printing', 'Quilts', 'Stationery', 'Taxidermy', 'Weaving', 'Woodworking']
dance_meta = ['Dance', 'Performances', 'Residencies', 'Workshops']
design_meta = ['Design', 'Architecture', 'Civic Design', 'Graphic Design', 'Interactive Design', 'Product Design', 'Toys', 'Typography']
fashion_meta =['Fashion', 'Accessories', 'Apparel', 'Childrenswear', 'Couture', 'Footwear', 'Jewelry', 'Pet Fashion', 'Ready-to-wear']
film_meta = ['Film', 'Film &amp; Video', 'Action', 'Animation', 'Comedy', 'Documentary', 'Drama', 'Experimental', 'Family', 'Fantasy', 'Festivals', 'Horror', 'Movie Theaters', 'Music Videos', 'Narrative Film', 'Romance', 'Science Fiction', 'Shorts', 'Television', 'Thrillers', 'Webseries']
food_meta = ['Food', 'Bacon', 'Community Gardens', 'Cookbooks', 'Drinks', 'Farms', 'Food Trucks', 'Food Trucks', 'Restaurants', 'Small Batch', 'Spaces', 'Vegan']
games_meta = ['Games', 'Gaming Hardware', 'Live Games', 'Mobile Games', 'Playing Cards', 'Puzzles', 'Tabletop Games', 'Video Games']
journalism_meta = ['Journalism', 'Audio', 'Photo', 'Print', 'Video', 'Web']
music_meta = ['Music', 'Blues', 'Chiptune', 'Classical Music', 'Comedy', 'Country &amp; Folk', 'Electronic Music', 'Faith', 'Hip-Hop', 'Indie Rock', 'Jazz', 'Kids', 'Latin', 'Metal', 'Pop', 'Punk', 'R&amp;B', 'Rock', 'World Music']
photography_meta = ['Photography', 'Animals', 'Fine Art', 'Nature', 'People', 'Photobooks', 'Places']
publishing_meta = ['Publishing', 'Academic', 'Art Books', 'Calendars', 'Children&#39;s Books', 'Comedy', 'Fiction', 'Letterpress', 'Literary Journals', 'Literary Spaces', 'Nonfiction', 'Periodicals', 'Poetry', 'Radio &amp; Podcasts', 'Translations', 'Young Adult', 'Zines']
technology_meta = ['Technology', '3D Printing', 'Apps', 'Camera Equipment', 'DIY Electronics', 'Fabrication Tools', 'Flight', 'Gadgets', 'Hardware', 'Makerspaces', 'Robots', 'Software', 'Sound', 'Space Exploration', 'Wearables', 'Web']
theater_meta = ['Theater','Comedy', 'Experimental', 'Festivals', 'Immersive', 'Musical', 'Plays']

meta_categories = [art_meta, comics_meta, crafts_meta, dance_meta, design_meta, fashion_meta, film_meta, food_meta, games_meta, journalism_meta, music_meta, photography_meta, publishing_meta, technology_meta, theater_meta]
cat_df_dict = {}
for category in meta_categories:
  df_1 = raw_df[raw_df['Category'].isin(category)]
  cat_df_dict.update( {str(category[0]): df_1} )

In [39]:
# exploring category numbers
for key in cat_df_dict:
  print(key + ' category has ' + str(len(cat_df_dict[key])) + ' rows')

Art category has 8916 rows
Comics category has 1925 rows
Crafts category has 1728 rows
Dance category has 1682 rows
Design category has 3588 rows
Fashion category has 2112 rows
Film category has 8824 rows
Food category has 1761 rows
Games category has 5165 rows
Journalism category has 1011 rows
Music category has 9651 rows
Photography category has 1777 rows
Publishing category has 6677 rows
Technology category has 4089 rows
Theater category has 1707 rows


## Subsampling dataset
According to [statista.com](https://www.statista.com/statistics/222455/amount-of-dollars-pledged-per-category-on-kickstarter/), the three most funded categories were *Games*, *Design*, and *Technology*, so we will focus our efforts on these. We will test descriptive stats to figure out the best subsampling strategy.

In [10]:
def generate_descriptive_statistics(dataframe):
    name =[x for x in globals() if globals()[x] is dataframe][0]
    mean_of_cont = dataframe['pledged_percentage'].mean()
    var_of_cont = dataframe.var()['pledged_percentage']
    std_of_cont = dataframe.std()['pledged_percentage']
    binary_ratio = float(dataframe['pledged_binary'].value_counts()[0])/float(dataframe['pledged_binary'].value_counts()[1])
    print('----------------------')
    print('in the ', name, 'dataframe the  ratio of the binary target counts:')
    print(binary_ratio)
    print('in the ', name, 'dataframe the mean of cont. target var: ', mean_of_cont, ', std: ', std_of_cont)


In [11]:
games_df = cat_df_dict['Games']
games_sub = games_df.sample(n=3500)

In [14]:
raw_df['pledged_binary'].value_counts()

1    53460
0     6074
Name: pledged_binary, dtype: int64

In [111]:
generate_descriptive_statistics(games_df)
generate_descriptive_statistics(games_sub)

----------------------
in the  games_df dataframe the  ratio of the binary target counts:
0.10127931769722814
in the  games_df dataframe the mean of cont. target var:  21.8255945773875 , std:  638.9184219503052
----------------------
in the  games_sub dataframe the  ratio of the binary target counts:
0.10270951480781348
in the  games_sub dataframe the mean of cont. target var:  19.309524954583196 , std:  704.8549737286678


  var_of_cont = dataframe.var()['pledged_percentage']
  std_of_cont = dataframe.std()['pledged_percentage']


In [None]:
design_df = cat_df_dict['Design']
design_sub = design_df.sample(n=3500)

In [95]:
generate_descriptive_statistics(design_df)
generate_descriptive_statistics(design_sub)

----------------------
in the  design_df dataframe the  ratio of the binary target counts:
0.20281595709017766
in the  design_df dataframe the mean of cont. target var:  6.24611912653669 , std:  24.3941190415017
----------------------
in the  design_sub dataframe the  ratio of the binary target counts:
0.2006861063464837
in the  design_sub dataframe the mean of cont. target var:  6.263030799771488 , std:  24.60503939955467


  var_of_cont = dataframe.var()['pledged_percentage']
  std_of_cont = dataframe.std()['pledged_percentage']


In [96]:
tech_df = cat_df_dict['Technology']
tech_sub = tech_df.sample(n=3500)

In [97]:
generate_descriptive_statistics(tech_df)
generate_descriptive_statistics(tech_sub)

----------------------
in the  tech_df dataframe the  ratio of the binary target counts:
0.3076431084106172
in the  tech_df dataframe the mean of cont. target var:  4.229911844795767 , std:  26.097965303366713
----------------------
in the  tech_sub dataframe the  ratio of the binary target counts:
0.3049962714392245
in the  tech_sub dataframe the mean of cont. target var:  4.2240070472287155 , std:  27.812399716918126


  var_of_cont = dataframe.var()['pledged_percentage']
  std_of_cont = dataframe.std()['pledged_percentage']


In [112]:
games_sub.to_csv('../data/raw_games_data.csv')
design_sub.to_csv('../data/raw_design_data.csv')
tech_sub.to_csv('../data/raw_tech_data.csv')

## Text preprocessing

The preprocessing will be done using the following function but we run it on a vm to make it faster.

In [14]:
def preprocess(text):
    if isinstance(text, str):
        lemmatizer = WordNetLemmatizer()
        text = text.strip()
        result = ""
        punctless = re.sub('[^a-zA-Z0-9]', ' ', text)
        lowercase = punctless.lower()
        words_list = lowercase.split()
        words_list =  [ lemmatizer.lemmatize(word) for word in words_list if not word in set(stopwords.words('english')) ]
        for word in words_list:
            result += " " + word.strip()
        return result
    else: return ''

## TO DO: joining CLEAN datasets with pledge table data, and add in 'Staff_Recommended' variable

In [40]:
clean_games = pd.read_csv('../data/clean_games_data.csv', index_col=0)
clean_design = pd.read_csv('../data/clean_design_data.csv', index_col=0)
clean_tech = pd.read_csv('../data/clean_tech_data.csv', index_col=0)

In [41]:
raw_df

Unnamed: 0,project_slug,Category,location,Number_Backers,Creator_nb_projects,Project_description,top_country,pledged_percentage,pledged_binary,Goal_USD,Pledge_USD,launch_date
0,the-because-black-life-conference-2018,Art,us,72,1.0,The Because Black Life Conference 2018 - This ...,united states,1.015000,1,10000.000000,1.015000e+04,2018-05-14 15:22:13
1,petography-illustrations-of-your-pet,Illustration,ca,34,5.0,Risks and challenges Having done print focuse...,united states,4.131450,1,156.139970,6.450845e+02,2018-05-14 14:57:39
2,dessert-cuties-enamel-pins,Art,us,33,2.0,Who doesn't love cute enamel pins? I madeÂ sev...,united states,1.018750,1,800.000000,8.150000e+02,2018-05-14 10:48:26
3,adorable-deadpool-chibi-sticker,Illustration,us,61,12.0,My name is Melanie (Melbaka) and welcome to my...,united states,2.175000,1,400.000000,8.700000e+02,2018-05-13 21:02:31
4,japanese-mythos-enamel-pins,Art,us,293,5.0,â¦ï¸â¦ï¸â¦ï¸ Information â¦ï¸â¦ï¸â¦...,united states,10.541905,1,1050.000000,1.106900e+04,2018-05-13 17:03:50
...,...,...,...,...,...,...,...,...,...,...,...,...
82039,remembrance-0,Immersive,us,35,1.0,This is Margaret's Story. This is Remembrance...,united states,0.595429,0,3500.000000,2.084000e+03,2019-07-25 20:59:31
82042,quickstarter-miniature-bonsai-wire-sculptures,Sculpture,gb,18,4.0,"Hi I'm Andy, Â I'm a wire artist based in Manc...",united states,19.800000,1,62.155271,1.230674e+03,2019-07-25 20:52:40
82056,mag4life-fantastic-magnetic-usb-c-and-tb3-conn...,Hardware,de,30,1.0,"EN: We could now tell you a lot, write many ...",germany,0.220400,0,5572.011550,1.228071e+03,2019-07-25 17:59:26
82077,avensi-coffee-enhancing-glassware-reveal-flavo...,Product Design,us,2038,2.0,"Youâve had hundreds of coffees before, but t...",united states,7.623550,1,20000.000000,1.524710e+05,2019-07-23 16:48:10


In [44]:
joint_games = clean_games.merge(raw_df[['project_slug', 'Goal_USD', 'Pledge_USD']], on='project_slug', how='inner')
joint_design = clean_design.merge(raw_df[['project_slug', 'Goal_USD', 'Pledge_USD']], on='project_slug', how='inner')
joint_tech = clean_tech.merge(raw_df[['project_slug', 'Goal_USD', 'Pledge_USD']], on='project_slug', how='inner')

In [45]:
joint_games

Unnamed: 0,project_slug,Category,location,Number_Backers,Creator_nb_projects,top_country,pledged_percentage,pledged_binary,clean_text,Staff_recommended,launch_date,Goal_USD,Pledge_USD
0,xeno-crisis-a-new-game-for-the-sega-genesis-me...,Video Games,gb,1289,1.0,united states,3.628450,1,xeno crisis new original title sega mega driv...,False,2017-12-11 11:18:18,26773.761800,97147.256003
1,torg-eternity-aysle,Tabletop Games,de,1156,12.0,united states,8.803940,1,aysle continues possibility war started torg ...,False,2019-07-02 14:21:40,20084.237772,176820.421410
2,diehard-miniatures-law-and-disorder,Tabletop Games,gb,250,5.0,united states,2.375778,1,marshal klint k 9 unit art johan egerkrans iz...,False,2018-10-11 07:59:03,5920.943580,14066.846181
3,project-orion-1,Tabletop Games,de,34,1.0,germany,0.493500,0,project orion started almost 2 year ago 2 dec...,False,2019-04-17 21:49:19,2260.540900,1115.576934
4,the-journey-east,Video Games,us,10,1.0,united states,1.060000,1,overview journey east simplistic platforming ...,False,2015-12-30 04:25:11,150.000000,159.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,mystery-of-woolley-mountain-a-new-point-and-click,Video Games,gb,345,1.0,united kingdom,1.197375,1,mystery woolley mountain new otherworldly poi...,False,2016-07-20 09:32:24,10581.030880,12669.461850
3496,zetawar,Video Games,us,104,1.0,united states,1.026675,1,update 7 13 preliminary shirt design new zeta...,False,2016-06-16 16:32:18,15000.000000,15400.130000
3497,catharsis-tarot-by-howcroft-and-moore,Playing Cards,us,168,1.0,united states,5.771000,1,divination followed humankind throughout exis...,False,2019-05-30 06:14:16,1000.000000,5771.000000
3498,3d-printable-scifi-modular-tiles-for-tabletop-...,Tabletop Games,ca,95,9.0,united states,4.796613,1,story past designed four six successful kicks...,False,2019-04-26 12:58:52,593.101344,2844.877320


In [46]:
joint_games.to_csv('../data/clean_games_data.csv')
joint_design.to_csv('../data/clean_design_data.csv')
joint_tech.to_csv('../data/clean_tech_data.csv')