# Read all csv files

In [1]:
# import libraries
import pandas as pd
import os
import glob

# Read all csv files in the data folder
path = os.getcwd() + '/data'
csv_files = glob.glob(os.path.join(path, "*.csv"))
  
li = []
#loop over csv files and append
for f in csv_files:
    df_s = pd.read_csv(f, index_col=None, header=0)
    li.append(df_s)

df = pd.concat(li, axis=0, ignore_index=True)

# Data cleaning

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1189287 entries, 0 to 1189286
Data columns (total 39 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   backers_count             1189287 non-null  int64  
 1   blurb                     1189242 non-null  object 
 2   category                  1189287 non-null  object 
 3   converted_pledged_amount  1189287 non-null  int64  
 4   country                   1189287 non-null  object 
 5   country_displayable_name  1189287 non-null  object 
 6   created_at                1189287 non-null  int64  
 7   creator                   1189287 non-null  object 
 8   currency                  1189287 non-null  object 
 9   currency_symbol           1189287 non-null  object 
 10  currency_trailing_code    1189287 non-null  bool   
 11  current_currency          1189287 non-null  object 
 12  deadline                  1189287 non-null  int64  
 13  disable_communication     1

### Dropping columns that are not relevant to study

In [3]:
columns_to_drop = df[['backers_count', 'country_displayable_name', 'creator', 'currency',
                      'currency_symbol', 'currency_trailing_code', 'current_currency',
                     'disable_communication', 'friends', 'is_backing', 'is_starrable', 'is_starred',
                     'location', 'permissions', 'photo', 'profile','slug', 'source_url',
                      'spotlight', 'staff_pick','static_usd_rate', 'urls', 'usd_type', 'usd_exchange_rate']]

df.drop(columns_to_drop, axis = 1, inplace = True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1189287 entries, 0 to 1189286
Data columns (total 15 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   blurb                     1189242 non-null  object 
 1   category                  1189287 non-null  object 
 2   converted_pledged_amount  1189287 non-null  int64  
 3   country                   1189287 non-null  object 
 4   created_at                1189287 non-null  int64  
 5   deadline                  1189287 non-null  int64  
 6   fx_rate                   1189287 non-null  float64
 7   goal                      1189287 non-null  float64
 8   id                        1189287 non-null  int64  
 9   launched_at               1189287 non-null  int64  
 10  name                      1189287 non-null  object 
 11  pledged                   1189287 non-null  float64
 12  state                     1189287 non-null  object 
 13  state_changed_at          1

### Only keeping projects that were successful or failed

In [5]:
df = df[(df['state'] == 'successful') | (df['state'] == 'failed')]

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1108341 entries, 0 to 1189286
Data columns (total 15 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   blurb                     1108331 non-null  object 
 1   category                  1108341 non-null  object 
 2   converted_pledged_amount  1108341 non-null  int64  
 3   country                   1108341 non-null  object 
 4   created_at                1108341 non-null  int64  
 5   deadline                  1108341 non-null  int64  
 6   fx_rate                   1108341 non-null  float64
 7   goal                      1108341 non-null  float64
 8   id                        1108341 non-null  int64  
 9   launched_at               1108341 non-null  int64  
 10  name                      1108341 non-null  object 
 11  pledged                   1108341 non-null  float64
 12  state                     1108341 non-null  object 
 13  state_changed_at          1

### Dropping the duplicate projects, found by project id

In [7]:
df.drop_duplicates(subset = 'id', keep = 'first', inplace = True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193779 entries, 0 to 1188867
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   blurb                     193777 non-null  object 
 1   category                  193779 non-null  object 
 2   converted_pledged_amount  193779 non-null  int64  
 3   country                   193779 non-null  object 
 4   created_at                193779 non-null  int64  
 5   deadline                  193779 non-null  int64  
 6   fx_rate                   193779 non-null  float64
 7   goal                      193779 non-null  float64
 8   id                        193779 non-null  int64  
 9   launched_at               193779 non-null  int64  
 10  name                      193779 non-null  object 
 11  pledged                   193779 non-null  float64
 12  state                     193779 non-null  object 
 13  state_changed_at          193779 non-null  

### Fix time values

In [9]:
from datetime import datetime
import time

In [10]:
def convert_times(row):
    row['created_at'] = datetime.strptime(time.strftime('%Y-%m-%d %H:%M:%S', 
                                                        time.localtime(row['created_at'])), '%Y-%m-%d %H:%M:%S')
    row['deadline'] = datetime.strptime(time.strftime('%Y-%m-%d %H:%M:%S', 
                                                      time.localtime(row['deadline'])), '%Y-%m-%d %H:%M:%S')
    row['launched_at'] = datetime.strptime(time.strftime('%Y-%m-%d %H:%M:%S', 
                                                         time.localtime(row['launched_at'])), '%Y-%m-%d %H:%M:%S')
    row['state_changed_at'] = datetime.strptime(time.strftime('%Y-%m-%d %H:%M:%S', 
                                                              time.localtime(row['state_changed_at'])), '%Y-%m-%d %H:%M:%S')
    return row

In [11]:
df = df.apply(convert_times, axis = 1)

In [12]:
df.head()

Unnamed: 0,blurb,category,converted_pledged_amount,country,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,usd_pledged
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,GB,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.0,successful,2020-05-15 01:58:55,6172.413435
1,A black and white photography book by renowned...,"{""id"":280,""name"":""Photobooks"",""slug"":""photogra...",30850,US,2017-05-29 05:11:31,2017-09-13 14:00:00,1.0,10000.0,2036647097,2017-08-02 06:54:08,The Last of the Hill Farms: Echoes of Vermont'...,30850.0,successful,2017-09-13 14:00:00,30850.0
2,"Sarah, the displaced hero, will empower your c...","{""id"":46,""name"":""Children's Books"",""slug"":""pub...",24292,SE,2016-10-14 06:19:56,2016-12-19 14:59:00,0.119897,218000.0,923683665,2016-11-15 10:13:46,Sarah's Journey: An Empowering Adventure Book ...,227101.0,successful,2016-12-19 14:59:00,24904.533814
3,A world ravaged by climate change. A father tr...,"{""id"":285,""name"":""Plays"",""slug"":""theater/plays...",13312,US,2017-10-19 15:18:53,2017-12-15 15:11:03,1.0,12000.0,1028831108,2017-11-03 15:11:03,The Promised Land,13312.7,successful,2017-12-15 15:11:03,13312.7
4,Get Sinned-Angel-Stock & SenshiStock together ...,"{""id"":278,""name"":""People"",""slug"":""photography/...",3000,US,2017-05-05 08:18:48,2017-06-23 19:00:00,1.0,750.0,2080833032,2017-06-01 05:02:54,Pose References for Artists,3000.0,successful,2017-06-23 19:00:00,3000.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193779 entries, 0 to 1188867
Data columns (total 15 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   blurb                     193777 non-null  object        
 1   category                  193779 non-null  object        
 2   converted_pledged_amount  193779 non-null  int64         
 3   country                   193779 non-null  object        
 4   created_at                193779 non-null  datetime64[ns]
 5   deadline                  193779 non-null  datetime64[ns]
 6   fx_rate                   193779 non-null  float64       
 7   goal                      193779 non-null  float64       
 8   id                        193779 non-null  int64         
 9   launched_at               193779 non-null  datetime64[ns]
 10  name                      193779 non-null  object        
 11  pledged                   193779 non-null  float64       
 12  s

In [14]:
df = df[df['launched_at'] > '2020-02']

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31953 entries, 0 to 1188732
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   blurb                     31953 non-null  object        
 1   category                  31953 non-null  object        
 2   converted_pledged_amount  31953 non-null  int64         
 3   country                   31953 non-null  object        
 4   created_at                31953 non-null  datetime64[ns]
 5   deadline                  31953 non-null  datetime64[ns]
 6   fx_rate                   31953 non-null  float64       
 7   goal                      31953 non-null  float64       
 8   id                        31953 non-null  int64         
 9   launched_at               31953 non-null  datetime64[ns]
 10  name                      31953 non-null  object        
 11  pledged                   31953 non-null  float64       
 12  state           

### Obtain Categories

In [20]:
from ast import literal_eval
import numpy as np

In [28]:
# Make Categories as an attribute
category = df.category.apply(literal_eval)
main_category = []
for i, d in enumerate(category):
    try:
        main_category.append(d['parent_name'])
    except:
        main_category.append(np.nan)
df['main_category'] = main_category
df.main_category.value_counts()

Games           5649
Technology      3272
Design          3121
Publishing      2730
Art             2670
Comics          2425
Film & Video    2218
Fashion         1677
Music           1573
Food            1351
Crafts           525
Photography      475
Journalism       243
Theater          184
Dance             58
Name: main_category, dtype: int64

In [29]:
sub_category = []
for i, name in enumerate(d['name'] for d in category):
    sub_category.append(name)
df['sub_category'] = sub_category
df.sub_category.value_counts()

Tabletop Games    3849
Product Design    2590
Art               1505
Illustration      1242
Comic Books       1188
                  ... 
Photo                4
Residencies          4
Workshops            3
Chiptune             2
Quilts               1
Name: sub_category, Length: 160, dtype: int64

In [30]:
df[df.main_category.isna()]['sub_category'].value_counts()

Art             1505
Music            428
Comics           306
Design           302
Publishing       293
Technology       190
Crafts           169
Food             155
Games            121
Film & Video     112
Fashion           72
Journalism        48
Photography       41
Theater           24
Dance             16
Name: sub_category, dtype: int64

In [31]:
df.main_category.fillna(df.sub_category, inplace=True)
df.isna().sum()

blurb                       0
category                    0
converted_pledged_amount    0
country                     0
created_at                  0
deadline                    0
fx_rate                     0
goal                        0
id                          0
launched_at                 0
name                        0
pledged                     0
state                       0
state_changed_at            0
usd_pledged                 0
main_category               0
sub_category                0
dtype: int64

In [32]:
df.main_category.value_counts()

Games           5770
Art             4175
Technology      3462
Design          3423
Publishing      3023
Comics          2731
Film & Video    2330
Music           2001
Fashion         1749
Food            1506
Crafts           694
Photography      516
Journalism       291
Theater          208
Dance             74
Name: main_category, dtype: int64

### Obtain Country/Region

In [33]:
df.country.value_counts()

US    17795
GB     4410
CA     1912
DE      925
AU      835
FR      822
ES      768
IT      741
MX      734
HK      685
NL      328
SG      314
SE      309
JP      292
CH      191
DK      181
BE      147
NZ      118
IE      104
AT      103
PL       78
NO       69
GR       53
SI       21
LU       18
Name: country, dtype: int64

In [38]:
country_dic = {'GB':'EU', 'CA': 'AM_non_US', 'DE':'EU', 'AU':'AP', 'FR':'EU', 'ES': 'EU', 'IT':'EU', 
               'MX':'AM_non_US','HK':'AP', 'NL':'EU', 'SG':'AP', 'SE':'EU', 'JP':'AP', 'CH':'AP', 'DK':'EU', 
               'BE':'EU', 'NZ':'AP', 'IE':'EU', 'AT':'EU', 'PL':'EU', 'NO':'EU', 'GR':'EU', 'SI':'EU', 'LU':'EU'}
df = df.replace({"country": country_dic})

In [43]:
df.rename({'country':'region'}, axis = 1, inplace = True)

In [44]:
df.head()

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,usd_pledged,main_category,sub_category
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.0,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.0,5000.0,1091394927,2020-06-09 09:14:06,Snuggy Buddy Baby - The First Wearable Lovey B...,5643.0,successful,2020-07-09 09:00:00,5643.0,Design,Product Design
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.0,7700.0,1892452780,2020-06-13 05:22:08,Yono Clip,9857.44,successful,2020-07-13 05:22:08,9857.44,Design,Product Design
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.0,10000.0,591954572,2020-03-24 10:02:22,Tenkara Rod Co. - The Beartooth Rod - Go Anywh...,116629.0,successful,2020-04-23 19:33:00,116629.0,Design,Product Design
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.0,10000.0,1488247508,2020-07-28 08:05:39,Kind Cup - the innovative menstrual cup,11880.0,successful,2020-08-26 23:59:01,11880.0,Design,Product Design


In [45]:
df.region.value_counts()

US           17795
EU            9077
AM_non_US     2646
AP            2435
Name: region, dtype: int64

### Standardize currency values

In [47]:
df.columns

Index(['blurb', 'category', 'converted_pledged_amount', 'region', 'created_at',
       'deadline', 'fx_rate', 'goal', 'id', 'launched_at', 'name', 'pledged',
       'state', 'state_changed_at', 'usd_pledged', 'main_category',
       'sub_category'],
      dtype='object')

In [48]:
df['goal'] = df.goal.astype(float)
df['fx_rate'] = df.fx_rate.astype(float)
df['goal_usd'] = df.goal * df.fx_rate
df.rename({'usd_pledged':'pledged_usd'}, axis = 1, inplace = True)
df.head()

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,pledged_usd,main_category,sub_category,goal_usd
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.0,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books,3410.199225
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.0,5000.0,1091394927,2020-06-09 09:14:06,Snuggy Buddy Baby - The First Wearable Lovey B...,5643.0,successful,2020-07-09 09:00:00,5643.0,Design,Product Design,5000.0
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.0,7700.0,1892452780,2020-06-13 05:22:08,Yono Clip,9857.44,successful,2020-07-13 05:22:08,9857.44,Design,Product Design,7700.0
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.0,10000.0,591954572,2020-03-24 10:02:22,Tenkara Rod Co. - The Beartooth Rod - Go Anywh...,116629.0,successful,2020-04-23 19:33:00,116629.0,Design,Product Design,10000.0
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.0,10000.0,1488247508,2020-07-28 08:05:39,Kind Cup - the innovative menstrual cup,11880.0,successful,2020-08-26 23:59:01,11880.0,Design,Product Design,10000.0


### Get number of words in blurb & name

In [65]:
def num_words(row):
    try:
        row['blurb_len'] = len(row['blurb'].split())
    except TypeError: 
        print[row['blurb']]
    try:
        row['name_len'] = len(row['name'].split())
    except TypeError:
        print[row['name']]
    return row

In [66]:
df['blurb'] = df.blurb.astype(str)
df['name'] = df.name.astype(str)
df.apply(num_words, axis = 1)

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,pledged_usd,main_category,sub_category,goal_usd,blurb_len,name_len
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.00,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21,8
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.00000,5000.0,1091394927,2020-06-09 09:14:06,Snuggy Buddy Baby - The First Wearable Lovey B...,5643.00,successful,2020-07-09 09:00:00,5643.000000,Design,Product Design,5000.000000,22,9
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.00000,7700.0,1892452780,2020-06-13 05:22:08,Yono Clip,9857.44,successful,2020-07-13 05:22:08,9857.440000,Design,Product Design,7700.000000,26,2
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.00000,10000.0,591954572,2020-03-24 10:02:22,Tenkara Rod Co. - The Beartooth Rod - Go Anywh...,116629.00,successful,2020-04-23 19:33:00,116629.000000,Design,Product Design,10000.000000,13,12
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.00000,10000.0,1488247508,2020-07-28 08:05:39,Kind Cup - the innovative menstrual cup,11880.00,successful,2020-08-26 23:59:01,11880.000000,Design,Product Design,10000.000000,21,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187809,Custom built game table made by gamers.,"{""id"":12,""name"":""Games"",""slug"":""games"",""positi...",197,US,2020-07-08 15:17:30,2020-08-05 13:00:00,1.00000,10.0,931225606,2020-07-23 10:14:17,High Quality Game Tables,197.00,successful,2020-08-05 13:00:00,197.000000,Games,Games,10.000000,7,4
1187896,A whimsical series of collectible pins and sti...,"{""id"":1,""name"":""Art"",""slug"":""art"",""position"":1...",1298,US,2020-03-23 09:35:22,2020-04-27 20:59:00,1.00000,100.0,2041689538,2020-03-27 08:23:12,Where My Peeps At?,1298.00,successful,2020-04-27 20:59:00,1298.000000,Art,Art,100.000000,16,4
1188138,Magical potion-themed pins inspired by the sol...,"{""id"":1,""name"":""Art"",""slug"":""art"",""position"":1...",2206,US,2020-03-30 05:51:26,2020-05-08 20:59:00,1.00000,400.0,173818091,2020-04-16 21:01:00,Love & Potion - A LOONA-inspired enamel pin se...,2206.00,successful,2020-05-08 20:59:01,2206.000000,Art,Art,400.000000,19,11
1188236,Furry anthro playing cards with corresponding ...,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",135,US,2020-10-30 19:57:40,2021-01-04 20:34:02,1.00000,5000.0,1384927751,2020-11-05 20:34:02,Willow Standard Playing Deck,135.00,failed,2021-01-04 20:34:02,135.000000,Games,Playing Cards,5000.000000,8,4


### Get durations

In [76]:
df['campaign_duration'] = (df['deadline'] - df['launched_at']).dt.days

In [77]:
df.head()

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,pledged_usd,main_category,sub_category,goal_usd,campaign_duration
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.0,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.0,5000.0,1091394927,2020-06-09 09:14:06,Snuggy Buddy Baby - The First Wearable Lovey B...,5643.0,successful,2020-07-09 09:00:00,5643.0,Design,Product Design,5000.0,29
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.0,7700.0,1892452780,2020-06-13 05:22:08,Yono Clip,9857.44,successful,2020-07-13 05:22:08,9857.44,Design,Product Design,7700.0,30
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.0,10000.0,591954572,2020-03-24 10:02:22,Tenkara Rod Co. - The Beartooth Rod - Go Anywh...,116629.0,successful,2020-04-23 19:33:00,116629.0,Design,Product Design,10000.0,30
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.0,10000.0,1488247508,2020-07-28 08:05:39,Kind Cup - the innovative menstrual cup,11880.0,successful,2020-08-26 23:59:01,11880.0,Design,Product Design,10000.0,29


In [78]:
df['create_to_launch_duration'] = (df['launched_at'] - df['created_at']).dt.days

In [79]:
df.head()

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,name,pledged,state,state_changed_at,pledged_usd,main_category,sub_category,goal_usd,campaign_duration,create_to_launch_duration
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,An Illustrated Children's Book - Kindness is C...,5008.0,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21,7
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.0,5000.0,1091394927,2020-06-09 09:14:06,Snuggy Buddy Baby - The First Wearable Lovey B...,5643.0,successful,2020-07-09 09:00:00,5643.0,Design,Product Design,5000.0,29,13
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.0,7700.0,1892452780,2020-06-13 05:22:08,Yono Clip,9857.44,successful,2020-07-13 05:22:08,9857.44,Design,Product Design,7700.0,30,58
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.0,10000.0,591954572,2020-03-24 10:02:22,Tenkara Rod Co. - The Beartooth Rod - Go Anywh...,116629.0,successful,2020-04-23 19:33:00,116629.0,Design,Product Design,10000.0,30,1182
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.0,10000.0,1488247508,2020-07-28 08:05:39,Kind Cup - the innovative menstrual cup,11880.0,successful,2020-08-26 23:59:01,11880.0,Design,Product Design,10000.0,29,10


### Get failed targets

In [83]:
df.state.value_counts(normalize=True)

successful    0.768566
failed        0.231434
Name: state, dtype: float64

In [88]:
df['Failed'] = (df['state'] == 'failed').astype(int)

In [89]:
df.head(20)

Unnamed: 0,blurb,category,converted_pledged_amount,region,created_at,deadline,fx_rate,goal,id,launched_at,...,pledged,state,state_changed_at,pledged_usd,main_category,sub_category,goal_usd,campaign_duration,create_to_launch_duration,Failed
0,Help Guy and Steph publish their illustrated c...,"{""id"":46,""name"":""Children's Books"",""slug"":""pub...",6114,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1.36408,2500.0,1846656783,2020-04-24 01:58:55,...,5008.0,successful,2020-05-15 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21,7,0
10,Support us by backing! Wearable Blanket featur...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",5643,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1.0,5000.0,1091394927,2020-06-09 09:14:06,...,5643.0,successful,2020-07-09 09:00:00,5643.0,Design,Product Design,5000.0,29,13,0
17,Yono Clip is the ultimate solution to keep you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9857,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1.0,7700.0,1892452780,2020-06-13 05:22:08,...,9857.44,successful,2020-07-13 05:22:08,9857.44,Design,Product Design,7700.0,30,58,0
24,Our new go to mini rod is ready for all of you...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",116629,US,2016-12-27 13:19:41,2020-04-23 19:33:00,1.0,10000.0,591954572,2020-03-24 10:02:22,...,116629.0,successful,2020-04-23 19:33:00,116629.0,Design,Product Design,10000.0,30,1182,0
25,Looking for a menstrual cup that is comfortabl...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",11880,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1.0,10000.0,1488247508,2020-07-28 08:05:39,...,11880.0,successful,2020-08-26 23:59:01,11880.0,Design,Product Design,10000.0,29,10,0
45,Optimize how you perform and feel. Dial in any...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",69374,US,2020-08-05 14:23:52,2020-10-08 10:52:32,1.0,65000.0,682900823,2020-09-01 10:52:32,...,69374.0,successful,2020-10-08 10:52:32,69374.0,Design,Product Design,65000.0,37,26,0
57,An inheritable writing instrument created with...,"{""id"":7,""name"":""Design"",""slug"":""design"",""posit...",3100,AP,2020-02-05 05:43:06,2020-04-03 06:01:21,0.128972,10000.0,1608805395,2020-03-04 06:01:21,...,24037.0,successful,2020-04-03 06:01:21,3088.915308,Design,Design,1289.7202,30,28,0
60,A powder-coated steel hammock hitch that attac...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",9121,US,2020-03-09 11:39:11,2020-07-15 23:59:00,1.0,5000.0,2127787680,2020-06-15 09:05:13,...,9121.0,successful,2020-07-15 23:59:01,9121.0,Design,Product Design,5000.0,30,97,0
84,Recycled Ocean plastic & Cotton waste turned i...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",28426,EU,2020-07-07 15:37:04,2020-09-05 02:45:21,1.21498,18700.0,339238889,2020-07-27 02:45:21,...,24013.0,successful,2020-09-05 02:45:21,27990.73544,Design,Product Design,22720.120951,40,19,0
86,FableWood is back with cute and cuddly creatio...,"{""id"":28,""name"":""Product Design"",""slug"":""desig...",109623,EU,2020-07-13 05:34:12,2020-10-03 13:30:00,0.163326,50000.0,206884957,2020-08-26 01:59:25,...,696371.17,successful,2020-10-03 13:30:00,110463.617769,Design,Product Design,8166.313,38,43,0


In [92]:
df.Failed.value_counts(normalize=True)

0    0.768566
1    0.231434
Name: Failed, dtype: float64

### drop the irrelevant columns

In [93]:
df.columns

Index(['blurb', 'category', 'converted_pledged_amount', 'region', 'created_at',
       'deadline', 'fx_rate', 'goal', 'id', 'launched_at', 'name', 'pledged',
       'state', 'state_changed_at', 'pledged_usd', 'main_category',
       'sub_category', 'goal_usd', 'campaign_duration',
       'create_to_launch_duration', 'Failed'],
      dtype='object')

In [94]:
df.drop(df[['blurb', 'category', 'converted_pledged_amount', 'fx_rate', 'goal',
           'name', 'pledged', 'state', 'state_changed_at']], axis = 1, inplace = True)

In [95]:
df.head()

Unnamed: 0,region,created_at,deadline,id,launched_at,pledged_usd,main_category,sub_category,goal_usd,campaign_duration,create_to_launch_duration,Failed
0,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,1846656783,2020-04-24 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21,7,0
10,US,2020-05-27 06:47:29,2020-07-09 09:00:00,1091394927,2020-06-09 09:14:06,5643.0,Design,Product Design,5000.0,29,13,0
17,US,2020-04-15 07:11:52,2020-07-13 05:22:08,1892452780,2020-06-13 05:22:08,9857.44,Design,Product Design,7700.0,30,58,0
24,US,2016-12-27 13:19:41,2020-04-23 19:33:00,591954572,2020-03-24 10:02:22,116629.0,Design,Product Design,10000.0,30,1182,0
25,US,2020-07-17 19:34:35,2020-08-26 23:59:00,1488247508,2020-07-28 08:05:39,11880.0,Design,Product Design,10000.0,29,10,0


In [96]:
df.set_index('id', inplace = True)

In [97]:
df.head()

Unnamed: 0_level_0,region,created_at,deadline,launched_at,pledged_usd,main_category,sub_category,goal_usd,campaign_duration,create_to_launch_duration,Failed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1846656783,EU,2020-04-16 05:18:45,2020-05-15 01:58:55,2020-04-24 01:58:55,6172.413435,Publishing,Children's Books,3410.199225,21,7,0
1091394927,US,2020-05-27 06:47:29,2020-07-09 09:00:00,2020-06-09 09:14:06,5643.0,Design,Product Design,5000.0,29,13,0
1892452780,US,2020-04-15 07:11:52,2020-07-13 05:22:08,2020-06-13 05:22:08,9857.44,Design,Product Design,7700.0,30,58,0
591954572,US,2016-12-27 13:19:41,2020-04-23 19:33:00,2020-03-24 10:02:22,116629.0,Design,Product Design,10000.0,30,1182,0
1488247508,US,2020-07-17 19:34:35,2020-08-26 23:59:00,2020-07-28 08:05:39,11880.0,Design,Product Design,10000.0,29,10,0


In [98]:
df.to_csv('data_post_pandemic.csv')