In [46]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import re
import json

## Import Dataset



In [47]:
df_merged = pd.read_csv("/Users/bandy/Downloads/data/kickstart_data_merged_with_empty.csv", dtype={"backer-amount":str})
print(len(df_merged))
# remove inconsistencies from webscraping and merging 
# this step was added after proposal submission, so the number of observations differ slightly.
# in other words this step is part of post-intermediate data cleaning in preparation for the final report
index = df_merged[pd.isnull(df_merged['story'])].index

df_merged = df_merged[pd.notnull(df_merged['story'])]
df_merged = df_merged[pd.notnull(df_merged['risk'])]
print(len(df_merged))

47341
30031


In [48]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0_x', 'web-scraper-order',
       'web-scraper-start-url', 'Title', 'Description', 'Status',
       'Pledged-Amount', 'Funded-percent', 'Time left', 'Sphere', 'Location',
       'Image-src', 'Link', 'Link-href', 'backer-amount', 'video',
       'description_story', 'description_risks', 'rewards', 'updateCount',
       'commentCount', 'backers_count', 'blurb_x', 'category',
       'converted_pledged_amount', 'country', 'country_displayable_name',
       'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_exchange_rate', 'usd_pledged',
       'usd_type', 'main_url'

## Train Test Split After Cleaning (70-15-15)
https://www.researchgate.net/post/Removing_a_low_predictive_column_before_or_after_train_test_split#:~:text=Yes%2C%20you%20should%20remove%20the,but%20not%20before%20the%20split.

- Perform encoding for the target then split

- So we will apply the same cleaning to the test data at the end

In [49]:
# get the target variable - success or failure - binary classfication problem
df_merged = df_merged[(df_merged['Status'] == "successful") | (df_merged['Status'] == "failed")]
print('after remove \n', df_merged['Status'].value_counts())
df_merged['Status'] = df_merged['Status'].apply(lambda x: 0 if x=="failed" else 1)

train, test = train_test_split(df_merged, test_size=0.15, random_state=69, stratify=df_merged['state'])

after remove 
 successful    20569
failed         8556
Name: Status, dtype: int64


In [50]:
print("train size:", len(train))
print("test size", len(test))

train size: 24756
test size 4369


In [51]:
train['Status'].value_counts()

1    17482
0     7274
Name: Status, dtype: int64

## Intermediate Data Cleaning

In [52]:
#global remove list 
remove_list = []

### Remove columns with the same values for all rows





In [53]:
print("Columns with the same values in all of its rows are:")
for i in train: 
  if len(set(train[i])) == 1 or all(pd.isnull(train[i])):
    remove_list.append(i)
    print(i)

Columns with the same values in all of its rows are:
Pledged-Amount
Time left
Sphere
Location
disable_communication


### Remove columns that have the exact same values or very similar values as another column, and keep unique columns

There will be duplicate or very similar columns because:
- Merged dataset takes data from 2 data sources and they may overlap:
  - For the columns with very similar values:
    - Take the column with more information
    - Take the more recent column (Webscraper.io was the most recently scraped)
    - Take the column whose values can be compared with each other

In [54]:
### name and Title 
# some differences in name and Title - keep the more recent one which is Title
train[train['name'] != train['Title']][['Title', 'name']]
remove_list.append('name')

### Description, blurb_x, blurb_y
train[['blurb_x', 'blurb_y', 'Description']]
# blurb_y has significant amount of missing values so blurb_x has more information 
train[train['blurb_x'] != train['Description']][['blurb_x', 'Description']]
# prefer Description over blurb_x 
remove_list.extend(['blurb_y', 'blurb_x'])

### deadline and Time left 
# deadline is more informative because Time left is dynamic 
remove_list.append('Time left')

# ### location, Location, Country
train[['location', 'Location', 'country', 'country_displayable_name']]
# Location contains no useful information
remove_list.append("Location")
train['location'].iloc[1]
train[pd.isnull(train['location'])][['location', 'country', 'country_displayable_name']]
remove_list.append("country_displayable_name")
# location seems to contain more information than Country, but location has a few missing values while country has no missing values
# use values in country to impute missing entries in location, then remove country
train.loc[train['location'].isna(), 'location'] = train['country_displayable_name']
 
### state and Status
# Status is the more recent one 
remove_list.append('state')

### Remove columns that are redundant 

Data is redundant in helping us with our problem statement when:
- The data is metadata 
- The data contains urls that cannot be accessed
- Variables that have leaks information into target variable

In [55]:
# meta-data
remove_list.extend(['web-scraper-start-url', 'web-scraper-order', 'urls', 'Link', 'Link-href',
                   'Unnamed: 0', 'Funded-percent', 'Unnamed: 0_x', 'Unnamed: 0_y', 'currency_symbol',
                   'currency_trailing_code', 'current_currency', 'description_risks', 'description_story', 
                    'final_index', 'Image-src', 'currency', 
                   'friends', 'fx_rate', 'is_backing', 'is_starrable', 'is_starred', 'ivan_index',
                   'main_url', 'permissions', 'photo', 'profile', 'slug', 'source_url', 'state_changed_at',
                   'static_usd_rate', 'usd_exchange_rate', 'usd_type', 'country', 'final_index', 'creator',
                   'pledged'])

# Features that leak information into target variable 
remove_list.extend(['backer-amount', 'backers_count', 'Pledged-Amount', 'updateCount', 'commentCount', 
                   'spotlight', 'converted_pledged_amount', 'usd_pledged'])

In [56]:
#drop all at once
remove_list = list(set(remove_list))
train = train.drop(remove_list, axis = 1)
train.columns

Index(['Title', 'Description', 'Status', 'video', 'rewards', 'category',
       'created_at', 'deadline', 'goal', 'id', 'launched_at', 'location',
       'staff_pick', 'story', 'risk'],
      dtype='object')

### Rename columns to be more readable and convert columns to their correct formats



#### Parsing text features 

In [57]:
# Remove rewards that got cut off while scraping and closes the dictionary
# Text cleaning for: rewards, description, description story, description risks

# def clean_text_old(df):
#     def clean_rewards(corpus):
#         for row in corpus:
#             reward_list = row
#             try:
#                 tiers = ast.literal_eval(reward_list)
#             except SyntaxError as e:
#                 i = len(reward_list) -1
#                 while reward_list[i] != "{":
#                     i -= 1
#                 reward_list = reward_list[:(i-1)]
#                 reward_list += ']'
#         return reward_list
#     df['rewards'] = clean_rewards(df['rewards'])
#     return df

def clean_text(df):
    def clean_rewards(reward_list):
        try:
            tiers = ast.literal_eval(reward_list)
        except SyntaxError as e:
            i = len(reward_list) -1
            while reward_list[i] != "{":
                i -= 1
            reward_list = reward_list[:(i-1)]
            reward_list += ']'
        return reward_list
    df["rewards"] = df["rewards"].apply(lambda x : clean_rewards(x))
    return df

        

#### Readable Columns

In [58]:
train = train.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
train = train[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

train['staff_pick'] = train['staff_pick'].astype(int)
train['deadline'] = pd.to_datetime(train['deadline'], unit='s')  
train['launched_at'] = pd.to_datetime(train['launched_at'], unit='s')
train['created_at'] = pd.to_datetime(train['created_at'], unit='s')
train['category'] = train['category'].apply(lambda x: json.loads(x)['slug'].split("/")[0])
train['location'] = train['location'].apply(lambda x: json.loads(x)['country'])
train = clean_text(train)

In [59]:
train.columns

Index(['id', 'name', 'description', 'description_story', 'description_risks',
       'rewards', 'category', 'goal', 'deadline', 'location', 'state',
       'staff_pick', 'video', 'launched_at', 'created_at'],
      dtype='object')

In [60]:
len(train)
len(train.columns)

15

## Apply the same data cleaning to test set

In [61]:
# test.loc[test['location'].isna(), 'location'] = test['country_displayable_name']
test = test.drop(remove_list, axis = 1)

test = test.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
test = test[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

test['staff_pick'] = test['staff_pick'].astype(int)
test['deadline'] = pd.to_datetime(test['deadline'], unit='s')  
test['launched_at'] = pd.to_datetime(test['launched_at'], unit='s')
test['created_at'] = pd.to_datetime(test['created_at'], unit='s')
test['category'] = test['category'].apply(lambda x: json.loads(x)['slug'].split("/")[0])
test['location'] = test['location'].apply(lambda x: json.loads(x)['country'])
test = clean_text(test)

## Export to excel

In [62]:
train.to_csv("./data/train/kickstarter_train.csv", index=False)
test.to_csv("./data/test/kickstarter_test.csv", index=False)

In [63]:
test.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,goal,deadline,location,state,staff_pick,video,launched_at,created_at
34790,1766053000.0,Samson The Short Film,A former boxing champion must fight to save hi...,The film will be submitted to various festival...,"We've shot the footage, but we need help takin...","[{""rewards"":""Pledge US$ 5 or more\n\nAbout S$ ...",film & video,3000.0,2016-05-14 07:15:38,US,1,0,https://v2.kickstarter.com/1663906404-wOtoc0aU...,2016-04-14 07:15:38,2016-04-13 16:26:29
19711,572002600.0,Unmortal : Out of Time Short Film,A Short film for a time-bending Fantasy series...,"With your help, we can create a 10-minute Fant...","While we work hard to mitigate our risks, the ...","[{""rewards"":""Pledge US$ 1 or more\n\nAbout S$ ...",film & video,2500.0,2021-07-01 01:00:00,US,1,0,,2021-06-15 15:49:37,2021-05-23 23:22:33
16014,1639267000.0,The Nuclear Family travels to Barcelona this J...,"Continue ""The Nuclear Family"" at the Can Serra...",Hello friends of kickstarter! My name is Aust...,"I dont expect production set backs, and the re...","[{""rewards"":""Pledge US$ 1 or more\n\nAbout S$ ...",art,4200.0,2013-04-20 06:59:00,ES,1,1,https://v2.kickstarter.com/1664116092-mdW9sNO%...,2013-03-21 20:10:32,2013-02-19 09:04:56
38436,1866271000.0,Spiritual Business Basics and Spotlight,My goal is to create a community that will hel...,"For those of you who don't know me, my name is...",The only challenges I foresee are related to n...,"[{""rewards"":""Pledge €10 or more\n\nAbout S$ 15...",publishing,3000.0,2018-02-23 16:04:50,AT,0,0,,2018-01-24 16:04:50,2018-01-15 15:54:37
44735,19056750.0,MyBodyModel: Fashion Sketch Templates to Your ...,Fashion sketch templates (croquis) made to you...,MyBodyModel's mission is to create body-positi...,We have been working for more than 6 months to...,"[{""rewards"":""Pledge US$ 10 or more\n\nAbout S$...",fashion,20000.0,2017-08-24 22:00:00,US,1,1,https://v2.kickstarter.com/1663784821-8d8QWgz%...,2017-08-01 10:51:42,2017-04-11 11:40:36
