In [139]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

## Import Dataset



In [140]:
df_merged = pd.read_csv("kickstart_data_merged_with_empty.csv", dtype={"backer-amount":str})

print(len(df_merged))
# remove inconsistencies from webscraping and merging 
# this step was added after proposal submission, so the number of observations differ slightly.
# in other words this step is part of post-intermediate data cleaning in preparation for the final report
index = df_merged[pd.isnull(df_merged['story'])].index

df_merged = df_merged[pd.notnull(df_merged['story'])]
print(len(df_merged))

47341
31135


In [141]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0_x', 'web-scraper-order',
       'web-scraper-start-url', 'Title', 'Description', 'Status',
       'Pledged-Amount', 'Funded-percent', 'Time left', 'Sphere', 'Location',
       'Image-src', 'Link', 'Link-href', 'backer-amount', 'video',
       'description_story', 'description_risks', 'rewards', 'updateCount',
       'commentCount', 'backers_count', 'blurb_x', 'category',
       'converted_pledged_amount', 'country', 'country_displayable_name',
       'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_exchange_rate', 'usd_pledged',
       'usd_type', 'main_url'

## Train Test Split After Cleaning (70-15-15)
https://www.researchgate.net/post/Removing_a_low_predictive_column_before_or_after_train_test_split#:~:text=Yes%2C%20you%20should%20remove%20the,but%20not%20before%20the%20split.

- Perform encoding for the target then split

- So we will apply the same cleaning to the test data at the end

In [142]:
# get the target variable - success or failure - binary classfication problem
df_merged = df_merged[(df_merged['Status'] == "successful") | (df_merged['Status'] == "failed")]
print('after remove \n', df_merged['Status'].value_counts())
df_merged['Status'] = df_merged['Status'].apply(lambda x: 0 if x=="failed" else 1)

train, test = train_test_split(df_merged, test_size=0.15, random_state=69, stratify=df_merged['state'])

after remove 
 successful    21657
failed         8570
Name: Status, dtype: int64


In [143]:
print("train size:", len(train))
print("test size", len(test))

train size: 25692
test size 4535


In [144]:
train['Status'].value_counts()

1    18409
0     7283
Name: Status, dtype: int64

## Intermediate Data Cleaning

In [145]:
#global remove list 
remove_list = []

### 3. Remove columns with the same values for all rows





In [146]:
print("Columns with the same values in all of its rows are:")
for i in train: 
  if len(set(train[i])) == 1 or all(pd.isnull(train[i])):
    remove_list.append(i)
    print(i)

Columns with the same values in all of its rows are:
Pledged-Amount
Time left
Sphere
Location
disable_communication


### 4. Remove columns that have the exact same values or very similar values as another column, and keep unique columns

There will be duplicate or very similar columns because:
- Merged dataset takes data from 2 data sources and they may overlap:
  - For the columns with very similar values:
    - Take the column with more information
    - Take the more recent column (Webscraper.io was the most recently scraped)
    - Take the column whose values can be compared with each other

In [147]:
### name and Title 
# some differences in name and Title - keep the more recent one which is Title
train[train['name'] != train['Title']][['Title', 'name']]
remove_list.append('name')

### Description, blurb_x, blurb_y
train[['blurb_x', 'blurb_y', 'Description']]
# blurb_y has significant amount of missing values so blurb_x has more information 
train[train['blurb_x'] != train['Description']][['blurb_x', 'Description']]
# prefer Description over blurb_x 
remove_list.extend(['blurb_y', 'blurb_x'])

### deadline and Time left 
# deadline is more informative because Time left is dynamic 
remove_list.append('Time left')

# ### location, Location, Country
train[['location', 'Location', 'country', 'country_displayable_name']]
# Location contains no useful information
remove_list.append("Location")
train['location'].iloc[1]
train[pd.isnull(train['location'])][['location', 'country', 'country_displayable_name']]
remove_list.append("country_displayable_name")
# location seems to contain more information than Country, but location has a few missing values while country has no missing values
# use values in country to impute missing entries in location, then remove country
train.loc[train['location'].isna(), 'location'] = train['country_displayable_name']
 
### state and Status
# Status is the more recent one 
remove_list.append('state')

### 5. Remove columns that are redundant 

Data is redundant in helping us with our problem statement when:
- The data is metadata 
- The data contains urls that cannot be accessed
- Variables that have leaks information into target variable

In [148]:
# meta-data
remove_list.extend(['web-scraper-start-url', 'web-scraper-order', 'urls', 'Link', 'Link-href',
                   'Unnamed: 0', 'Funded-percent', 'Unnamed: 0_x', 'Unnamed: 0_y', 'currency_symbol',
                   'currency_trailing_code', 'current_currency', 'description_risks', 'description_story', 
                    'final_index', 'Image-src', 'currency', 
                   'friends', 'fx_rate', 'is_backing', 'is_starrable', 'is_starred', 'ivan_index',
                   'main_url', 'permissions', 'photo', 'profile', 'slug', 'source_url', 'state_changed_at',
                   'static_usd_rate', 'usd_exchange_rate', 'usd_type', 'country', 'final_index', 'creator',
                   'pledged'])

# Features that leak information into target variable 
remove_list.extend(['backer-amount', 'backers_count', 'Pledged-Amount', 'updateCount', 'commentCount', 
                   'spotlight', 'converted_pledged_amount', 'usd_pledged'])

In [149]:
#drop all at once
remove_list = list(set(remove_list))
train = train.drop(remove_list, axis = 1)
train.columns

Index(['Title', 'Description', 'Status', 'video', 'rewards', 'category',
       'created_at', 'deadline', 'goal', 'id', 'launched_at', 'location',
       'staff_pick', 'story', 'risk'],
      dtype='object')

### 6. Rename columns to be more readable and convert columns to their correct formats



In [150]:
train = train.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
train = train[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

train['staff_pick'] = train['staff_pick'].astype(int)
train['deadline'] = pd.to_datetime(train['deadline'], unit='s')  
train['launched_at'] = pd.to_datetime(train['launched_at'], infer_datetime_format=True)
train['created_at'] = pd.to_datetime(train['launched_at'], infer_datetime_format=True)

In [151]:
train.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,goal,deadline,location,state,staff_pick,video,launched_at,created_at
24645,1193757000.0,DARKARTS ALBUM,Help DARKARTS record their next album!,To the genuine music fans! Welcome to DARKARTS...,We anticipate few risks to launching this proj...,"[{""rewards"":""Pledge US$ 25 or more\n\nAbout S$...","{""id"":40,""name"":""Indie Rock"",""analytics_name"":...",3400.0,2022-06-11 21:01:43,"{""id"":2391279,""name"":""Denver"",""slug"":""denver-c...",1,0,https://v2.kickstarter.com/1664695127-1NAeIZwv...,1970-01-01 00:00:01.652389303,1970-01-01 00:00:01.652389303
34127,1802286000.0,100% Hand-Poured Soy Candles w/ Wood Wicks in ...,I hand-pour in small batches 8oz. 100% soy can...,I am looking to raise funds in order to buy pe...,The only challenges that I can foresee would b...,"[{""rewards"":""Pledge US$ 1 or more\n\nAbout S$ ...","{""id"":343,""name"":""Candles"",""analytics_name"":""C...",1000.0,2016-01-20 23:00:54,"{""id"":2502716,""name"":""Sussex"",""slug"":""sussex-n...",1,0,,1970-01-01 00:00:01.450738854,1970-01-01 00:00:01.450738854
33100,95377780.0,First Move by Ori Flomin,First Move is a reflection on the mature danci...,First\nMove is a reflection on the mature danc...,The risks and challenges are the same as those...,"[{""rewards"":""Pledge US$ 10 or more\n\nAbout S$...","{""id"":254,""name"":""Performances"",""analytics_nam...",7500.0,2015-03-13 16:09:51,"{""id"":2459115,""name"":""New York"",""slug"":""new-yo...",1,1,https://v2.kickstarter.com/1663849120-vK6gi3cV...,1970-01-01 00:00:01.423674591,1970-01-01 00:00:01.423674591
42569,314843400.0,Looking for the Masters in Ricardo's Golden Shoes,A beautiful hardcover photo book with gold edg...,"One summer day in 2013, I was having breakfast...",The risks and challenges when publishing a pho...,"[{""rewards"":""Pledge €2 or more\n\nAbout S$ 3\n...","{""id"":276,""name"":""Fine Art"",""analytics_name"":""...",12000.0,2016-04-24 11:20:25,"{""id"":615702,""name"":""Paris"",""slug"":""paris-fr"",...",1,1,https://v2.kickstarter.com/1664123995-bH2MjakM...,1970-01-01 00:00:01.458818425,1970-01-01 00:00:01.458818425
47318,165520700.0,Deep Love: Going to the Moon,Deep Love is headed to the moon! (Via Oregon) ...,********** Stretch Goals************ We're com...,As we learned with our last kickstarter campai...,"[{""rewards"":""Pledge US$ 1 or more\n\nAbout US$...","{""id"":284,""name"":""Musical"",""analytics_name"":""M...",5000.0,2014-08-31 12:54:48,"{""id"":2487610,""name"":""Salt Lake City"",""slug"":""...",1,1,https://v2.kickstarter.com/1664440295-pSizLCYF...,1970-01-01 00:00:01.406897688,1970-01-01 00:00:01.406897688


In [152]:
train.columns

Index(['id', 'name', 'description', 'description_story', 'description_risks',
       'rewards', 'category', 'goal', 'deadline', 'location', 'state',
       'staff_pick', 'video', 'launched_at', 'created_at'],
      dtype='object')

In [153]:
len(train)
len(train.columns)

15

## Apply the same data cleaning to test set

In [154]:
test.loc[test['location'].isna(), 'location'] = test['country_displayable_name']
test = test.drop(remove_list, axis = 1)

test = test.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
test = test[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

test['staff_pick'] = test['staff_pick'].astype(int)
test['deadline'] = pd.to_datetime(test['deadline'], unit='s')  
test['launched_at'] = pd.to_datetime(test['launched_at'], infer_datetime_format=True)
test['created_at'] = pd.to_datetime(test['launched_at'], infer_datetime_format=True)

## Export to excel

In [155]:
train.to_csv("data/kickstarter_train.csv", index=False)
test.to_csv("data/kickstarter_test.csv", index=False)