In [216]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
import re

## Import Dataset



In [217]:
df_merged = pd.read_csv("/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/kickstart_data_merged_with_empty.csv", dtype={"backer-amount":str})

print(len(df_merged))
# remove inconsistencies from webscraping and merging 
# this step was added after proposal submission, so the number of observations differ slightly.
# in other words this step is part of post-intermediate data cleaning in preparation for the final report
index = df_merged[pd.isnull(df_merged['story'])].index

df_merged = df_merged[pd.notnull(df_merged['story'])]
df_merged = df_merged[pd.notnull(df_merged['risk'])]
print(len(df_merged))

47341
30031


In [218]:
df_merged.columns

Index(['Unnamed: 0', 'Unnamed: 0_x', 'web-scraper-order',
       'web-scraper-start-url', 'Title', 'Description', 'Status',
       'Pledged-Amount', 'Funded-percent', 'Time left', 'Sphere', 'Location',
       'Image-src', 'Link', 'Link-href', 'backer-amount', 'video',
       'description_story', 'description_risks', 'rewards', 'updateCount',
       'commentCount', 'backers_count', 'blurb_x', 'category',
       'converted_pledged_amount', 'country', 'country_displayable_name',
       'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_exchange_rate', 'usd_pledged',
       'usd_type', 'main_url'

## Train Test Split After Cleaning (70-15-15)
https://www.researchgate.net/post/Removing_a_low_predictive_column_before_or_after_train_test_split#:~:text=Yes%2C%20you%20should%20remove%20the,but%20not%20before%20the%20split.

- Perform encoding for the target then split

- So we will apply the same cleaning to the test data at the end

In [219]:
# get the target variable - success or failure - binary classfication problem
df_merged = df_merged[(df_merged['Status'] == "successful") | (df_merged['Status'] == "failed")]
print('after remove \n', df_merged['Status'].value_counts())
df_merged['Status'] = df_merged['Status'].apply(lambda x: 0 if x=="failed" else 1)

train, test = train_test_split(df_merged, test_size=0.15, random_state=69, stratify=df_merged['state'])

after remove 
 successful    20569
failed         8556
Name: Status, dtype: int64


In [220]:
print("train size:", len(train))
print("test size", len(test))

train size: 24756
test size 4369


In [221]:
train['Status'].value_counts()

1    17482
0     7274
Name: Status, dtype: int64

## Intermediate Data Cleaning

In [222]:
#global remove list 
remove_list = []

### Remove columns with the same values for all rows





In [223]:
print("Columns with the same values in all of its rows are:")
for i in train: 
  if len(set(train[i])) == 1 or all(pd.isnull(train[i])):
    remove_list.append(i)
    print(i)

Columns with the same values in all of its rows are:
Pledged-Amount
Time left
Sphere
Location
disable_communication


### Remove columns that have the exact same values or very similar values as another column, and keep unique columns

There will be duplicate or very similar columns because:
- Merged dataset takes data from 2 data sources and they may overlap:
  - For the columns with very similar values:
    - Take the column with more information
    - Take the more recent column (Webscraper.io was the most recently scraped)
    - Take the column whose values can be compared with each other

In [224]:
### name and Title 
# some differences in name and Title - keep the more recent one which is Title
train[train['name'] != train['Title']][['Title', 'name']]
remove_list.append('name')

### Description, blurb_x, blurb_y
train[['blurb_x', 'blurb_y', 'Description']]
# blurb_y has significant amount of missing values so blurb_x has more information 
train[train['blurb_x'] != train['Description']][['blurb_x', 'Description']]
# prefer Description over blurb_x 
remove_list.extend(['blurb_y', 'blurb_x'])

### deadline and Time left 
# deadline is more informative because Time left is dynamic 
remove_list.append('Time left')

# ### location, Location, Country
train[['location', 'Location', 'country', 'country_displayable_name']]
# Location contains no useful information
remove_list.append("Location")
train['location'].iloc[1]
train[pd.isnull(train['location'])][['location', 'country', 'country_displayable_name']]
remove_list.append("country_displayable_name")
# location seems to contain more information than Country, but location has a few missing values while country has no missing values
# use values in country to impute missing entries in location, then remove country
train.loc[train['location'].isna(), 'location'] = train['country_displayable_name']
 
### state and Status
# Status is the more recent one 
remove_list.append('state')

### Remove columns that are redundant 

Data is redundant in helping us with our problem statement when:
- The data is metadata 
- The data contains urls that cannot be accessed
- Variables that have leaks information into target variable

In [225]:
# meta-data
remove_list.extend(['web-scraper-start-url', 'web-scraper-order', 'urls', 'Link', 'Link-href',
                   'Unnamed: 0', 'Funded-percent', 'Unnamed: 0_x', 'Unnamed: 0_y', 'currency_symbol',
                   'currency_trailing_code', 'current_currency', 'description_risks', 'description_story', 
                    'final_index', 'Image-src', 'currency', 
                   'friends', 'fx_rate', 'is_backing', 'is_starrable', 'is_starred', 'ivan_index',
                   'main_url', 'permissions', 'photo', 'profile', 'slug', 'source_url', 'state_changed_at',
                   'static_usd_rate', 'usd_exchange_rate', 'usd_type', 'country', 'final_index', 'creator',
                   'pledged'])

# Features that leak information into target variable 
remove_list.extend(['backer-amount', 'backers_count', 'Pledged-Amount', 'updateCount', 'commentCount', 
                   'spotlight', 'converted_pledged_amount', 'usd_pledged'])

In [226]:
#drop all at once
remove_list = list(set(remove_list))
train = train.drop(remove_list, axis = 1)
train.columns

Index(['Title', 'Description', 'Status', 'video', 'rewards', 'category',
       'created_at', 'deadline', 'goal', 'id', 'launched_at', 'location',
       'staff_pick', 'story', 'risk'],
      dtype='object')

### Rename columns to be more readable and convert columns to their correct formats



#### Parsing text features (Post Proposal)

In [227]:
# Remove rewards that got cut off while scraping and closes the dictionary
# Text cleaning for: rewards, description, description story, description risks
def clean_text(df):
    def clean_rewards(corpus):
        for row in corpus:
            reward_list = row
            try:
                tiers = ast.literal_eval(reward_list)
            except SyntaxError as e:
                i = len(reward_list) -1
                while reward_list[i] != "{":
                    i -= 1
                reward_list = reward_list[:(i-1)]
                reward_list += ']'
        return reward_list
    
    df['rewards'] = clean_rewards(df['rewards'])

    return df

In [228]:
train = train.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
train = train[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

train['staff_pick'] = train['staff_pick'].astype(int)
train['deadline'] = pd.to_datetime(train['deadline'], unit='s')  
train['launched_at'] = pd.to_datetime(train['launched_at'], unit='s')
train['created_at'] = pd.to_datetime(train['created_at'], unit='s')
train['category'] = train['category'].apply(lambda x: json.loads(x)['slug'].split("/")[0])
train['location'] = train['location'].apply(lambda x: json.loads(x)['country'])
train = clean_text(train)

In [229]:
train.head()

Unnamed: 0,id,name,description,description_story,description_risks,rewards,category,goal,deadline,location,state,staff_pick,video,launched_at,created_at
33288,307103100.0,"""To the things themselves!""","A new dance exploring both my allergy to ""abou...","November 2, 2018 OMG! We're just over halfway ...",We really need your support! Thank you.,"[{""rewards"":""Pledge US$ 100 or more\n\nAbout S...",dance,9000.0,2018-11-17 04:59:00,US,1,1,https://v2.kickstarter.com/1663805284-vaRunMxL...,2018-10-15 20:59:22,2018-10-09 01:27:56
30856,214319300.0,Journals 1990-2014,How did we get to the new century? Ride the wa...,Many thanks to my backers for getting us to th...,Here are the steps of my process:\r\n\r\nStep ...,"[{""rewards"":""Pledge US$ 100 or more\n\nAbout S...",publishing,5000.0,2015-04-09 14:39:44,US,1,1,https://v2.kickstarter.com/1664643846-VxuMrMK%...,2015-03-16 14:39:44,2015-02-24 17:29:35
30738,74750640.0,Bailarina Piano,Hemos conseguido poner en marcha el proyecto B...,-¿Te gustaría disfrutar de la mezcla de la dan...,El principal riesgo es el retrasar el proyecto...,"[{""rewards"":""Pledge US$ 100 or more\n\nAbout S...",music,650.0,2021-05-23 01:36:40,ES,1,0,https://v2.kickstarter.com/1664750650-Kfqx1NI3...,2021-05-03 01:36:40,2021-04-30 22:16:36
39633,1946709000.0,Lily Locksmith full length album,Help Lily Locksmith put out her final recordin...,"Lily Locksmith, with her powerful voice, rich ...",The biggest challange here is the time line. I...,"[{""rewards"":""Pledge US$ 100 or more\n\nAbout S...",music,34000.0,2022-04-14 07:21:20,SE,1,0,https://v2.kickstarter.com/1664734207-Ac%2F11N...,2022-03-20 07:21:20,2022-01-30 12:56:59
11522,537252400.0,Very Young Composers @ 20!,Help celebrate 20 years of empowering the youn...,"For 20 years now, the Very Young Composers pro...","Live streaming to South Korea, Finland, Venezu...","[{""rewards"":""Pledge US$ 100 or more\n\nAbout S...",music,20000.0,2016-05-06 18:45:32,US,1,1,https://v2.kickstarter.com/1664682609-xfA%2FLS...,2016-04-05 18:45:32,2016-02-24 16:11:49


In [230]:
train.columns

Index(['id', 'name', 'description', 'description_story', 'description_risks',
       'rewards', 'category', 'goal', 'deadline', 'location', 'state',
       'staff_pick', 'video', 'launched_at', 'created_at'],
      dtype='object')

In [231]:
len(train)
len(train.columns)

15

## Apply the same data cleaning to test set

In [232]:
# test.loc[test['location'].isna(), 'location'] = test['country_displayable_name']
test = test.drop(remove_list, axis = 1)

test = test.rename(columns={
    "Title":"name",
    "Description": "description",
    "story": "description_story",
    "risk": "description_risks",
    "Description": "description",
    "Status": "state"
})
test = test[['id', 'name', 'description', 
                       'description_story', 'description_risks', 'rewards',
                       'category', 'goal', 'deadline', 'location', 
                       'state','staff_pick', 'video', 'launched_at', 'created_at']]

test['staff_pick'] = test['staff_pick'].astype(int)
test['deadline'] = pd.to_datetime(test['deadline'], unit='s')  
test['launched_at'] = pd.to_datetime(test['launched_at'], unit='s')
test['created_at'] = pd.to_datetime(test['created_at'], unit='s')
test['category'] = test['category'].apply(lambda x: json.loads(x)['slug'].split("/")[0])
test['location'] = test['location'].apply(lambda x: json.loads(x)['country'])
test = clean_text(test)

## Export to excel

In [233]:
train.to_csv("/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/kickstarter_train.csv", index=False)
test.to_csv("/Users/ivankoh/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/1D/NUS Y3S1/BT4222/Data/kickstarter_test.csv", index=False)