In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from datetime import datetime
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

## Import Dataset



In [None]:
# Load the merged Kickstarter data; "backer-amount" is read as text instead of being parsed.
df_merged = pd.read_csv(
    "kickstart_data_merged_with_empty.csv",
    dtype={"backer-amount": str},
)
print(len(df_merged))

# Remove inconsistencies left over from web scraping and merging: rows without a
# story or without a risk text are dropped.
# This step was added after the proposal submission, so the number of observations
# differs slightly (post-intermediate cleaning in preparation for the final report).
index = df_merged.index[df_merged['story'].isnull()]  # labels of the rows with no story

df_merged = df_merged.dropna(subset=['story', 'risk'])
print(len(df_merged))

In [None]:
# Display the column labels of df_merged.
df_merged.columns

## Train Test Split After Cleaning (70-15-15)
https://www.researchgate.net/post/Removing_a_low_predictive_column_before_or_after_train_test_split#:~:text=Yes%2C%20you%20should%20remove%20the,but%20not%20before%20the%20split.

- Encode the target variable first, then split the data.

- Cleaning decisions are made on the training data, and the same cleaning is applied to the test data at the end.

In [None]:
# Get the target variable - success or failure - a binary classification problem.
# Keep only finished campaigns. .copy() makes the filtered frame independent, so the
# assignment to 'Status' below does not trigger pandas' SettingWithCopyWarning.
df_merged = df_merged[df_merged['Status'].isin(["successful", "failed"])].copy()
print('after remove \n', df_merged['Status'].value_counts())

# Encode the target: failed -> 0, successful -> 1 (no other values remain after the filter).
df_merged['Status'] = df_merged['Status'].map({"failed": 0, "successful": 1})

# Hold out 15% of the rows as the test set.
# Stratify on the encoded target ('Status'), not on the older 'state' column: 'state'
# is a different column that is dropped later, so stratifying on it does not guarantee
# the success/failure ratio of the actual target is preserved in both splits.
train, test = train_test_split(
    df_merged,
    test_size=0.15,
    random_state=69,
    stratify=df_merged['Status'],
)

In [None]:
# Report how many rows ended up in each split.
for label, frame in (("train size:", train), ("test size", test)):
    print(label, len(frame))

In [None]:
# Display how many training rows fall into each value of the 'Status' column.
train['Status'].value_counts()

## Intermediate Data Cleaning

In [None]:
# Global list of column names to remove; the cleaning cells below add to it
# and the columns are dropped from the data in a single step afterwards.
remove_list = []

### 3. Remove columns with the same values for all rows





In [None]:
print("Columns with the same values in all of its rows are:")
for column in train.columns:
    values = train[column]
    # A column carries no information when it holds a single distinct value,
    # or when every entry is missing (checked separately from the set-based test).
    if len(set(values)) == 1 or values.isnull().all():
        remove_list.append(column)
        print(column)

### 4. Remove columns that have the exact same values or very similar values as another column, and keep unique columns

There will be duplicate or very similar columns because:
- Merged dataset takes data from 2 data sources and they may overlap:
  - For the columns with very similar values:
    - Take the column with more information
    - Take the more recent column (Webscraper.io was the most recently scraped)
    - Take the column whose values can be compared with each other

In [None]:
# NOTE: the bare selection expressions in this cell were used for inspection; because
# none of them is the last statement of the cell, their results are not displayed.

### name and Title
# The two columns differ for some rows - keep the more recent one, which is Title.
train.loc[train['name'] != train['Title'], ['Title', 'name']]
remove_list.append('name')

### Description, blurb_x, blurb_y
train.loc[:, ['blurb_x', 'blurb_y', 'Description']]
# blurb_y has a significant amount of missing values, so blurb_x has more information.
train.loc[train['blurb_x'] != train['Description'], ['blurb_x', 'Description']]
# Description is preferred over blurb_x, so both blurb columns are removed.
remove_list.extend(['blurb_y', 'blurb_x'])

### deadline and Time left
# deadline is more informative because Time left is dynamic.
remove_list.append('Time left')

### location, Location, country
train.loc[:, ['location', 'Location', 'country', 'country_displayable_name']]
# Location contains no useful information.
remove_list.append("Location")
train['location'].iat[1]
train.loc[train['location'].isna(), ['location', 'country', 'country_displayable_name']]
remove_list.append("country_displayable_name")
# location seems to contain more information than the country columns, but it has a few
# missing values. Fill those gaps from country_displayable_name (same rows, matched by
# index); the country columns themselves are queued for removal.
location_missing = train['location'].isna()
train.loc[location_missing, 'location'] = train.loc[location_missing, 'country_displayable_name']

### state and Status
# Status is the more recent one.
remove_list.append('state')

### 5. Remove columns that are redundant 

Data is redundant in helping us with our problem statement when:
- The data is metadata 
- The data contains urls that cannot be accessed
- The variable leaks information about the target variable

In [None]:
# Metadata columns (original label: "meta-data").
# 'final_index' is listed twice, as in the original list; the repeat is harmless
# because remove_list is de-duplicated before the columns are dropped.
metadata_columns = [
    'web-scraper-start-url', 'web-scraper-order', 'urls', 'Link', 'Link-href',
    'Unnamed: 0', 'Funded-percent', 'Unnamed: 0_x', 'Unnamed: 0_y', 'currency_symbol',
    'currency_trailing_code', 'current_currency', 'description_risks', 'description_story',
    'final_index', 'Image-src', 'currency',
    'friends', 'fx_rate', 'is_backing', 'is_starrable', 'is_starred', 'ivan_index',
    'main_url', 'permissions', 'photo', 'profile', 'slug', 'source_url', 'state_changed_at',
    'static_usd_rate', 'usd_exchange_rate', 'usd_type', 'country', 'final_index', 'creator',
    'pledged',
]
remove_list.extend(metadata_columns)

# Features that leak information into the target variable.
leaky_columns = [
    'backer-amount', 'backers_count', 'Pledged-Amount', 'updateCount', 'commentCount',
    'spotlight', 'converted_pledged_amount', 'usd_pledged',
]
remove_list.extend(leaky_columns)

In [None]:
# De-duplicate the collected names, then drop every listed column in one call.
remove_list = list(set(remove_list))
train = train.drop(columns=remove_list)
# Show which columns are left.
train.columns

### 6. Rename columns to be more readable and convert columns to their correct formats



In [None]:
# Rename to readable names, keep the final columns in a fixed order, and convert types.
# Written as one chain so every step returns a new frame: assigning columns onto the
# result of a column selection can raise pandas' SettingWithCopyWarning.
# The rename mapping lists "Description" once (the original dict literal repeated the key).
train = (
    train
    .rename(columns={
        "Title": "name",
        "Description": "description",
        "story": "description_story",
        "risk": "description_risks",
        "Status": "state",
    })
    .loc[:, ['id', 'name', 'description',
             'description_story', 'description_risks', 'rewards',
             'category', 'goal', 'deadline', 'location',
             'state', 'staff_pick', 'video', 'launched_at', 'created_at']]
    .assign(
        staff_pick=lambda df: df['staff_pick'].astype(int),
        # The three timestamp columns are interpreted as seconds since the Unix epoch.
        deadline=lambda df: pd.to_datetime(df['deadline'], unit='s'),
        launched_at=lambda df: pd.to_datetime(df['launched_at'], unit='s'),
        created_at=lambda df: pd.to_datetime(df['created_at'], unit='s'),
    )
)

In [None]:
# Preview the first rows of the cleaned training frame.
train.head()

In [None]:
# Display the column labels of the cleaned training frame.
train.columns

In [None]:
# (number of rows, number of columns) of the cleaned training frame.
# A cell only displays its last expression, so two separate len(...) lines would
# show the column count and silently discard the row count.
train.shape

## Apply the same data cleaning to test set

In [None]:
# Apply to the test set the same cleaning steps that were applied to the training set.
#
# The imputation and the drop must both run:
# - without the imputation, missing 'location' values stay missing in test only;
# - without the drop, the rename below creates duplicate column labels, because the raw
#   frame still has its own 'name', 'state', 'description_story' and 'description_risks'
#   columns (all of them are in remove_list). Selecting a duplicated label returns every
#   matching column, so the exported test file would contain both old and new columns.
test = (
    test
    # Fill missing locations from the country display name (must happen before the drop,
    # since 'country_displayable_name' is one of the dropped columns).
    .assign(location=lambda df: df['location'].fillna(df['country_displayable_name']))
    .drop(columns=remove_list)
    .rename(columns={
        "Title": "name",
        "Description": "description",
        "story": "description_story",
        "risk": "description_risks",
        "Status": "state",
    })
    .loc[:, ['id', 'name', 'description',
             'description_story', 'description_risks', 'rewards',
             'category', 'goal', 'deadline', 'location',
             'state', 'staff_pick', 'video', 'launched_at', 'created_at']]
    .assign(
        staff_pick=lambda df: df['staff_pick'].astype(int),
        # The three timestamp columns are interpreted as seconds since the Unix epoch.
        deadline=lambda df: pd.to_datetime(df['deadline'], unit='s'),
        launched_at=lambda df: pd.to_datetime(df['launched_at'], unit='s'),
        created_at=lambda df: pd.to_datetime(df['created_at'], unit='s'),
    )
)

In [None]:
# Preview the first rows of the cleaned test frame.
test.head()

## Export to CSV

In [None]:
# Write each cleaned split to its CSV file, without the index column.
for frame, csv_path in (
    (train, "data/kickstarter_train.csv"),
    (test, "data/kickstarter_test.csv"),
):
    frame.to_csv(csv_path, index=False)