# Table of Contents
1. Data Wrangling
2

Introduction 

In [1]:
import numpy as np
import pandas as pd
import glob
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.models import Model

Using TensorFlow backend.


In [2]:
#define path and set up to import csv files
path = 'Dataset'
# glob returns matches in a filesystem-dependent order; sorting makes the row
# order of the concatenated frame identical on every machine and every re-run
all_files = sorted(glob.glob(path + "/*.csv"))

In [3]:
#read every csv file into its own dataframe, then stack them into a single frame
li = [pd.read_csv(csv_path, index_col=None) for csv_path in all_files]
# ignore_index=True discards the per-file row labels and renumbers rows 0..n-1
frame = pd.concat(li, ignore_index=True)

In [4]:
#the per-file dataframes were already merged into `frame` by the previous cell;
#repeating pd.concat here would only rebuild an identical frame, so just preview it
frame.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,creator,currency,currency_symbol,currency_trailing_code,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,103,"Funding the mixing, mastering, and promotion o...","{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",5612,US,1456593666,"{""id"":1531055178,""name"":""JC Stroebel and Henry...",USD,$,True,...,john-chuck-and-the-class-debut-ep,https://www.kickstarter.com/discover/categorie...,True,True,successful,1459964983,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",5612.0,domestic
1,318,We follow the challenges and achievements of g...,"{""id"":30,""name"":""Documentary"",""slug"":""film & v...",26237,US,1495058182,"{""id"":652875854,""name"":""Matthew Temple"",""is_re...",USD,$,True,...,girls-of-summer-big-diamond-dreams,https://www.kickstarter.com/discover/categorie...,True,True,successful,1499054401,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",26237.0,domestic
2,0,Task No.1 is inspired by the history and expre...,"{""id"":38,""name"":""Electronic Music"",""slug"":""mus...",0,GB,1357630802,"{""id"":1699678150,""name"":""Sonny Phillips"",""slug...",GBP,£,False,...,task-no1,https://www.kickstarter.com/discover/categorie...,False,False,failed,1362937678,1.614583,"{""web"":{""project"":""https://www.kickstarter.com...",0.0,international
3,22,MAJOR KEY ALERT - Future Heroes is a Denver ra...,"{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",1575,US,1455591114,"{""id"":518056209,""name"":""Future Heroes"",""is_reg...",USD,$,True,...,future-heroes-sxsw-is-calling,https://www.kickstarter.com/discover/categorie...,True,False,successful,1457935201,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1575.0,domestic
4,17,We're traveling to Rhode Island to film Mako a...,"{""id"":30,""name"":""Documentary"",""slug"":""film & v...",3290,US,1465224753,"{""id"":632937188,""name"":""Ryan Walton"",""is_regis...",USD,$,True,...,pelagic-shark-diving-shoot,https://www.kickstarter.com/discover/categorie...,True,False,successful,1467825676,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",3290.0,domestic


In [5]:
# summarize the merged frame: column names, dtypes and non-null counts
frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207621 entries, 0 to 207620
Data columns (total 37 columns):
backers_count               207621 non-null int64
blurb                       207613 non-null object
category                    207621 non-null object
converted_pledged_amount    207621 non-null int64
country                     207621 non-null object
created_at                  207621 non-null int64
creator                     207621 non-null object
currency                    207621 non-null object
currency_symbol             207621 non-null object
currency_trailing_code      207621 non-null bool
current_currency            207621 non-null object
deadline                    207621 non-null int64
disable_communication       207621 non-null bool
friends                     444 non-null object
fx_rate                     207621 non-null float64
goal                        207621 non-null float64
id                          207621 non-null int64
is_backing                  444 

In [6]:
#drop unrelated features
# columns that are not used by the rest of the wrangling steps
unrelated_columns = [
    'backers_count', 'category', 'converted_pledged_amount', 'created_at',
    'creator', 'currency', 'currency_symbol', 'currency_trailing_code',
    'current_currency', 'deadline', 'friends', 'id', 'is_backing', 'is_starrable',
    'is_starred', 'location', 'name', 'slug', 'permissions', 'photo', 'pledged',
    'profile', 'source_url', 'spotlight', 'staff_pick', 'urls', 'static_usd_rate',
    'usd_pledged', 'usd_type',
]
frame = frame.drop(unrelated_columns, axis='columns')

In [7]:
#convert all goal to USD
# NOTE(review): assumes fx_rate is the project-currency -> USD rate — confirm
frame['goal'] = frame['goal'] * frame['fx_rate']

#keep only projects whose communication has not been disabled
# disable_communication is a bool column, so ~ selects the False rows.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning or risk
# writing to a temporary slice.
frame = frame[~frame['disable_communication']].copy()

In [8]:
#convert epoch seconds to datetime
# pd.to_datetime(unit='s') is vectorized and always interprets the epoch as UTC.
# datetime.fromtimestamp converted into the machine's local timezone instead, so
# the derived day counts and launch month/year depended on where the notebook
# was run and could shift across DST changes.
frame['state_changed_at'] = pd.to_datetime(frame['state_changed_at'], unit='s')
frame['launched_at'] = pd.to_datetime(frame['launched_at'], unit='s')

#create new vector for time taken to reach the final state (a timedelta at this
#point; it is reduced to whole days in a later cell)
frame['days_to_state_change'] = frame['state_changed_at'] - frame['launched_at']
frame = frame.drop(columns = ['fx_rate', 'disable_communication', 'state_changed_at'])

In [9]:
# reduce the launch-to-state-change timedelta to its whole-day component
elapsed = frame['days_to_state_change']
frame['days_to_state_change'] = elapsed.dt.days

In [10]:
# re-check dtypes and non-null counts after dropping columns and deriving
# days_to_state_change
frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206989 entries, 0 to 207620
Data columns (total 6 columns):
blurb                   206981 non-null object
country                 206989 non-null object
goal                    206989 non-null float64
launched_at             206989 non-null datetime64[ns]
state                   206989 non-null object
days_to_state_change    206989 non-null int64
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 11.1+ MB


In [13]:
#create month_launched and year_launched columns
# launched_at is a datetime64 column, so the vectorized .dt accessor extracts the
# calendar fields directly instead of calling a Python lambda once per row
frame['month_launched'] = frame['launched_at'].dt.month
frame['year_launched'] = frame['launched_at'].dt.year
# the full timestamp is no longer needed once month and year are extracted
frame = frame.drop(columns = ['launched_at'])

In [14]:
# preview the first rows, now including month_launched and year_launched
frame.head()

Unnamed: 0,blurb,country,goal,state,days_to_state_change,month_launched,year_launched
0,"Funding the mixing, mastering, and promotion o...",US,5000.0,successful,30,3,2016
1,We follow the challenges and achievements of g...,US,24042.0,successful,26,6,2017
2,Task No.1 is inspired by the history and expre...,GB,4822.36808,failed,60,1,2013
3,MAJOR KEY ALERT - Future Heroes is a Denver ra...,US,500.0,successful,19,2,2016
4,We're traveling to Rhode Island to film Mako a...,US,2500.0,successful,30,6,2016


In [17]:
# discard every row that has a missing value in any remaining column
frame = frame.dropna(axis=0, how='any')