In [5]:
import glob
import pandas as pd
import numpy as np
import pickle
import ast
import json
from datetime import datetime

In [6]:
# Grab all files witin data folder for each month in 2019
extension = 'csv'
directories = ['2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01',
              '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01']

all_files_lists = []
for folder in directories:
    file_names = [i for i in glob.glob('data/Kickstarter_{}/*.{}'.format(folder, extension))]
    all_files_lists.append(file_names)

In [7]:
# Flatten the resulting list to create a list of all file paths
all_files = [item for sublist in all_files_lists for item in sublist]

# Dataframe concatenation and loading

In [8]:
def load_data_into_dataframe(file_names):
    """
    Takes in a list of file names and creates dataframes from each element. Then each dataframe is filtered and
    cleaned, then all dataframes are concatenated along columns.

    :param file_names (list): List of file names as strings
    :return: Pandas dataframe
    """

    # Set dataframe filters and parameters
    columns_to_keep = ['backers_count', 'creator', 'category', 'converted_pledged_amount', 'country',
        'created_at', 'deadline', 'goal', 'id', 'launched_at', 'name', 'pledged', 'source_url', 'spotlight',
        'staff_pick', 'state', 'state_changed_at', 'urls', 'usd_pledged']
    country = 'US'
    
    data_frames = []
    for file in file_names:
        df = pd.read_csv(file)

        # Filter based on specified criteria
        df = df.loc[:, columns_to_keep]
        df = df[df['country'] == country]

        # Convert dates from unix to datetime
        date_cols = ['created_at', 'deadline', 'launched_at', 'state_changed_at']
        for col in date_cols:
            df[col] = pd.to_datetime(df[col], origin='unix', unit='s')

        # Clean rows
        df.drop_duplicates(keep=False, inplace=True)

        data_frames.append(df)
        
    combined_df = pd.concat(data_frames)
    combined_df.reset_index(inplace=True, drop=True)
    return combined_df

In [9]:
df = load_data_into_dataframe(all_files)

In [10]:
print(df.shape)
df.head()

(1776735, 19)


Unnamed: 0,backers_count,creator,category,converted_pledged_amount,country,created_at,deadline,goal,id,launched_at,name,pledged,source_url,spotlight,staff_pick,state,state_changed_at,urls,usd_pledged
0,1,"{""id"":983022919,""name"":""Rhys Kucharski"",""is_re...","{""id"":356,""name"":""Woodworking"",""slug"":""crafts/...",240,US,2016-09-11 22:05:51,2016-12-05 19:42:23,5000.0,1504859185,2016-11-05 18:42:23,Industrial Bamboo Table,240.0,https://www.kickstarter.com/discover/categorie...,False,False,failed,2016-12-05 19:42:23,"{""web"":{""project"":""https://www.kickstarter.com...",240.0
1,243,"{""id"":372111659,""name"":""Kevin, Domingo & Suzie...","{""id"":311,""name"":""Food Trucks"",""slug"":""food/fo...",41738,US,2015-03-24 17:41:14,2015-05-15 16:22:34,35000.0,1228074690,2015-04-15 16:22:34,The Barmobile: Boston's Mobile Cocktail Cateri...,41738.0,https://www.kickstarter.com/discover/categorie...,True,True,successful,2015-05-15 16:22:34,"{""web"":{""project"":""https://www.kickstarter.com...",41738.0
2,27,"{""id"":1336552462,""name"":""Christine Almstrom"",""...","{""id"":46,""name"":""Children's Books"",""slug"":""pub...",3115,US,2017-05-18 12:30:32,2017-07-16 15:03:03,3000.0,330962986,2017-06-01 15:03:03,Grandfather Thunder & The Night Horses,3115.0,https://www.kickstarter.com/discover/categorie...,True,False,successful,2017-07-16 15:03:04,"{""web"":{""project"":""https://www.kickstarter.com...",3115.0
3,3,"{""id"":1070386695,""name"":""Jeremy & Tiffany Park...","{""id"":307,""name"":""Drinks"",""slug"":""food/drinks""...",61,US,2017-06-11 02:02:05,2017-09-09 19:17:10,250000.0,550544673,2017-08-10 19:17:10,Parker's Cup Coffee & Tea,61.0,https://www.kickstarter.com/discover/categorie...,False,False,failed,2017-09-09 19:17:10,"{""web"":{""project"":""https://www.kickstarter.com...",61.0
4,1,"{""id"":87714877,""name"":""Robert"",""is_registered""...","{""id"":241,""name"":""Metal"",""slug"":""music/metal"",...",5,US,2015-08-18 02:36:53,2015-10-18 21:17:17,125.0,49016224,2015-08-19 21:17:17,Disorderly Conduct's First Album,5.0,https://www.kickstarter.com/discover/categorie...,False,False,failed,2015-10-18 21:17:17,"{""web"":{""project"":""https://www.kickstarter.com...",5.0


## Include only completed projects

In [1]:
df = df.loc[df['state'].isin(['successful', 'failed'])]

NameError: name 'df' is not defined

## Cleaning columns read in as JSON

In [11]:
# Convert all 'category' and 'urls' JSON columns to dictionaries
for col in ['category', 'urls']:
    df[col] = df[col].apply(ast.literal_eval)

In [12]:
def split_dict_columns(row):
    """
    Takes in a dictionary and returns the value at specific keys. this function deals with extracting
    the category names and project urls from converted JSON columns

    :param row (dict): A standard dictionary
    :return: Any object
    """

    if 'slug' in row.keys():
        return row['slug']
    elif 'web' in row.keys():
        return row['web']['project']

In [13]:
# Create a new column for categories
category_lists = df['category'].apply(split_dict_columns).map(lambda x: x.split('/'))
df['new_category'] = category_lists.map(lambda x: x[0])
df['new_category']

0              crafts
1                food
2          publishing
3                food
4               music
              ...    
1776730        design
1776731         dance
1776732          food
1776733    publishing
1776734    publishing
Name: new_category, Length: 1776735, dtype: object

In [14]:
# Create a new column for subcategories
df['subcategory'] = category_lists.map(lambda x: x[-1])
df['subcategory']

0               woodworking
1               food trucks
2          children's books
3                    drinks
4                     metal
                 ...       
1776730      product design
1776731               dance
1776732              drinks
1776733          nonfiction
1776734         periodicals
Name: subcategory, Length: 1776735, dtype: object

In [15]:
# Create a new column for project urls
df['project_url'] = df['urls'].apply(split_dict_columns)
df['project_url']

0          https://www.kickstarter.com/projects/983022919...
1          https://www.kickstarter.com/projects/372111659...
2          https://www.kickstarter.com/projects/133655246...
3          https://www.kickstarter.com/projects/107038669...
4          https://www.kickstarter.com/projects/87714877/...
                                 ...                        
1776730    https://www.kickstarter.com/projects/leathers/...
1776731    https://www.kickstarter.com/projects/126176310...
1776732    https://www.kickstarter.com/projects/730281381...
1776733    https://www.kickstarter.com/projects/louisvill...
1776734    https://www.kickstarter.com/projects/175269081...
Name: project_url, Length: 1776735, dtype: object

In [16]:
# Drop extra columns
df.drop(['creator', 'state_changed_at', 'id', 'pledged', 'category', 'urls', 'converted_pledged_amount', 'country', 'source_url'], axis=1, inplace=True)

## Filter dataframe on the 3 categories used for analysis

In [17]:
# Filter dataframe to include the 3 categories used for analysis
categories = ['games', 'design', 'technology']
df = df.loc[df['new_category'].isin(categories)]
df.rename(columns={'new_category': 'category'}, inplace=True)

In [18]:
df.shape

(324391, 13)

In [19]:
df.reset_index(drop=True, inplace=True)

## Create time-based columns

In [20]:
# "Project lifetime" --> creation date to funding deadline (rounded to the nearest day)
# df['project_life'] = df['deadline'].sub(df['created_at'], axis=0)
df['project_life'] = (df['deadline'] - df['created_at']).dt.days
df['project_life'] = round(df['project_life'], 0)
# df['project_life'] = df['project_life'].dt.round('d').dt.days

# Length of the campaign (rounded to the nearest day)
# df['campaign_length'] = df['deadline'].sub(df['launched_at'], axis=0)
df['campaign_length'] = (df['deadline'] - df['launched_at']).dt.days
df['campaign_length'] = round(df['campaign_length'], 0)
# df['campaign_length'] = df['campaign_length'].dt.round('d').dt.days

# Year and month of launch
df['launch_month'] = pd.to_datetime(df['launched_at']).dt.to_period('M')

# year and month of deadline
df['deadline_month'] = pd.to_datetime(df['deadline']).dt.to_period('M')

## Aggregate monthly data to get averages and totals

In [21]:
df.head(5)

Unnamed: 0,backers_count,created_at,deadline,goal,launched_at,name,spotlight,staff_pick,state,usd_pledged,category,subcategory,project_url,project_life,campaign_length,launch_month,deadline_month
0,8,2015-10-21 00:29:19,2015-12-04 15:05:15,18000.0,2015-11-04 15:05:15,Sound Affections - old fashion Greeting Cards ...,True,False,successful,19120.0,technology,sound,https://www.kickstarter.com/projects/206159179...,44,30,2015-11,2015-12
1,246,2018-12-12 13:46:33,2019-01-14 01:30:02,100.0,2018-12-15 01:30:02,Dragon Scales: A Tabletop RPG Game of Chance™,True,False,successful,11940.0,games,tabletop games,https://www.kickstarter.com/projects/308858186...,32,30,2018-12,2019-01
2,457,2017-03-25 22:41:02,2017-05-28 20:59:00,10000.0,2017-04-28 18:00:23,Pokitto - easy-to-learn and program Gaming Gad...,True,True,successful,27235.0,technology,diy electronics,https://www.kickstarter.com/projects/175428417...,63,30,2017-04,2017-05
3,171,2018-09-07 19:37:50,2018-10-15 01:00:00,6000.0,2018-09-11 22:38:15,Kitten Stuff Done: Playfully Productive,True,False,successful,6973.0,games,playing cards,https://www.kickstarter.com/projects/victoragr...,37,33,2018-09,2018-10
4,3,2019-01-15 16:47:50,2019-03-17 19:19:16,150.0,2019-01-16 20:19:16,Wooden Blocks for Windows,False,False,live,4.0,games,video games,https://www.kickstarter.com/projects/101699361...,61,59,2019-01,2019-03


In [22]:
# Group the data by name and find the latest data (maximum pledge amounts)
max_amounts = df.groupby('name', as_index=False).usd_pledged.max()

In [23]:
# Inner join the data to the original dataset to filter for unique projects
kickstarter_df = pd.merge(df, max_amounts, how='inner', left_on=['name', 'usd_pledged'], right_on=['name', 'usd_pledged'])
kickstarter_df.drop_duplicates('name', inplace=True)

In [24]:
print(kickstarter_df.shape)
kickstarter_df.head(5)

(30967, 17)


Unnamed: 0,backers_count,created_at,deadline,goal,launched_at,name,spotlight,staff_pick,state,usd_pledged,category,subcategory,project_url,project_life,campaign_length,launch_month,deadline_month
0,8,2015-10-21 00:29:19,2015-12-04 15:05:15,18000.0,2015-11-04 15:05:15,Sound Affections - old fashion Greeting Cards ...,True,False,successful,19120.0,technology,sound,https://www.kickstarter.com/projects/206159179...,44,30,2015-11,2015-12
12,246,2018-12-12 13:46:33,2019-01-14 01:30:02,100.0,2018-12-15 01:30:02,Dragon Scales: A Tabletop RPG Game of Chance™,True,False,successful,11940.0,games,tabletop games,https://www.kickstarter.com/projects/308858186...,32,30,2018-12,2019-01
27,457,2017-03-25 22:41:02,2017-05-28 20:59:00,10000.0,2017-04-28 18:00:23,Pokitto - easy-to-learn and program Gaming Gad...,True,True,successful,27235.0,technology,diy electronics,https://www.kickstarter.com/projects/175428417...,63,30,2017-04,2017-05
44,171,2018-09-07 19:37:50,2018-10-15 01:00:00,6000.0,2018-09-11 22:38:15,Kitten Stuff Done: Playfully Productive,True,False,successful,6973.0,games,playing cards,https://www.kickstarter.com/projects/victoragr...,37,33,2018-09,2018-10
59,3,2019-01-15 16:47:50,2019-03-17 19:19:16,150.0,2019-01-16 20:19:16,Wooden Blocks for Windows,False,False,live,4.0,games,video games,https://www.kickstarter.com/projects/101699361...,61,59,2019-01,2019-03


In [25]:
# Calculate daily backers and funding
kickstarter_df['daily_amount_pledged'] = round(kickstarter_df['usd_pledged'] / kickstarter_df['campaign_length'], 0)
kickstarter_df['daily_backers'] = round(kickstarter_df['backers_count'] / kickstarter_df['campaign_length'], 0)

In [26]:
print(kickstarter_df.daily_amount_pledged.min(), kickstarter_df.daily_amount_pledged.max())
print(kickstarter_df.daily_backers.min(), kickstarter_df.daily_backers.max())

0.0 332646.0
0.0 3049.0


In [27]:
kickstarter_df.reset_index(drop=True, inplace=True)
kickstarter_df.head()

Unnamed: 0,backers_count,created_at,deadline,goal,launched_at,name,spotlight,staff_pick,state,usd_pledged,category,subcategory,project_url,project_life,campaign_length,launch_month,deadline_month,daily_amount_pledged,daily_backers
0,8,2015-10-21 00:29:19,2015-12-04 15:05:15,18000.0,2015-11-04 15:05:15,Sound Affections - old fashion Greeting Cards ...,True,False,successful,19120.0,technology,sound,https://www.kickstarter.com/projects/206159179...,44,30,2015-11,2015-12,637.0,0.0
1,246,2018-12-12 13:46:33,2019-01-14 01:30:02,100.0,2018-12-15 01:30:02,Dragon Scales: A Tabletop RPG Game of Chance™,True,False,successful,11940.0,games,tabletop games,https://www.kickstarter.com/projects/308858186...,32,30,2018-12,2019-01,398.0,8.0
2,457,2017-03-25 22:41:02,2017-05-28 20:59:00,10000.0,2017-04-28 18:00:23,Pokitto - easy-to-learn and program Gaming Gad...,True,True,successful,27235.0,technology,diy electronics,https://www.kickstarter.com/projects/175428417...,63,30,2017-04,2017-05,908.0,15.0
3,171,2018-09-07 19:37:50,2018-10-15 01:00:00,6000.0,2018-09-11 22:38:15,Kitten Stuff Done: Playfully Productive,True,False,successful,6973.0,games,playing cards,https://www.kickstarter.com/projects/victoragr...,37,33,2018-09,2018-10,211.0,5.0
4,3,2019-01-15 16:47:50,2019-03-17 19:19:16,150.0,2019-01-16 20:19:16,Wooden Blocks for Windows,False,False,live,4.0,games,video games,https://www.kickstarter.com/projects/101699361...,61,59,2019-01,2019-03,0.0,0.0


In [28]:
# Save dataframe to pickle file
with open('kickstarter_data.pickle', 'wb') as outfile:
    pickle.dump(kickstarter_df, outfile)

## Save DataFrame to pickle File

In [37]:
# Save dataframe to pickle file
with open('kickstarter_data.pickle', 'wb') as outfile:
    pickle.dump(kickstarter_df, outfile)