In [1]:
import pandas as pd 
import numpy as np 
import json 
import os

from tqdm import tqdm 

In [2]:
# Notebook display settings: cap how many rows/columns pandas renders for a
# frame, and have NumPy print small floats in fixed-point rather than
# scientific notation.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 50
np.set_printoptions(suppress=True)

In [3]:
# Directory holding one Kickstarter export snapshot; every CSV inside it is
# processed below. Other snapshots that have been used with this notebook:
#   /Users/benji/Downloads/Kickstarter_2021-04-15T03_20_08_451Z/
#   /Users/benji/Downloads/Kickstarter_2021-03-18T03_20_11_507Z/
file_loc = "/Users/benji/Downloads/Kickstarter_2021-02-11T03_20_07_976Z/"

# os.listdir returns every directory entry, including hidden/system files
# (e.g. macOS ".DS_Store") that pd.read_csv cannot parse, so keep only the
# CSV files. Sorting keeps the processing order deterministic.
files = sorted(
    entry for entry in os.listdir(file_loc) if entry.lower().endswith(".csv")
)

In [4]:
def _expand_json_column(column, keys, prefix):
    """
    Turn a column of JSON-object strings into one column per requested key.

    Parameters
    ----------
    column : pd.Series
        Cells holding JSON object text. Cells that are not strings (e.g. NaN)
        or that do not decode to a JSON object are treated as empty objects.
    keys : list of str
        Keys to pull out of every parsed object.
    prefix : str
        Text prepended to each key to form the output column name.

    Returns
    -------
    pd.DataFrame
        Indexed like `column`, with columns `prefix + key` in `keys` order.
        A key that is absent from a row's object yields NaN for that row, so
        every requested column always exists (even for an empty input).
    """
    records = []
    for cell in column:
        parsed = json.loads(cell) if isinstance(cell, str) else None
        records.append(parsed if isinstance(parsed, dict) else {})
    # Build the columns straight from the parsed dicts. This avoids
    # `.apply(pd.Series)`, which constructs one Series per row and raises a
    # KeyError on selection when no row happens to contain a requested key.
    return pd.DataFrame(
        {prefix + key: [rec.get(key, np.nan) for rec in records] for key in keys},
        index=column.index,
    )


def process_df(path_to_file):
    """
    Load one Kickstarter export CSV and reduce it to the modelling columns.

    Steps:
      1. Read the CSV and keep only finished projects
         (`state` is 'successful' or 'failed').
      2. Add derived features `usd_goal`, `available_time` and `blurb_len`.
      3. Expand the JSON text in `category` and `location` into
         `cat_*` / `loc_*` columns.

    Parameters
    ----------
    path_to_file : str or file-like
        Anything accepted by `pd.read_csv`.

    Returns
    -------
    pd.DataFrame
        The kept project columns followed by `cat_id`, `cat_position`,
        `cat_parent_id`, `cat_color`, `loc_id`, `loc_type`, `loc_state`.
        Row labels are those of the rows kept from the input file.

    Note: every `category` / `location` cell is decoded with `json.loads`,
    which is the remaining per-row cost of this function.
    """
    # load df
    raw = pd.read_csv(path_to_file)
    # filter rows to finished projects
    finished = raw[raw['state'].isin(['successful', 'failed'])]
    # add some features
    finished = finished.assign(
        # NOTE(review): assumes `fx_rate` converts `goal` to USD - confirm
        # against the dataset schema.
        usd_goal=lambda x: x['goal'] * x['fx_rate'],
        # TODO: confirm how `deadline` / `launched_at` are encoded; the
        # difference is in whatever unit those columns use.
        available_time=lambda x: x['deadline'] - x['launched_at'],
        blurb_len=lambda x: x['blurb'].str.len(),
    )
    # split up category
    cat = _expand_json_column(
        finished['category'], ["id", "position", "parent_id", "color"], "cat_"
    )
    # split up location (missing locations become all-NaN loc_* values)
    loc = _expand_json_column(
        finished['location'], ["id", "type", "state"], "loc_"
    )
    ####
    df_cols_to_keep = [
        "state", "usd_goal", "available_time", "blurb_len", "launched_at", "deadline", "blurb",
        "name", "currency", "country", "is_starred", "is_starrable", "spotlight", "staff_pick", "photo"
    ]
    # All three pieces share `finished`'s index, so a column-wise concat
    # lines the rows up one-to-one.
    return pd.concat((finished[df_cols_to_keep], cat, loc), axis=1)

In [None]:
# Run process_df over every export file, with a tqdm progress bar.
# os.path.join builds the path correctly whether or not `file_loc` ends with
# a path separator.
dfs = [process_df(os.path.join(file_loc, f)) for f in tqdm(files)]

In [None]:
# Stack the per-file frames into one. Each frame carries the row labels of its
# own source file, so the labels repeat across files; ignore_index=True gives
# the combined frame a fresh, unique 0..n-1 index instead.
dfs = pd.concat(dfs, ignore_index=True)

In [None]:
# Sanity check: display the (rows, columns) of `dfs`.
dfs.shape

In [None]:
# Write the frame held in `dfs` to a CSV file; the relative path resolves
# against the notebook's current working directory.
output_csv = "2021_02_processed_df.csv"
dfs.to_csv(output_csv)