In [2]:
import duckdb

# Raw data

In [35]:
# Scratch connection backed by an in-memory database (nothing is written to
# disk); get_df_from_sql runs its queries through it.
conn = duckdb.connect(database=':memory:')

In [36]:
# Each raw CSV is named
#   upworthy-archive-<desc>-packages<date>.csv
# e.g. upworthy-archive-confirmatory-packages-03.12.2020.csv, where <desc> is
# one of 'confirmatory', 'exploratory', 'holdout' or 'undeployed'.
# The <date> part is stored together with its leading separator because the
# 'undeployed' entry uses '.' (and a different date) where the others use '-'.
filename_descs_and_dates = [
    (desc, '-03.12.2020') for desc in ('confirmatory', 'exploratory', 'holdout')
] + [('undeployed', '.01.12.2021')]
def get_filename(desc, date, data_dir='datasets'):
    """Build the relative path of one Upworthy archive CSV.

    Args:
        desc: Dataset name embedded in the filename (e.g. 'confirmatory').
        date: Date suffix *including* its leading separator
            (e.g. '-03.12.2020' or '.01.12.2021').
        data_dir: Directory that holds the CSV files. Defaults to 'datasets',
            the location that was previously hard-coded, so existing calls
            return exactly the same path as before.

    Returns:
        The path as a string:
        '<data_dir>/upworthy-archive-<desc>-packages<date>.csv'.
    """
    return f'{data_dir}/upworthy-archive-{desc}-packages{date}.csv'

def get_df_from_sql(sql, connection=None):
    """Execute a SQL statement and return the result of its ``fetchdf()``.

    Args:
        sql: SQL text to execute.
        connection: Object exposing ``execute(sql)`` whose result provides
            ``fetchdf()``. When omitted, the notebook-level ``conn`` is used,
            which is the behaviour existing one-argument calls rely on.
            Passing a connection lets the same helper query other databases.

    Returns:
        Whatever ``fetchdf()`` returns for the executed statement.
    """
    if connection is None:
        # Looked up at call time so the helper follows the current global.
        connection = conn
    return connection.execute(sql).fetchdf()

In [37]:
def get_file_details(file_desc, file_date):
    """Load one raw CSV and report its row count and columns.

    The file located by ``get_filename(file_desc, file_date)`` is read in full
    through ``get_df_from_sql``.

    Returns:
        A ``(row_count, columns)`` tuple, where ``columns`` is the loaded
        frame's ``columns`` attribute.
    """
    csv_path = get_filename(file_desc, file_date)
    select_all_sql = f"""
    SELECT
        *
    FROM read_csv_auto('{csv_path}');                             
    """
    frame = get_df_from_sql(select_all_sql)
    row_count = frame.shape[0]
    return row_count, frame.columns

# Report the row count and column names of every raw file, one file at a time.
for file_desc, file_date in filename_descs_and_dates:
    print('File: ' + file_desc)
    row_count_and_columns = get_file_details(file_desc, file_date)
    print(row_count_and_columns)

File: confirmatory
(105551, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: exploratory
(22666, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: holdout
(22600, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: undeployed
(78232, Index(['_id', 'created_at', 'updated_at', 'clickability_test_id', 'excerpt

In [43]:
# Expected size of the combined table: the per-file row counts printed above.
sum((
    105551,  # confirmatory
    22666,   # exploratory
    22600,   # holdout
    78232,   # undeployed
))

229049

# Create a single table in a persisted DuckDB database file

In [39]:
# Resolve the four source CSV paths up front so the SQL below stays readable.
confirmatory_csv = get_filename('confirmatory', '-03.12.2020')
exploratory_csv = get_filename('exploratory', '-03.12.2020')
holdout_csv = get_filename('holdout', '-03.12.2020')
undeployed_csv = get_filename('undeployed', '.01.12.2021')

# Stack all four files into one `exps` table and tag every row with the file it
# came from (`source_type`). The first three files expose their id as
# `column00`; the undeployed file calls it `_id` and is padded with a NULL
# `test_week` (presumably because that file has no such column -- confirm).
# NOTE(review): plain UNION ALL lines columns up by position, so this relies on
# all four files sharing the same column order.
create_table_sql = f"""
    CREATE TABLE exps AS
    SELECT column00 as id, * EXCLUDE (column00), 'confirmatory' as source_type FROM '{confirmatory_csv}'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'exploratory' as source_type FROM '{exploratory_csv}'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'holdout' as source_type FROM '{holdout_csv}'
    UNION ALL
    SELECT _id as id, * EXCLUDE (_id),  NULL as test_week, 'undeployed' as source_type FROM '{undeployed_csv}'
"""
print(create_table_sql)


    CREATE TABLE exps AS
    SELECT column00 as id, * EXCLUDE (column00), 'confirmatory' as source_type FROM 'datasets/upworthy-archive-confirmatory-packages-03.12.2020.csv'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'exploratory' as source_type FROM 'datasets/upworthy-archive-exploratory-packages-03.12.2020.csv'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'holdout' as source_type FROM 'datasets/upworthy-archive-holdout-packages-03.12.2020.csv'
    UNION ALL
    SELECT _id as id, * EXCLUDE (_id),  NULL as test_week, 'undeployed' as source_type FROM 'datasets/upworthy-archive-undeployed-packages.01.12.2021.csv'



In [40]:
# Open (or create) the on-disk database that will hold the combined table.
file_conn = duckdb.connect('experiments.duckdb')
# The database file outlives the kernel, so a bare CREATE TABLE fails on every
# re-run because `exps` already exists. The table is rebuilt entirely from the
# CSV files, so dropping any previous copy first keeps this cell re-runnable.
file_conn.execute('DROP TABLE IF EXISTS exps')
file_conn.execute(create_table_sql)

<_duckdb.DuckDBPyConnection at 0x1176388f0>

In [44]:
# Read the persisted table back in full and show how many rows it holds.
exps_result = file_conn.execute('FROM exps')
exps_df = exps_result.fetchdf()
exps_df.shape[0]

229049

In [46]:
# Preview the first three rows of the combined table.
exps_df.head(3)

Unnamed: 0,id,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week,source_type
0,11,2014-11-20 11:33:26.475,2016-04-02 16:25:54.046,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,"Let’s See … Hire Cops, Pay Teachers, Buy Books...",<p>Iff you start with the basic fact that inno...,let-s-see-hire-cops-pay-teachers-buy-books-for...,546dce659ad54ec65b000041,3118,8,0.1,False,False,,,201446,confirmatory
1,12,2014-11-20 15:00:01.032,2016-04-02 16:25:54.128,546e01d626714c6c4400004e,Things that matter. Pass 'em on.,People Sent This Lesbian Questions And Her Rai...,<p>I'll be honest. I've wondered about 7.</p>,people-sent-this-lesbian-questions-and-her-rai...,546d1b4bfd3617f091000041,4587,130,55.8,False,False,,,201446,confirmatory
2,13,2014-11-20 11:33:51.973,2016-04-02 16:25:54.069,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,$3 Million Is What It Takes For A State To Leg...,<p>Iff you start with the basic fact that inno...,3-million-is-what-it-takes-for-a-state-to-lega...,546dce659ad54ec65b000041,3017,19,26.9,False,False,,,201446,confirmatory


In [47]:
# Rows per source file; dropna=False so untagged rows would show up as NaN.
source_type_column = exps_df['source_type']
source_type_column.value_counts(dropna=False)

source_type
confirmatory    105551
undeployed       78232
exploratory      22666
holdout          22600
Name: count, dtype: int64

In [48]:
# Show every field of the first row as a vertical listing.
exps_df.iloc[0, :]

id                                                                     11
created_at                                     2014-11-20 11:33:26.475000
updated_at                                     2016-04-02 16:25:54.046000
clickability_test_id                             546dd17e26714c82cc00001c
excerpt                                  Things that matter. Pass 'em on.
headline                Let’s See … Hire Cops, Pay Teachers, Buy Books...
lede                    <p>Iff you start with the basic fact that inno...
slug                    let-s-see-hire-cops-pay-teachers-buy-books-for...
eyecatcher_id                                    546dce659ad54ec65b000041
impressions                                                          3118
clicks                                                                  8
significance                                                          0.1
first_place                                                         False
winner                                