In [1]:
import duckdb

# Raw data

In [2]:
local_conn = duckdb.connect()

In [3]:
# filenames are of the format upworthy-archive-confirmatory-packages-03.12.2020.csv,
# with 'confirmatory', 'exploratory', 'holdout', and 'undeployed'
filename_descs_and_dates = [('confirmatory','-03.12.2020'),
                            ('exploratory','-03.12.2020'),
                            ('holdout','-03.12.2020'),
                            ('undeployed','.01.12.2021')]
def get_filename(desc, date):
    return f'datasets/upworthy-archive-{desc}-packages{date}.csv'

def get_df_from_sql(conn, sql):
    return conn.execute(sql).fetchdf()

In [4]:
def get_file_details(file_desc, file_date):
    df = get_df_from_sql(local_conn, 
    f"""
    SELECT
        *
    FROM read_csv_auto('{get_filename(file_desc, file_date)}');                             
    """)
    
    return len(df), df.columns

for file_desc, file_date in filename_descs_and_dates:
    print(f'File: {file_desc}')
    print(get_file_details(file_desc, file_date))

File: confirmatory
(105551, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: exploratory
(22666, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: holdout
(22600, Index(['column00', 'created_at', 'updated_at', 'clickability_test_id',
       'excerpt', 'headline', 'lede', 'slug', 'eyecatcher_id', 'impressions',
       'clicks', 'significance', 'first_place', 'winner', 'share_text',
       'square', 'test_week'],
      dtype='object'))
File: undeployed
(78232, Index(['_id', 'created_at', 'updated_at', 'clickability_test_id', 'excerpt

In [5]:
105551 + 22666 + 22600 + 78232

229049

# Create a single table in a persisted duckdb file

In [6]:
create_table_sql = f"""
    CREATE OR REPLACE TABLE exps AS
    SELECT column00 as id, * EXCLUDE (column00), 'confirmatory' as source_type FROM '{get_filename('confirmatory', '-03.12.2020')}'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'exploratory' as source_type FROM '{get_filename('exploratory', '-03.12.2020')}'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'holdout' as source_type FROM '{get_filename('holdout', '-03.12.2020')}'
    UNION ALL
    SELECT _id as id, * EXCLUDE (_id),  NULL as test_week, 'undeployed' as source_type FROM '{get_filename('undeployed', '.01.12.2021')}'
"""
print(create_table_sql)


    CREATE OR REPLACE TABLE exps AS
    SELECT column00 as id, * EXCLUDE (column00), 'confirmatory' as source_type FROM 'datasets/upworthy-archive-confirmatory-packages-03.12.2020.csv'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'exploratory' as source_type FROM 'datasets/upworthy-archive-exploratory-packages-03.12.2020.csv'
    UNION ALL
    SELECT column00 as id, * EXCLUDE (column00), 'holdout' as source_type FROM 'datasets/upworthy-archive-holdout-packages-03.12.2020.csv'
    UNION ALL
    SELECT _id as id, * EXCLUDE (_id),  NULL as test_week, 'undeployed' as source_type FROM 'datasets/upworthy-archive-undeployed-packages.01.12.2021.csv'



In [7]:
file_conn = duckdb.connect('experiments.duckdb')
file_conn.execute(create_table_sql)

<_duckdb.DuckDBPyConnection at 0x1100346f0>

In [9]:
exps_df = get_df_from_sql(file_conn, 'FROM exps')
len(exps_df)

229049

In [10]:
exps_df[:3]

Unnamed: 0,id,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week,source_type
0,11,2014-11-20 11:33:26.475,2016-04-02 16:25:54.046,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,"Let’s See … Hire Cops, Pay Teachers, Buy Books...",<p>Iff you start with the basic fact that inno...,let-s-see-hire-cops-pay-teachers-buy-books-for...,546dce659ad54ec65b000041,3118,8,0.1,False,False,,,201446,confirmatory
1,12,2014-11-20 15:00:01.032,2016-04-02 16:25:54.128,546e01d626714c6c4400004e,Things that matter. Pass 'em on.,People Sent This Lesbian Questions And Her Rai...,<p>I'll be honest. I've wondered about 7.</p>,people-sent-this-lesbian-questions-and-her-rai...,546d1b4bfd3617f091000041,4587,130,55.8,False,False,,,201446,confirmatory
2,13,2014-11-20 11:33:51.973,2016-04-02 16:25:54.069,546dd17e26714c82cc00001c,Things that matter. Pass 'em on.,$3 Million Is What It Takes For A State To Leg...,<p>Iff you start with the basic fact that inno...,3-million-is-what-it-takes-for-a-state-to-lega...,546dce659ad54ec65b000041,3017,19,26.9,False,False,,,201446,confirmatory


In [11]:
exps_df['source_type'].value_counts(dropna=False)

source_type
confirmatory    105551
undeployed       78232
exploratory      22666
holdout          22600
Name: count, dtype: int64

In [12]:
exps_df.iloc[0]

id                                                                     11
created_at                                     2014-11-20 11:33:26.475000
updated_at                                     2016-04-02 16:25:54.046000
clickability_test_id                             546dd17e26714c82cc00001c
excerpt                                  Things that matter. Pass 'em on.
headline                Let’s See … Hire Cops, Pay Teachers, Buy Books...
lede                    <p>Iff you start with the basic fact that inno...
slug                    let-s-see-hire-cops-pay-teachers-buy-books-for...
eyecatcher_id                                    546dce659ad54ec65b000041
impressions                                                          3118
clicks                                                                  8
significance                                                          0.1
first_place                                                         False
winner                                

In [13]:
exps_df['headline'].nunique()

101005

In [18]:
get_df_from_sql(file_conn, 'SELECT COUNT(DISTINCT headline), COUNT(DISTINCT clickability_test_id) FROM exps')

Unnamed: 0,count(DISTINCT headline),count(DISTINCT clickability_test_id)
0,101005,50995


In [21]:
get_df_from_sql(file_conn, 'SELECT COUNT(DISTINCT clickability_test_id), COUNT(*), source_type FROM exps GROUP BY source_type')

Unnamed: 0,count(DISTINCT clickability_test_id),count_star(),source_type
0,4871,22600,holdout
1,18514,78232,undeployed
2,22743,105551,confirmatory
3,4873,22666,exploratory


In [22]:
get_df_from_sql(file_conn, "SELECT * FROM exps WHERE source_type='exploratory' ORDER BY clickability_test_id LIMIT 10")

Unnamed: 0,id,created_at,updated_at,clickability_test_id,excerpt,headline,lede,slug,eyecatcher_id,impressions,clicks,significance,first_place,winner,share_text,square,test_week,source_type
0,4953,2013-02-24 07:10:53,2016-04-02 16:24:07.081,51436061220cb800020001e7,Religion and science can indeed be friends.,Creationism Has Nothing To Do With Christianit...,"<p>In a debate about creationism, one guy got ...",creationism-has-nothing-to-do-with-christianit...,5332ba141fae79f09f002c4f,2551,39,0.8,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,exploratory
1,38929,2013-02-24 07:08:08,2016-04-02 16:26:25.064,51436061220cb800020001e7,"Good show, father.",The One Where A Creationist Picks A Fight And ...,"<p>In a debate about creationism, one guy got ...",the-one-where-a-creationist-picks-a-fight-and-...,5332ba141fae79f09f002c4f,2629,68,100.0,True,True,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,exploratory
2,40172,2013-02-24 07:07:13,2016-04-02 16:26:28.801,51436061220cb800020001e7,Why is he wearing that costume?,Creationism Shouldn't Be Taught In Science Cla...,"<p>In a debate about creationism, one guy got ...",creationism-shouldnt-be-taught-in-science-clas...,5332ba141fae79f09f002c4f,2539,49,11.9,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,exploratory
3,40173,2013-02-24 07:09:18,2016-04-02 16:26:28.804,51436061220cb800020001e7,"Well played, God. Well played.",God Finds Out About Creationism And Sends A Re...,"<p>In a debate about creationism, one guy got ...",god-finds-out-about-creationism-and-sends-a-re...,5332ba141fae79f09f002c4f,2661,63,61.4,False,False,,A_Priest_Ridicules_Creationist_-_YouTube.jpg,201308,exploratory
4,5016,2013-03-17 07:38:21,2016-04-02 16:24:11.525,51436069220cb800020005ae,"Honesty is so refreshing. Also, rude.",The One Where Your Advertisements Talk To You ...,Learn what advertisements say to each other be...,the-one-where-your-advertisements-talk-to-you-...,5332b5961fae79f09f000636,1950,15,0.0,False,False,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,exploratory
5,39352,2013-03-17 07:36:27,2016-04-02 16:26:26.456,51436069220cb800020005ae,It's about time we had some truth in advertising.,"The Advertisements You Read Every Day, Only Naked",Learn what advertisements say to each other be...,the-advertisements-you-read-every-day-only-nak...,5332b5961fae79f09f000636,1986,44,100.0,True,True,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,exploratory
6,39353,2013-03-17 07:37:01,2016-04-02 16:26:26.459,51436069220cb800020005ae,"Finally, some honesty.",Wouldn't It Be Great If Ads Told The Truth?,"Refreshing, horrifying honesty.",wouldnt-it-be-great-if-ads-told-the-truth,5332b5961fae79f09f000636,2003,21,1.1,False,False,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,exploratory
7,39354,2013-03-17 07:37:25,2016-04-02 16:26:26.462,51436069220cb800020005ae,Rude.,What Your Advertisement Say About You Behind C...,"Nice to have a little honesty for once, no?",what-your-advertisement-say-about-you-behind-c...,5332b5961fae79f09f000636,1907,15,0.1,False,False,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,exploratory
8,39355,2013-03-17 07:42:10,2016-04-02 16:26:26.464,51436069220cb800020005ae,"It's about time, ads!",What If Advertisements Just Said What They Mea...,"Nice to have a little honesty for once, no?\n",what-if-advertisements-just-said-what-they-meant,5332b5961fae79f09f000636,1951,16,0.0,False,False,,Screenshot_3_17_13_1_32_AM_MDT.jpg,201311,exploratory
9,39087,2013-02-07 05:41:34,2016-04-02 16:26:25.696,51436069220cb800020005bd,You have so much potential. Why are you wastin...,"Mr. President, I'm Not Mad. I'm Just Disappoin...","<p>The President made a lot of promises, and h...",mr-president-im-not-mad-im-just-disappointed-n...,5332ba141fae79f09f002c47,5363,89,23.0,False,False,,disappointed-obama.jpg,201305,exploratory
