# Metadata dataset and influencer dataset JOIN

This notebook joins the post metadata with the influencer data, computes the engagement rate, and cleans the captions.

In [1]:
import duckdb, os, unicodedata, re

In [2]:
# Open the DuckDB database file and apply the session settings used by this notebook
DB_PATH = "D:/db/meta.duckdb"

con = duckdb.connect(DB_PATH)

# Settings are applied in this order on the shared connection
for session_setting in (
    "PRAGMA threads=2;",
    "SET memory_limit='5GB';",
    "SET preserve_insertion_order=false;",
):
    con.execute(session_setting)

print("\n Set up ready")


 Set up ready


In [3]:
# Set input and output paths
BASE_PATH = "D:/dataset/meta_before_join"
OUT_PATH = "D:/dataset/meta_with_er"
os.makedirs(OUT_PATH, exist_ok=True)


def _sql_quote(path):
    """Return `path` as a SQL string literal, doubling any embedded single quotes."""
    return "'" + str(path).replace("'", "''") + "'"


def join_and_save(year, month):
    """Join one monthly metadata partition with `influencers` and export it as Parquet.

    Reads ``BASE_PATH/year=<year>/month=<month>/data_0.parquet``, LEFT JOINs it
    with the ``influencers`` table on ``username`` and writes the result to the
    same relative location under ``OUT_PATH``.

    The exported rows carry every metadata column plus ``category``,
    ``followers``, ``followees``, ``posts`` and ``engagement_rate``, where
    ``engagement_rate = (like_count + comment_count) / followers * 100``
    rounded to 2 decimals (i.e. expressed in percent). It is NULL when
    ``followers`` is NULL or 0.

    Side effects: creates the output directory, and (re)creates the views
    ``v_meta_before_join`` and ``v_joined`` on the shared connection ``con``;
    after the call both views point at this partition only.

    Args:
        year: Partition year used in the ``year=<year>`` directory name.
        month: Partition month used in the ``month=<month>`` directory name.
    """
    in_file = os.path.join(BASE_PATH, f"year={year}", f"month={month}", "data_0.parquet")
    out_dir = os.path.join(OUT_PATH, f"year={year}", f"month={month}")
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, "data_0.parquet")

    # View on the parquet file. DDL cannot take bound parameters, so the path
    # is embedded as an escaped SQL literal.
    con.execute(f"""
    CREATE OR REPLACE VIEW v_meta_before_join AS
    SELECT * FROM {_sql_quote(in_file)};
    """)

    # JOIN and ER computation.
    # like_count is cast to DOUBLE *before* the addition so the sum is computed
    # in floating point rather than in the columns' integer type.
    con.execute("""
    CREATE OR REPLACE VIEW v_joined AS
    SELECT
        md.*,
        inf.category,
        inf.followers,
        inf.followees,
        inf.posts,
        CASE
            WHEN inf.followers IS NULL OR inf.followers = 0 THEN NULL
            ELSE ROUND(((CAST(md.like_count AS DOUBLE) + md.comment_count) / inf.followers) * 100, 2)
        END AS engagement_rate
    FROM v_meta_before_join AS md
    LEFT JOIN influencers AS inf
        ON md.username = inf.username;
    """)

    # Export
    con.execute(f"""
    COPY v_joined TO {_sql_quote(out_file)} (FORMAT PARQUET);
    """)

    print(f"Saved: {out_file}")


# Process every (year, month) partition from 2012 through 2019.
# Only 2012 (starts at month 2) and 2019 (stops at month 5) are partial years.
first_month_by_year = {2012: 2}
last_month_by_year = {2019: 5}

for year in range(2012, 2020):
    months = range(first_month_by_year.get(year, 1), last_month_by_year.get(year, 12) + 1)
    for month in months:
        join_and_save(year, month)

Saved: D:/dataset/meta_with_er\year=2012\month=2\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=3\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=4\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=5\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=6\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=7\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=8\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=9\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=10\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=11\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2012\month=12\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2013\month=1\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2013\month=2\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2013\month=3\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2013\month=4\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=5\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=6\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=7\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=8\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=9\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=10\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=11\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2017\month=12\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=1\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=2\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=3\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=4\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=5\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=6\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=7\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=8\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=9\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=10\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=11\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2018\month=12\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2019\month=1\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2019\month=2\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2019\month=3\data_0.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Saved: D:/dataset/meta_with_er\year=2019\month=4\data_0.parquet
Saved: D:/dataset/meta_with_er\year=2019\month=5\data_0.parquet


In [4]:
# List the entries of the database catalog (information_schema.tables) after the export loop.
print(con.sql("""SELECT * FROM information_schema.tables""").fetchdf())

   table_catalog table_schema                 table_name  table_type  \
0           meta     features             captions_clean  BASE TABLE   
1           meta     features       img_handcrafted_post  BASE TABLE   
2           meta         main                clean_files  BASE TABLE   
3           meta         main            images_manifest  BASE TABLE   
4           meta         main        images_manifest1718  BASE TABLE   
5           meta         main  images_manifest1718_clean  BASE TABLE   
6           meta         main      images_manifest_clean  BASE TABLE   
7           meta         main                 img_splits  BASE TABLE   
8           meta         main                influencers  BASE TABLE   
9           meta         main                   metadata  BASE TABLE   
10          meta         main               metadata1718  BASE TABLE   
11          meta         main         metadata1718_ready  BASE TABLE   
12          meta         main             train_balanced  BASE T

In [5]:
# v_joined is re-created by every join_and_save call, so here it only covers the
# last (year, month) partition that was processed — this is a single-month row count.
print(con.sql("""SELECT COUNT(*) FROM v_joined"""))

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        43670 │
└──────────────┘



In [6]:
# Remove the per-partition helper view; IF EXISTS keeps this cell re-runnable
# (a plain DROP VIEW raises once the view is already gone).
con.execute("""DROP VIEW IF EXISTS v_joined""")

<_duckdb.DuckDBPyConnection at 0x1bdb76aea70>

In [7]:
# Expose all exported partitions as one view by globbing over the
# year=*/month=* directory layout written by the export loop.
# NOTE(review): later cells filter on a `year` column; presumably it comes from the
# partition directory names or from the files themselves — confirm.
con.execute("""
CREATE OR REPLACE VIEW v_meta_after_join AS
SELECT *
FROM 'D:/dataset/meta_with_er/year=*/month=*/data_0.parquet';
""")

<_duckdb.DuckDBPyConnection at 0x1bdb76aea70>

In [8]:
# Check null and 0 followers, and potential ER errors
# (engagement_rate must be NULL whenever followers is NULL or 0).
print(con.sql("""
SELECT
    COUNT(*) AS total_rows,
    COUNT(CASE WHEN followers IS NULL THEN 1 END) AS followers_null,
    COUNT(CASE WHEN followers = 0 THEN 1 END) AS followers_zero,
    COUNT(CASE WHEN engagement_rate IS NOT NULL AND (followers IS NULL OR followers = 0) THEN 1 END) AS bad_engagement_cases
FROM v_meta_after_join;
"""))

# Check ER distribution.
# engagement_rate is stored in percent (the join multiplies the ratio by 100),
# so every threshold below is expressed in percentage points: 5 means 5%.
# Buckets from 5% upwards are contiguous; under_5_percent also includes the
# zero and negative rows counted separately in zero_er / negative_er.
print(con.sql("""
SELECT
    COUNT(*) AS total_rows,
    COUNT(CASE WHEN engagement_rate < 0 THEN 1 END) AS negative_er,
    COUNT(CASE WHEN engagement_rate = 0 THEN 1 END) AS zero_er,
    COUNT(CASE WHEN engagement_rate > 100 THEN 1 END) AS over_100_percent,
    COUNT(CASE WHEN engagement_rate < 5 THEN 1 END) AS under_5_percent,
    COUNT(CASE WHEN engagement_rate >= 5 AND engagement_rate <= 20 THEN 1 END) AS from_5_to_20,
    COUNT(CASE WHEN engagement_rate > 20 AND engagement_rate <= 40 THEN 1 END) AS from_20_to_40,
    COUNT(CASE WHEN engagement_rate > 40 AND engagement_rate <= 60 THEN 1 END) AS from_40_to_60,
    COUNT(CASE WHEN engagement_rate > 60 AND engagement_rate <= 80 THEN 1 END) AS from_60_to_80,
    COUNT(CASE WHEN engagement_rate > 80 AND engagement_rate <= 100 THEN 1 END) AS from_80_to_100,
    COUNT(CASE WHEN engagement_rate IS NULL THEN 1 END) AS null_er

FROM v_meta_after_join;
"""))


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌────────────┬────────────────┬────────────────┬──────────────────────┐
│ total_rows │ followers_null │ followers_zero │ bad_engagement_cases │
│   int64    │     int64      │     int64      │        int64         │
├────────────┼────────────────┼────────────────┼──────────────────────┤
│    9726198 │         159547 │              0 │                    0 │
└────────────┴────────────────┴────────────────┴──────────────────────┘

┌────────────┬─────────────┬─────────┬──────────────────┬─────────────────┬──────────────┬───────────────┬───────────────┬───────────────┬────────────────┬─────────┐
│ total_rows │ negative_er │ zero_er │ over_100_percent │ under_5_percent │ from_5_to_20 │ from_20_to_40 │ from_40_to_60 │ from_60_to_80 │ from_80_to_100 │ null_er │
│   int64    │    int64    │  int64  │      int64       │      int64      │    int64     │     int64     │     int64     │     int64     │     int64      │  int64  │
├────────────┼─────────────┼─────────┼──────────────────┼────────────

In [9]:
# Sample posts whose engagement rate exceeds 100, i.e. likes + comments
# outnumber the account's follower count.
print(con.sql("""
SELECT username, like_count, comment_count, followers, engagement_rate
FROM v_meta_after_join
WHERE engagement_rate > 100
LIMIT 10
"""))

# Sample posts whose engagement rate is exactly 0.
print(con.sql("""
SELECT username, followers, like_count, comment_count, engagement_rate
FROM v_meta_after_join
WHERE engagement_rate = 0
LIMIT 10
"""))

┌────────────────────┬────────────┬───────────────┬───────────┬─────────────────┐
│      username      │ like_count │ comment_count │ followers │ engagement_rate │
│      varchar       │   int32    │     int32     │   int32   │     double      │
├────────────────────┼────────────┼───────────────┼───────────┼─────────────────┤
│ ohsnapianduncan    │       2834 │             8 │      2708 │          104.95 │
│ amysoub            │       2458 │           161 │      2329 │          112.45 │
│ funnieronline      │      31948 │           277 │     22545 │          142.94 │
│ funnieronline      │      54042 │           581 │     22545 │          242.28 │
│ funnieronline      │      33587 │           279 │     22545 │          150.22 │
│ funnieronline      │      23821 │           165 │     22545 │          106.39 │
│ funnieronline      │      34612 │           295 │     22545 │          154.83 │
│ funnieronline      │      43349 │           556 │     22545 │          194.74 │
│ pennylaneisthe

In [10]:
# Spot-check a single account: its 2012 posts, most-liked first, with the
# inputs of the engagement-rate formula next to the computed value.
print(con.sql("""
SELECT username, like_count, comment_count, followers, engagement_rate
FROM v_meta_after_join
WHERE username = 'chefbenedetti' AND year = 2012
ORDER BY like_count DESC
LIMIT 20
"""))

┌───────────────┬────────────┬───────────────┬───────────┬─────────────────┐
│   username    │ like_count │ comment_count │ followers │ engagement_rate │
│    varchar    │   int32    │     int32     │   int32   │     double      │
├───────────────┼────────────┼───────────────┼───────────┼─────────────────┤
│ chefbenedetti │         44 │            15 │     81765 │            0.07 │
│ chefbenedetti │         40 │             1 │     81765 │            0.05 │
│ chefbenedetti │         34 │             4 │     81765 │            0.05 │
│ chefbenedetti │         33 │             1 │     81765 │            0.04 │
│ chefbenedetti │         32 │             4 │     81765 │            0.04 │
│ chefbenedetti │         31 │             0 │     81765 │            0.04 │
│ chefbenedetti │         30 │             0 │     81765 │            0.04 │
│ chefbenedetti │         30 │             0 │     81765 │            0.04 │
│ chefbenedetti │         29 │             3 │     81765 │            0.04 │

In [11]:
# Rows that have no follower count after the LEFT JOIN (username not matched in
# `influencers`, or matched with a NULL followers value), grouped per file/username.
print(con.sql("""
SELECT filename, username, COUNT(*) AS n
FROM v_meta_after_join
WHERE followers IS NULL
GROUP BY filename, username"""))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────────────────────────────────────────┬──────────────────────┬───────┐
│                     filename                     │       username       │   n   │
│                     varchar                      │       varchar        │ int64 │
├──────────────────────────────────────────────────┼──────────────────────┼───────┤
│ em_ilylonergan-605388545853741369.info           │ em_ilylonergann      │     1 │
│ katieljohnson-618788608676230513.info            │ catherinelou.j       │     1 │
│ katieljohnson-618791707428025799.info            │ catherinelou.j       │     1 │
│ madamnormajean-850041745260428252.info           │ mnj_gang             │     1 │
│ madamnormajean-850112611457448194.info           │ mnj_gang             │     1 │
│ madamnormajean-855628741848611672.info           │ mnj_gang             │     1 │
│ madamnormajean-855633465658325052.info           │ mnj_gang             │     1 │
│ madamnormajean-861017589357230695.info           │ mnj_gang             │ 

In [12]:
# For three sample accounts, count rows with a follower value (COUNT(followers)
# ignores NULLs) versus rows without one.
print(con.sql("""
SELECT 
    username,
    COUNT(followers) AS n_valid,
    COUNT(*) - COUNT(followers) AS n_null
FROM v_meta_after_join
WHERE username = 'matthiasplinke' OR username = 'bayybass' OR username = 'cassandra_duostudios'
GROUP BY username
"""))

┌──────────────────────┬─────────┬────────┐
│       username       │ n_valid │ n_null │
│       varchar        │  int64  │ int64  │
├──────────────────────┼─────────┼────────┤
│ bayybass             │       0 │    298 │
│ cassandra_duostudios │       0 │    300 │
│ matthiasplinke       │       0 │    300 │
└──────────────────────┴─────────┴────────┘



In [13]:
# Number of rows without a follower count ...
print(con.sql("""
SELECT COUNT(*) AS n_null
FROM v_meta_after_join
WHERE followers IS NULL
"""))

# ... compared with the total number of joined rows.
print(con.sql("""
SELECT COUNT(*) AS n_rows
FROM v_meta_after_join
"""))

┌────────┐
│ n_null │
│ int64  │
├────────┤
│ 159547 │
└────────┘

┌─────────┐
│ n_rows  │
│  int64  │
├─────────┤
│ 9726198 │
└─────────┘



In [3]:
# Materialise the joined data as the `metadata` table, keeping only rows that
# have a follower count. CREATE OR REPLACE overwrites any existing `metadata` table.
con.execute("""
CREATE OR REPLACE TABLE metadata AS
SELECT *
FROM v_meta_after_join
WHERE followers IS NOT NULL
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1e8062e4cb0>

In [4]:
# CHECK

# Check null and 0 followers, and potential ER errors
# (engagement_rate must be NULL whenever followers is NULL or 0).
print(con.sql("""
SELECT
    COUNT(*) AS total_rows,
    COUNT(CASE WHEN followers IS NULL THEN 1 END) AS followers_null,
    COUNT(CASE WHEN followers = 0 THEN 1 END) AS followers_zero,
    COUNT(CASE WHEN engagement_rate IS NOT NULL AND (followers IS NULL OR followers = 0) THEN 1 END) AS bad_engagement_cases
FROM metadata;
"""))

# Check ER distribution.
# engagement_rate is stored in percent (the join multiplies the ratio by 100),
# so every threshold below is expressed in percentage points: 5 means 5%.
# Buckets from 5% upwards are contiguous; under_5_percent also includes the
# zero and negative rows counted separately in zero_er / negative_er.
print(con.sql("""
SELECT
    COUNT(*) AS total_rows,
    COUNT(CASE WHEN engagement_rate < 0 THEN 1 END) AS negative_er,
    COUNT(CASE WHEN engagement_rate = 0 THEN 1 END) AS zero_er,
    COUNT(CASE WHEN engagement_rate > 100 THEN 1 END) AS over_100_percent,
    COUNT(CASE WHEN engagement_rate < 5 THEN 1 END) AS under_5_percent,
    COUNT(CASE WHEN engagement_rate >= 5 AND engagement_rate <= 20 THEN 1 END) AS from_5_to_20,
    COUNT(CASE WHEN engagement_rate > 20 AND engagement_rate <= 40 THEN 1 END) AS from_20_to_40,
    COUNT(CASE WHEN engagement_rate > 40 AND engagement_rate <= 60 THEN 1 END) AS from_40_to_60,
    COUNT(CASE WHEN engagement_rate > 60 AND engagement_rate <= 80 THEN 1 END) AS from_60_to_80,
    COUNT(CASE WHEN engagement_rate > 80 AND engagement_rate <= 100 THEN 1 END) AS from_80_to_100,
    COUNT(CASE WHEN engagement_rate IS NULL THEN 1 END) AS null_er

FROM metadata;
"""))


┌────────────┬────────────────┬────────────────┬──────────────────────┐
│ total_rows │ followers_null │ followers_zero │ bad_engagement_cases │
│   int64    │     int64      │     int64      │        int64         │
├────────────┼────────────────┼────────────────┼──────────────────────┤
│    9566651 │              0 │              0 │                    0 │
└────────────┴────────────────┴────────────────┴──────────────────────┘

┌────────────┬─────────────┬─────────┬──────────────────┬─────────────────┬──────────────┬───────────────┬───────────────┬───────────────┬────────────────┬─────────┐
│ total_rows │ negative_er │ zero_er │ over_100_percent │ under_5_percent │ from_5_to_20 │ from_20_to_40 │ from_40_to_60 │ from_60_to_80 │ from_80_to_100 │ null_er │
│   int64    │    int64    │  int64  │      int64       │      int64      │    int64     │     int64     │     int64     │     int64     │     int64      │  int64  │
├────────────┼─────────────┼─────────┼──────────────────┼────────────

In [5]:
# Range of the engagement rate in the filtered `metadata` table.
print(con.sql("""SELECT min(engagement_rate) as min, max(engagement_rate) as max
FROM metadata"""))

┌────────┬─────────┐
│  min   │   max   │
│ double │ double  │
├────────┼─────────┤
│    0.0 │ 1409.59 │
└────────┴─────────┘



In [6]:
# The 20 highest engagement rates ...
print(con.sql("""SELECT engagement_rate
FROM metadata
ORDER BY engagement_rate DESC
LIMIT 20"""))

# ... and the 20 lowest.
print(con.sql("""SELECT engagement_rate
FROM metadata
ORDER BY engagement_rate
LIMIT 20"""))

┌─────────────────┐
│ engagement_rate │
│     double      │
├─────────────────┤
│         1409.59 │
│         1258.49 │
│         1151.95 │
│          1139.1 │
│         1136.78 │
│          953.46 │
│          895.05 │
│          823.29 │
│          818.53 │
│          787.64 │
│          771.36 │
│          759.88 │
│          727.81 │
│           702.4 │
│          687.57 │
│          678.76 │
│          640.52 │
│          627.81 │
│          604.65 │
│          597.99 │
├─────────────────┤
│     20 rows     │
└─────────────────┘

┌─────────────────┐
│ engagement_rate │
│     double      │
├─────────────────┤
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │
│             0.0 │

In [7]:
# Count posts whose engagement rate is above 100 (reported under the alias "viral").
print(con.sql("""SELECT COUNT(*) AS viral
FROM metadata
WHERE engagement_rate >100"""))

┌───────┐
│ viral │
│ int64 │
├───────┤
│  1693 │
└───────┘



In [8]:
# Add er_log = log10(1 + engagement_rate), rounded to 6 decimals. The +1 keeps
# posts with an engagement rate of 0 defined (they map to 0). The column is added
# only if missing, and the UPDATE recomputes it for every row on each run.
con.execute("""
-- Compute log version of ER (+1) for the 0 cases
ALTER TABLE metadata ADD COLUMN IF NOT EXISTS er_log DOUBLE;

UPDATE metadata
SET er_log = ROUND(LOG10(1 + engagement_rate), 6);
""")

<_duckdb.DuckDBPyConnection at 0x1e8062e4cb0>

In [9]:
# Check the log transform: minimum and maximum of er_log over the whole table.
con.sql("""
SELECT MIN(er_log) AS min_log, MAX(er_log) AS max_log,
FROM metadata;
""").df()

Unnamed: 0,min_log,max_log
0,0.0,3.149401


In [10]:
# Store the 20th/40th/60th/80th percentiles of er_log in a one-row helper table.
con.execute("""
CREATE OR REPLACE TABLE er_quantiles AS
SELECT
  quantile_cont(er_log, 0.20) AS q20,
  quantile_cont(er_log, 0.40) AS q40,
  quantile_cont(er_log, 0.60) AS q60,
  quantile_cont(er_log, 0.80) AS q80
FROM metadata;
""")

# Label every post with one of five bins based on those cut points.
# Comparisons are strict (<), so a value equal to a cut point goes to the upper bin.
con.execute("""
ALTER TABLE metadata ADD COLUMN IF NOT EXISTS er_bins VARCHAR;

UPDATE metadata
SET er_bins = CASE
    WHEN er_log < (SELECT q20 FROM er_quantiles) THEN 'very_low'
    WHEN er_log < (SELECT q40 FROM er_quantiles) THEN 'low'
    WHEN er_log < (SELECT q60 FROM er_quantiles) THEN 'medium'
    WHEN er_log < (SELECT q80 FROM er_quantiles) THEN 'high'
    ELSE 'very_high'
END;
""")


<_duckdb.DuckDBPyConnection at 0x1e8062e4cb0>

In [11]:
# Per-bin summary: post count plus min / max / average engagement rate.
# The bins are built from increasing er_log cut points, so sorting on any one
# engagement_rate value per bin lists them from lowest to highest.
print(con.sql("""SELECT er_bins, COUNT(*) AS n_posts, MIN(engagement_rate) AS min, MAX(engagement_rate) AS max, AVG(engagement_rate) as avg
FROM metadata
GROUP BY er_bins
ORDER BY ANY_VALUE(engagement_rate)"""))

┌───────────┬─────────┬────────┬─────────┬────────────────────┐
│  er_bins  │ n_posts │  min   │   max   │        avg         │
│  varchar  │  int64  │ double │ double  │       double       │
├───────────┼─────────┼────────┼─────────┼────────────────────┤
│ very_low  │ 1903501 │    0.0 │     1.3 │ 0.7418407135065145 │
│ low       │ 1914195 │   1.31 │     2.4 │ 1.8423006903694297 │
│ medium    │ 1921815 │   2.41 │    3.86 │ 3.0851293230620174 │
│ high      │ 1911205 │   3.87 │    6.46 │  5.000273220298658 │
│ very_high │ 1915935 │   6.47 │ 1409.59 │ 11.375043250423689 │
└───────────┴─────────┴────────┴─────────┴────────────────────┘



In [6]:
# Same per-bin summary, computed on md1718.
# NOTE(review): md1718 is not created anywhere in the visible part of this notebook —
# confirm where it comes from before relying on a clean Restart & Run All.
print(con.sql("""SELECT er_bins, COUNT(*) AS n_posts, MIN(engagement_rate) AS min, MAX(engagement_rate) AS max, AVG(engagement_rate) as avg
FROM md1718
GROUP BY er_bins
ORDER BY ANY_VALUE(engagement_rate)"""))

┌───────────┬─────────┬────────┬─────────┬────────────────────┐
│  er_bins  │ n_posts │  min   │   max   │        avg         │
│  varchar  │  int64  │ double │ double  │       double       │
├───────────┼─────────┼────────┼─────────┼────────────────────┤
│ very_low  │  309727 │    0.0 │     1.3 │ 0.7728436332641109 │
│ low       │  332319 │   1.31 │     2.4 │  1.842062957579938 │
│ medium    │  331165 │   2.41 │    3.86 │ 3.0835372699408286 │
│ high      │  326683 │   3.87 │    6.46 │  4.996691165441903 │
│ very_high │  309532 │   6.47 │ 1409.59 │ 11.399102483750013 │
└───────────┴─────────┴────────┴─────────┴────────────────────┘



In [4]:
# Per-bin summary for the er_bins3 labelling of md1718.
# NOTE(review): neither md1718 nor the er_bins3 column is created in the visible
# part of this notebook — confirm their origin.
print(con.sql("""SELECT er_bins3, COUNT(*) AS n_posts, MIN(engagement_rate) AS min, MAX(engagement_rate) AS max, AVG(engagement_rate) as avg
FROM md1718
GROUP BY er_bins3
ORDER BY ANY_VALUE(engagement_rate)"""))

┌──────────┬─────────┬────────┬─────────┬────────────────────┐
│ er_bins3 │ n_posts │  min   │   max   │        avg         │
│ varchar  │  int64  │ double │ double  │       double       │
├──────────┼─────────┼────────┼─────────┼────────────────────┤
│ low      │  526637 │    0.0 │     2.0 │ 1.1342861781454838 │
│ medium   │  544228 │   2.01 │    4.45 │ 3.0882210948355038 │
│ high     │  538561 │   4.46 │ 1409.59 │  8.829733809169408 │
└──────────┴─────────┴────────┴─────────┴────────────────────┘



In [5]:
# Per-bin summary for the er_bins2 labelling of md1718.
# NOTE(review): neither md1718 nor the er_bins2 column is created in the visible
# part of this notebook — confirm their origin.
print(con.sql("""SELECT er_bins2, COUNT(*) AS n_posts, MIN(engagement_rate) AS min, MAX(engagement_rate) AS max, AVG(engagement_rate) as avg
FROM md1718
GROUP BY er_bins2
ORDER BY ANY_VALUE(engagement_rate)"""))

┌──────────┬─────────┬────────┬─────────┬────────────────────┐
│ er_bins2 │ n_posts │  min   │   max   │        avg         │
│ varchar  │  int64  │ double │ double  │       double       │
├──────────┼─────────┼────────┼─────────┼────────────────────┤
│ low      │  804349 │    0.0 │    3.04 │ 1.6065133667102887 │
│ high     │  805077 │   3.05 │ 1409.59 │  7.131251184669312 │
└──────────┴─────────┴────────┴─────────┴────────────────────┘



In [41]:
# Overall average engagement rate across `metadata`.
print(con.sql("""SELECT AVG(engagement_rate)
FROM metadata"""))

┌──────────────────────┐
│ avg(engagement_rate) │
│        double        │
├──────────────────────┤
│   4.4130444175291865 │
└──────────────────────┘



In [13]:
# Final sanity checks on `metadata`, printed in order: NULL followers, total
# row count, NULL engagement_rate, NULL er_log, NULL er_bins.
sanity_queries = (
    """
SELECT COUNT(*) AS n_null
FROM metadata
WHERE followers IS NULL
""",
    """
SELECT COUNT(*) AS n_rows
FROM metadata
""",
    """
SELECT COUNT(*) AS n_null
FROM metadata
WHERE engagement_rate IS NULL
""",
    """
SELECT COUNT(*) AS n_null
FROM metadata
WHERE er_log IS NULL
""",
    """
SELECT COUNT(*) AS n_null
FROM metadata
WHERE er_bins IS NULL
""",
)

for sanity_sql in sanity_queries:
    print(con.sql(sanity_sql))

┌────────┐
│ n_null │
│ int64  │
├────────┤
│      0 │
└────────┘

┌─────────┐
│ n_rows  │
│  int64  │
├─────────┤
│ 9566651 │
└─────────┘

┌────────┐
│ n_null │
│ int64  │
├────────┤
│      0 │
└────────┘

┌────────┐
│ n_null │
│ int64  │
├────────┤
│      0 │
└────────┘

┌────────┐
│ n_null │
│ int64  │
├────────┤
│      0 │
└────────┘



In [14]:
# List the catalog entries again, now that the tables derived from `metadata` have been created.
con.sql("""SELECT * FROM information_schema.tables""").fetchdf()

Unnamed: 0,table_catalog,table_schema,table_name,table_type,self_referencing_column_name,reference_generation,user_defined_type_catalog,user_defined_type_schema,user_defined_type_name,is_insertable_into,is_typed,commit_action,TABLE_COMMENT
0,meta,features,captions_clean,BASE TABLE,,,,,,YES,NO,,
1,meta,features,img_handcrafted,BASE TABLE,,,,,,YES,NO,,
2,meta,features,img_handcrafted_post,BASE TABLE,,,,,,YES,NO,,
3,meta,features,img_handcrafted_post4,BASE TABLE,,,,,,YES,NO,,
4,meta,features,img_handcrafted_post_new,BASE TABLE,,,,,,YES,NO,,
5,meta,main,caption_clean_1718,BASE TABLE,,,,,,YES,NO,,
6,meta,main,caption_lang_1718,BASE TABLE,,,,,,YES,NO,,
7,meta,main,clean_files,BASE TABLE,,,,,,YES,NO,,
8,meta,main,er_quantiles,BASE TABLE,,,,,,YES,NO,,
9,meta,main,images_manifest,BASE TABLE,,,,,,YES,NO,,


In [None]:
# Drop the helper table holding the quantile cut points; IF EXISTS keeps the
# cell re-runnable when the table has already been removed.
con.execute("""DROP TABLE IF EXISTS er_quantiles""")

In [49]:
# Show the column names and types of the `metadata` table.
con.sql("PRAGMA table_info('metadata');").fetchdf()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,filename,VARCHAR,False,,False
1,1,username,VARCHAR,False,,False
2,2,like_count,INTEGER,False,,False
3,3,comment_count,INTEGER,False,,False
4,4,width,INTEGER,False,,False
5,5,height,INTEGER,False,,False
6,6,time_utc,TIMESTAMP,False,,False
7,7,caption,VARCHAR,False,,False
8,8,aspect_ratio,DOUBLE,False,,False
9,9,area,INTEGER,False,,False


In [17]:
# Extract the 2017 and 2018 posts into their own table.
# CREATE OR REPLACE overwrites any existing `metadata1718`.
con.execute("""
CREATE OR REPLACE TABLE metadata1718 AS
SELECT *
FROM metadata
WHERE year IN (2017, 2018)
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1e8062e4cb0>

In [18]:
# Row count of the full table ...
print(con.sql("""
SELECT COUNT(*) AS n_rows
FROM metadata"""))

# ... versus the 2017-2018 subset.
print(con.sql("""
SELECT COUNT(*) AS n_rows_1718
FROM metadata1718"""))

┌─────────┐
│ n_rows  │
│  int64  │
├─────────┤
│ 9566651 │
└─────────┘

┌─────────────┐
│ n_rows_1718 │
│    int64    │
├─────────────┤
│     6936079 │
└─────────────┘



In [19]:
# Check for duplicate posts in metadata1718: total rows vs. distinct filenames
# (duplicates = rows that share a filename with another row).
con.sql("""SELECT COUNT(*) AS tot, COUNT(DISTINCT filename) AS single_posts, COUNT(*) - COUNT(DISTINCT filename) AS duplicates FROM metadata1718 """).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tot,single_posts,duplicates
0,6936079,6936079,0


In [20]:
# Close the DuckDB connection; `con` must be re-opened before any further query.
con.close()

# LANGUAGE

In [2]:
from lingua import Language, LanguageDetectorBuilder, IsoCode639_1
import unicodedata
import re

In [3]:
# ISO 639-1 codes of the languages the detector is restricted to; also used by
# detect_lang_lingua to validate the detected code.
SUPPORTED_LANGS = [
    "ar","bg","ca","cs","da","de","el","en","es","et","fa","fi","fr","gu","he",
    "hi","hr","hu","hy","id","it","ja","ka","ko","lt","lv","mk","mn","mr","ms",
    "nb","nl","pl","pt","ro","ru","sk","sl","sq","sr","sv","th","tr","uk","ur","vi"
]

In [4]:
# Turn each code string into a lingua IsoCode639_1 value, then into the matching
# Language member; the bare name on the last line displays the resulting list.
iso_objs = [IsoCode639_1(code) for code in SUPPORTED_LANGS]
LINGUA_LANGS = [Language.from_iso_code_639_1(iso) for iso in iso_objs]
LINGUA_LANGS

[Language.ARABIC,
 Language.BULGARIAN,
 Language.CATALAN,
 Language.CZECH,
 Language.DANISH,
 Language.GERMAN,
 Language.GREEK,
 Language.ENGLISH,
 Language.SPANISH,
 Language.ESTONIAN,
 Language.PERSIAN,
 Language.FINNISH,
 Language.FRENCH,
 Language.GUJARATI,
 Language.HEBREW,
 Language.HINDI,
 Language.CROATIAN,
 Language.HUNGARIAN,
 Language.ARMENIAN,
 Language.INDONESIAN,
 Language.ITALIAN,
 Language.JAPANESE,
 Language.GEORGIAN,
 Language.KOREAN,
 Language.LITHUANIAN,
 Language.LATVIAN,
 Language.MACEDONIAN,
 Language.MONGOLIAN,
 Language.MARATHI,
 Language.MALAY,
 Language.BOKMAL,
 Language.DUTCH,
 Language.POLISH,
 Language.PORTUGUESE,
 Language.ROMANIAN,
 Language.RUSSIAN,
 Language.SLOVAK,
 Language.SLOVENE,
 Language.ALBANIAN,
 Language.SERBIAN,
 Language.SWEDISH,
 Language.THAI,
 Language.TURKISH,
 Language.UKRAINIAN,
 Language.URDU,
 Language.VIETNAMESE]

In [5]:
# Build a lingua detector limited to the languages in LINGUA_LANGS.
detector = LanguageDetectorBuilder.from_languages(*LINGUA_LANGS).build()

In [7]:
import re
import unicodedata
import emoji

# Regex
URL_RE      = re.compile(r'https?://\S+|www\.\S+')
EMAIL_RE    = re.compile(r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b')
MENTION_RE  = re.compile(r'@\w+') 
AT_HASH_RE  = re.compile(r'@#') 
CAMEL_RE = re.compile(r'(?<=[a-z])(?=[A-Z])')
SYMBOL_RE = re.compile(r'[^0-9A-Za-zÀ-ÖØ-öø-ÿ\s]+')


# Base cleaning for all captions
def clean_base(text: str) -> str:
    if not isinstance(text, str):
        return ""

    s = unicodedata.normalize("NFKC", text)
    s = URL_RE.sub(" ", s)
    s = EMAIL_RE.sub(" ", s)
    s = MENTION_RE.sub(" ", s)
    s = AT_HASH_RE.sub(" ", s)
    s = emoji.replace_emoji(s, replace=" ")
    s = s.replace("#", " ")
    s = CAMEL_RE.sub(" ", s)
    s = SYMBOL_RE.sub(" ", s)
    s = re.sub(r"\s+", " ", s)

    return s.strip()

# For TF-IDF captions are lowercased
def clean_for_tfidf(text: str) -> str:
    s = clean_base(text)
    s = s.lower()   # solo tf-idf
    return s

# Bert and clip normal
def clean_for_bert(text: str) -> str:
    s = clean_base(text)
    return s

# Language detection
def clean_for_language(text: str) -> str:
    if not isinstance(text, str):
        return ""

    s = clean_base(text)

    # If the caption is non linguistic use empty caption
    if not any(c.isalpha() for c in s):
        return ""

    return s


def detect_lang_lingua(text: str) -> str:
    """Tag a caption with a lowercase ISO 639-1 language code.

    Returns:
        "none"        - nothing is left after clean_for_language();
        "unknown"     - the detector returned no language;
        "unsupported" - the detected code is not in SUPPORTED_LANGS;
        otherwise the lowercase ISO 639-1 code of the detected language.
    """
    cleaned = clean_for_language(text)
    if cleaned:
        detected = detector.detect_language_of(cleaned)
        if detected is not None:
            iso_code = detected.iso_code_639_1.name.lower()
            if iso_code in SUPPORTED_LANGS:
                return iso_code
            return "unsupported"
        return "unknown"
    return "none"

In [4]:
# Connection

DB_PATH = "D:/db/meta.duckdb"
con = duckdb.connect(DB_PATH)
try:
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Best effort: keep whatever thread count DuckDB already uses, but report
    # the failure instead of hiding it.
    print(f"Could not set threads=8, keeping the current DuckDB setting: {exc}")

print("Set up ready")

Set up ready


In [17]:
# Remove any previous caption_lang_1718 so the batch loop that fills it starts from an empty table.
con.execute("DROP TABLE IF EXISTS caption_lang_1718")

<_duckdb.DuckDBPyConnection at 0x1ec40136bf0>

In [18]:
# Target table: one row per post with its language tag and the two cleaned
# caption variants.
con.execute("""
    CREATE TABLE IF NOT EXISTS caption_lang_1718 (
        post_id TEXT,
        caption TEXT,
        caption_lang TEXT,
        caption_tfidf TEXT,
        caption_bert_clip TEXT
    )
""")

batch_size = 100000
offset = 0
batch_id = 0

# Page through metadata1718 in post_id order so only one batch is held in
# pandas at a time.
while True:
    batch = con.execute(f"""
        SELECT post_id, caption
        FROM metadata1718
        ORDER BY post_id
        LIMIT {batch_size} OFFSET {offset}
    """).df()

    if len(batch) == 0:
        break

    print(f"Processing batch {batch_id} ({len(batch)} rows)...")

    # Name the DataFrame columns exactly like the table columns so the INSERT
    # can match them by name instead of relying on positional order.
    batch["caption_lang"] = batch["caption"].apply(detect_lang_lingua)
    batch["caption_tfidf"] = batch["caption"].apply(clean_for_tfidf)
    batch["caption_bert_clip"] = batch["caption"].apply(clean_for_bert)

    con.register("tmp_chunk", batch)
    try:
        con.execute("""
            INSERT INTO caption_lang_1718
                (post_id, caption, caption_lang, caption_tfidf, caption_bert_clip)
            SELECT post_id, caption, caption_lang, caption_tfidf, caption_bert_clip
            FROM tmp_chunk
        """)
    finally:
        # Always release the temporary view, even if the insert fails.
        con.unregister("tmp_chunk")

    offset += batch_size
    batch_id += 1

print("Completato!")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 0 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 1 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 2 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 3 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 4 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 5 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 6 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 7 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 8 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 9 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 10 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 11 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 12 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 13 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 14 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 15 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 16 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 17 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 18 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 19 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 20 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 21 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 22 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 23 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 24 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 25 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 26 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 27 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 28 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 29 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 30 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 31 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 32 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 33 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 34 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 35 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 36 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 37 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 38 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 39 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 40 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 41 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 42 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 43 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 44 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 45 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 46 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 47 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 48 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 49 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 50 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 51 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 52 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 53 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 54 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 55 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 56 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 57 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 58 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 59 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 60 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 61 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 62 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 63 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 64 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 65 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 66 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 67 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 68 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 69 (36079 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Completato!


In [3]:
# Number of captions per language tag in caption_lang_1718, most frequent first.
con.sql("""select caption_lang, COUNT(*) AS n 
FROM caption_lang_1718 
GROUP BY caption_lang 
ORDER BY n DESC""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,caption_lang,n
0,en,5455700
1,it,227167
2,none,198356
3,es,191961
4,pt,175956
5,de,164740
6,fr,133559
7,nl,65944
8,nb,38668
9,sv,35532


In [20]:
# Spot check of raw vs. cleaned captions for rows tagged 'en'.
# LIMIT 10 without an ORDER BY returns whichever rows come first, not a random sample.
con.sql("""SELECT caption, caption_tfidf, caption_bert_clip FROM caption_lang_1718 WHERE caption_lang = 'en' LIMIT 10""").df()

Unnamed: 0,caption,caption_tfidf,caption_bert_clip
0,Latergram. #rednation #rockets #houstonrockets...,latergram rednation rockets houstonrockets clu...,Latergram rednation rockets houstonrockets clu...
1,Cassell appreciation night. Outfit last night ...,cassell appreciation night outfit last night v...,Cassell appreciation night Outfit last night v...
2,From the Clippers game! Loving this Harden mas...,from the clippers game loving this harden mask...,From the Clippers game Loving this Harden mask...
3,"""Everybody keep all your 2017 gear. We're goin...",everybody keep all your 2017 gear we re going ...,Everybody keep all your 2017 gear We re going ...
4,New Year's Eve game outfit. Rowdies' tradition...,new year s eve game outfit rowdies tradition o...,New Year s Eve game outfit Rowdies tradition o...
5,Rockets vs the Wizards outfit. Starter pullov...,rockets vs the wizards outfit starter pullover...,Rockets vs the Wizards outfit Starter pullover...
6,Thursday's game. Rowdies wore cartoon logo gea...,thursday s game rowdies wore cartoon logo gear...,Thursday s game Rowdies wore cartoon logo gear...
7,Texans game today! & a WIN!! Haven't been to a...,texans game today a win haven t been to a game...,Texans game today a WIN Haven t been to a game...
8,Outfit tonight vs the Hornets. 9 in a row!! Su...,outfit tonight vs the hornets 9 in a row such ...,Outfit tonight vs the Hornets 9 in a row Such ...
9,Pajama Night a week ago vs Memphis. Finally to...,pajama night a week ago vs memphis finally too...,Pajama Night a week ago vs Memphis Finally too...


In [21]:
# Count metadata1718 rows whose caption_len_char is NULL (reported as "empty_captions").
# NOTE(review): presumably a NULL caption_len_char marks a missing caption — confirm against how metadata1718 was built.
con.sql("""SELECT COUNT(*) AS empty_captions
FROM metadata1718
WHERE caption_len_char IS NULL""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,empty_captions
0,53298


In [24]:
# Total row count of caption_lang_1718.
con.sql("""SELECT COUNT(*) AS n FROM caption_lang_1718""").df()

Unnamed: 0,n
0,6936079


In [43]:
# Count the rows tagged 'en', plus the 'none' rows whose cleaned TF-IDF caption is empty.
con.sql("""
SELECT COUNT(*) AS N
FROM caption_lang_1718
WHERE caption_lang = 'en'
   OR (caption_lang = 'none' AND LEN(caption_tfidf) == 0)
""").df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,N
0,5643339


In [44]:
# Build caption_lang_1718_restricted: rows tagged 'en', plus 'none' rows whose
# cleaned TF-IDF caption is empty. Every other language tag is left out.
con.execute("""CREATE OR REPLACE TABLE caption_lang_1718_restricted AS
SELECT * 
FROM caption_lang_1718
WHERE caption_lang = 'en'
   OR (caption_lang = 'none' AND LEN(caption_tfidf) == 0)""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1ec40136bf0>

In [14]:
# Preview ten rows of caption_lang_1718_restricted.
con.sql("""SELECT * FROM caption_lang_1718_restricted LIMIT 10""").df()

Unnamed: 0,post_id,caption,caption_lang,caption_tfidf,caption_bert_clip
0,00_rocketgirl-1417779452246899116,Latergram. #rednation #rockets #houstonrockets...,en,latergram rednation rockets houstonrockets clu...,Latergram rednation rockets houstonrockets clu...
1,00_rocketgirl-1417781639316341557,Cassell appreciation night. Outfit last night ...,en,cassell appreciation night outfit last night v...,Cassell appreciation night Outfit last night v...
2,00_rocketgirl-1419026053921620717,From the Clippers game! Loving this Harden mas...,en,from the clippers game loving this harden mask...,From the Clippers game Loving this Harden mask...
3,00_rocketgirl-1419105391337727407,"""Everybody keep all your 2017 gear. We're goin...",en,everybody keep all your 2017 gear we re going ...,Everybody keep all your 2017 gear We re going ...
4,00_rocketgirl-1419400753193450161,New Year's Eve game outfit. Rowdies' tradition...,en,new year s eve game outfit rowdies tradition o...,New Year s Eve game outfit Rowdies tradition o...
5,00_rocketgirl-1422630785915391847,Rockets vs the Wizards outfit. Starter pullov...,en,rockets vs the wizards outfit starter pullover...,Rockets vs the Wizards outfit Starter pullover...
6,00_rocketgirl-1422672031652483552,Thursday's game. Rowdies wore cartoon logo gea...,en,thursday s game rowdies wore cartoon logo gear...,Thursday s game Rowdies wore cartoon logo gear...
7,00_rocketgirl-1422925437709109949,Texans game today! & a WIN!! Haven't been to a...,en,texans game today a win haven t been to a game...,Texans game today a WIN Haven t been to a game...
8,00_rocketgirl-1425217048153545387,Outfit tonight vs the Hornets. 9 in a row!! Su...,en,outfit tonight vs the hornets 9 in a row such ...,Outfit tonight vs the Hornets 9 in a row Such ...
9,00_rocketgirl-1432201383469474534,Pajama Night a week ago vs Memphis. Finally to...,en,pajama night a week ago vs memphis finally too...,Pajama Night a week ago vs Memphis Finally too...


In [9]:
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords

In [10]:
# Load the NLTK English stopword list once, as a set, so remove_stopwords()
# can test each token for membership.
EN_STOPWORDS = set(stopwords.words("english"))

def remove_stopwords(text):
    """Drop English stopwords from a whitespace-tokenised string.

    Tokens are compared case-insensitively against EN_STOPWORDS and the
    survivors are re-joined with single spaces. Non-string values and strings
    that are empty or whitespace-only are returned unchanged.
    """
    if isinstance(text, str) and text.strip():
        kept = filter(lambda word: word.lower() not in EN_STOPWORDS, text.split())
        return " ".join(kept)
    return text

In [11]:
# Remove any previous caption_clean_1718. IF EXISTS keeps this cell from
# raising when the table is absent (e.g. a fresh database on Restart & Run All).
con.execute("""DROP TABLE IF EXISTS caption_clean_1718""")

<_duckdb.DuckDBPyConnection at 0x17d5b7cb4f0>

In [12]:
# Create the target table for the stopword-free captions; it has the same
# five TEXT columns that the batch loop selects from caption_lang_1718_restricted.
con.execute("""
    CREATE TABLE IF NOT EXISTS caption_clean_1718 (
        post_id TEXT,
        caption TEXT,
        caption_lang TEXT,
        caption_tfidf TEXT,
        caption_bert_clip TEXT
    )
""")

<_duckdb.DuckDBPyConnection at 0x17d5b7cb4f0>

In [15]:
batch_size = 100000
batch_id = 0
offset = 0
stopword_columns = ("caption_tfidf", "caption_bert_clip")

# Copy caption_lang_1718_restricted into caption_clean_1718 one page at a time
# (ordered by post_id), stripping English stopwords from both cleaned captions.
while True:
    batch = con.execute(f"""
        SELECT post_id, caption, caption_lang, caption_tfidf, caption_bert_clip
        FROM caption_lang_1718_restricted
        ORDER BY post_id
        LIMIT {batch_size} OFFSET {offset}
    """).df()

    n_rows = len(batch)
    if n_rows == 0:
        break

    print(f"Processing batch {batch_id} ({n_rows} rows)...")

    # NOTE(review): stopwords are removed from the BERT/CLIP text as well as
    # the TF-IDF text — confirm this is intended for the downstream models.
    for column in stopword_columns:
        batch[column] = list(map(remove_stopwords, batch[column]))

    con.register("tmp_chunk", batch)
    con.execute("""
        INSERT INTO caption_clean_1718
        SELECT post_id, caption, caption_lang, caption_tfidf, caption_bert_clip
        FROM tmp_chunk
    """)
    con.unregister("tmp_chunk")

    batch_id += 1
    offset += batch_size

print("Completato!")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 0 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 1 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 2 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 3 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 4 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 5 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 6 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 7 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 8 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 9 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 10 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 11 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 12 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 13 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 14 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 15 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 16 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 17 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 18 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 19 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 20 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 21 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 22 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 23 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 24 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 25 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 26 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 27 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 28 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 29 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 30 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 31 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 32 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 33 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 34 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 35 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 36 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 37 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 38 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 39 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 40 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 41 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 42 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 43 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 44 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 45 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 46 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 47 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 48 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 49 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 50 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 51 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 52 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 53 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 54 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 55 (100000 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing batch 56 (43339 rows)...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Completato!


In [16]:
# Duplicate check on caption_lang_1718_restricted: total rows, distinct post_id values, and their difference.
con.sql("""SELECT COUNT(*) AS tot, COUNT(DISTINCT post_id) AS single_posts, COUNT(*) - COUNT(DISTINCT post_id) AS duplicates FROM caption_lang_1718_restricted """).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tot,single_posts,duplicates
0,5643339,5643339,0


In [17]:
# Duplicate check on metadata1718: total rows, distinct post_id values, and their difference.
con.sql("""SELECT COUNT(*) AS tot, COUNT(DISTINCT post_id) AS single_posts, COUNT(*) - COUNT(DISTINCT post_id) AS duplicates FROM metadata1718 """).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tot,single_posts,duplicates
0,6936079,6936079,0


In [18]:
# Duplicate check on caption_clean_1718 (after stopword removal): total rows, distinct post_id values, and their difference.
con.sql("""SELECT COUNT(*) AS tot, COUNT(DISTINCT post_id) AS single_posts, COUNT(*) - COUNT(DISTINCT post_id) AS duplicates FROM caption_clean_1718 """).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tot,single_posts,duplicates
0,5643339,5643339,0


In [19]:
# Preview raw vs. cleaned captions with their lengths: len_before is the raw
# caption length, len_after is the length of the stopword-free TF-IDF caption.
con.sql("""SELECT caption, caption_tfidf, caption_bert_clip, LEN(caption) AS len_before, LEN(caption_tfidf) AS len_after
FROM caption_clean_1718
LIMIT 20
""").df()

Unnamed: 0,caption,caption_tfidf,caption_bert_clip,len_before,len_after
0,Latergram. #rednation #rockets #houstonrockets...,latergram rednation rockets houstonrockets clu...,Latergram rednation rockets houstonrockets clu...,58,53
1,Cassell appreciation night. Outfit last night ...,cassell appreciation night outfit last night v...,Cassell appreciation night Outfit last night v...,116,109
2,From the Clippers game! Loving this Harden mas...,clippers game loving harden mask remember vote...,Clippers game Loving Harden mask Remember vote...,188,158
3,"""Everybody keep all your 2017 gear. We're goin...",everybody keep 2017 gear going need nba finals...,Everybody keep 2017 gear going need NBA finals...,285,202
4,New Year's Eve game outfit. Rowdies' tradition...,new year eve game outfit rowdies tradition bla...,New Year Eve game outfit Rowdies tradition bla...,252,197
5,Rockets vs the Wizards outfit. Starter pullov...,rockets vs wizards outfit starter pullover jac...,Rockets vs Wizards outfit Starter pullover jac...,290,240
6,Thursday's game. Rowdies wore cartoon logo gea...,thursday game rowdies wore cartoon logo gear t...,Thursday game Rowdies wore cartoon logo gear t...,259,215
7,Texans game today! & a WIN!! Haven't been to a...,texans game today win game since 2008 2007 abs...,Texans game today WIN game since 2008 2007 abs...,260,169
8,Outfit tonight vs the Hornets. 9 in a row!! Su...,outfit tonight vs hornets 9 row fun time rocke...,Outfit tonight vs Hornets 9 row fun time Rocke...,654,458
9,Pajama Night a week ago vs Memphis. Finally to...,pajama night week ago vs memphis finally took ...,Pajama Night week ago vs Memphis Finally took ...,477,328


In [None]:
# Show the column list of metadata1718 via PRAGMA table_info.
con.sql("""PRAGMA table_info(metadata1718)""").df()

In [20]:
# Build metadata1718_restricted: every metadata1718 column plus the language
# tag and cleaned captions, inner-joined on post_id. Posts with no row in
# caption_clean_1718 are therefore dropped.
con.execute("""
CREATE OR REPLACE TABLE metadata1718_restricted AS

SELECT m.*, c.caption_lang, c.caption_tfidf, c.caption_bert_clip
FROM metadata1718 m JOIN caption_clean_1718 c ON m.post_id = c.post_id
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x17d5b7cb4f0>

In [21]:
# Row count of metadata1718_restricted.
con.sql("""SELECT COUNT(*) AS T FROM metadata1718_restricted""")

┌─────────┐
│    T    │
│  int64  │
├─────────┤
│ 5643339 │
└─────────┘

In [22]:
# Preview captions next to their cleaned variants in metadata1718_restricted.
# NOTE(review): this selects caption_language, not the caption_lang column added
# by the join — presumably a column that already exists in metadata1718; confirm
# which of the two is meant here.
con.sql("""SELECT caption, caption_language, caption_tfidf, caption_bert_clip 
FROM metadata1718_restricted
LIMIT 20""").df()

Unnamed: 0,caption,caption_language,caption_tfidf,caption_bert_clip
0,"Terrible Tuesday, fixed. @mrburgertruck",en,terrible tuesday fixed,Terrible Tuesday fixed
1,Holy shut. @biggestburgers101,en,holy shut,Holy shut
2,If you don’t like royal stacks then I don’t li...,en,like royal stacks like,like royal stacks like
3,Bruh. @balwyncanteen are throwing round some o...,en,bruh throwing round best burgs game thoughts,Bruh throwing round best burgs game Thoughts
4,COMPETITION TIME! 🍔 🍻 Keen to try out Melbour...,en,competition time keen try melbourne best new s...,COMPETITION TIME Keen try Melbourne best new s...
5,Don’t exhale. Repost from the biggest piece o...,en,exhale repost biggest piece shit burger blogger,exhale Repost biggest piece shit burger blogger
6,Sometimes it’s the simplest of changes that ca...,en,sometimes simplest changes make biggest impact...,Sometimes simplest changes make biggest impact...
7,Happy birthday #rigbyandfitz! An honest look a...,en,happy birthday rigbyandfitz honest look adopti...,Happy birthday rigbyandfitz honest look adopti...
8,My house is not this clean right now. Perhaps ...,en,house clean right perhaps tomorrow get act tog...,house clean right Perhaps tomorrow get act tog...
9,Saturday morning recovering from being sponsor...,en,saturday morning recovering sponsors kids scho...,Saturday morning recovering sponsors kids scho...


In [23]:
# Close the DuckDB connection.
con.close()