In [1]:
import os
import re
import duckdb
import pandas as pd

In [2]:
# Configuration: DuckDB database file and the tab-separated JSON/image mapping file
DB_PATH = "D:/db/meta.duckdb"
MAPPING_FILE = "D:/dataset/JSON-Image_files_mapping.txt"

In [3]:
# Connection
# Open the DuckDB database at DB_PATH (defined once, in the configuration cell).
con = duckdb.connect(DB_PATH)
try:
    # Best effort: ask DuckDB to use 8 threads.
    con.execute("PRAGMA threads=8;")
except duckdb.InvalidInputException as exc:
    # Carry on with DuckDB's current thread setting, but report the failure
    # instead of swallowing it silently.
    print(f"Could not set threads=8, keeping DuckDB's current setting: {exc}")

print("Set up ready")

Set up ready


In [4]:
# Count the rows produced by joining images_manifest to metadata1718_restricted on post_id
joined_count_sql = """SELECT COUNT(*) AS tot_img FROM images_manifest i INNER JOIN metadata1718_restricted m ON i.post_id = m.post_id"""
con.sql(joined_count_sql).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,tot_img
0,7037456


In [6]:
# Load the mapping file (tab-separated; row 0 is treated as a header and
# replaced by the column names given in `names`).
# dtype=str + keep_default_na=False keep every field as literal text, so that
# values such as "null", "nan" or "NA" (possible usernames) are not converted
# to NaN and all-digit fields are not converted to numbers. Only truly empty
# fields are still read as missing (na_values=[""]).
df_map = pd.read_csv(
    MAPPING_FILE,
    sep="\t",
    names=["username", "json_file", "image_files"],
    header=0,
    dtype=str,
    keep_default_na=False,
    na_values=[""],
)

# Clean image_files column: remove the surrounding [] and the single quotes
df_map["image_files"] = (
    df_map["image_files"]
    .str.strip("[]")
    .str.replace("'", "", regex=False)
)

# If there is more than one image in the list, split it and place each image
# on its own row, then remove any leftover spaces.
# NOTE(review): an empty list ("[]") yields a row with an empty image_file —
# confirm whether such rows should be kept.
df_map = df_map.assign(image_file=df_map["image_files"].str.split(",")).explode("image_file")
df_map["image_file"] = df_map["image_file"].str.strip()

# Remove the .info extension from json_file to obtain the post identifier.
# The pattern is anchored at the end so only the extension itself is removed.
df_map["post"] = df_map["json_file"].str.replace(r"\.info$", "", regex=True)

# Final manifest
df_manifest = df_map[["username", "post", "image_file"]]

print("\n Clean manifest head:")
print(df_manifest.head())


 Clean manifest head:
        username                 post               image_file
0  00_rocketgirl  1188140434601337485  1188140434601337485.jpg
1  00_rocketgirl  1195378513372308151  1195378513372308151.jpg
2  00_rocketgirl  1198311457711775891  1198311457711775891.jpg
3  00_rocketgirl  1198947568398164749  1198947568398164749.jpg
4  00_rocketgirl  1199462506388578918  1199462506388578918.jpg


In [8]:
# Build the images_manifest table from the full pandas manifest.
# Registering the DataFrame makes it queryable from DuckDB under the name "df_manifest".
con.register("df_manifest", df_manifest)
create_manifest_sql = """
    CREATE OR REPLACE TABLE images_manifest AS
    SELECT * FROM df_manifest
"""
con.execute(create_manifest_sql)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x27075361e70>

In [9]:
# Preview 10 rows of images_manifest
manifest_preview = con.sql("""SELECT * FROM images_manifest LIMIT 10""")
print(manifest_preview)

┌───────────────┬─────────────────────┬─────────────────────────┐
│   username    │        post         │       image_file        │
│    varchar    │       varchar       │         varchar         │
├───────────────┼─────────────────────┼─────────────────────────┤
│ 00_rocketgirl │ 1188140434601337485 │ 1188140434601337485.jpg │
│ 00_rocketgirl │ 1195378513372308151 │ 1195378513372308151.jpg │
│ 00_rocketgirl │ 1198311457711775891 │ 1198311457711775891.jpg │
│ 00_rocketgirl │ 1198947568398164749 │ 1198947568398164749.jpg │
│ 00_rocketgirl │ 1199462506388578918 │ 1199462506388578918.jpg │
│ 00_rocketgirl │ 1207015291657184046 │ 1207015291657184046.jpg │
│ 00_rocketgirl │ 1208427977485902265 │ 1208427977485902265.jpg │
│ 00_rocketgirl │ 1208434140696863285 │ 1208434140696863285.jpg │
│ 00_rocketgirl │ 1208948523948630869 │ 1208948523948630869.jpg │
│ 00_rocketgirl │ 1210280536441938113 │ 1210280536441938113.jpg │
├───────────────┴─────────────────────┴─────────────────────────┤
│ 10 rows 

In [10]:
# Add post_id = "<username>-<post>", the key used to join images_manifest with
# the metadata tables.
# IF NOT EXISTS makes the cell safe to re-run: the ALTER becomes a no-op and
# the UPDATE simply recomputes the column.
con.execute("""ALTER TABLE images_manifest ADD COLUMN IF NOT EXISTS post_id TEXT;

UPDATE images_manifest
SET post_id = username || '-' || post;
""")


<_duckdb.DuckDBPyConnection at 0x27075361e70>

In [11]:
# Add full_image_file = "<username>-<image_file>".
# IF NOT EXISTS makes the cell safe to re-run: the ALTER becomes a no-op and
# the UPDATE simply recomputes the column.
con.execute("""ALTER TABLE images_manifest ADD COLUMN IF NOT EXISTS full_image_file TEXT;

UPDATE images_manifest
SET full_image_file = username || '-' || image_file
""")

<_duckdb.DuckDBPyConnection at 0x27075361e70>

In [12]:
# Preview 10 rows of images_manifest, now including post_id and full_image_file
manifest_with_keys_preview = con.sql("""SELECT * FROM images_manifest LIMIT 10""")
print(manifest_with_keys_preview)

┌───────────────┬─────────────────────┬─────────────────────────┬───────────────────────────────────┬───────────────────────────────────────┐
│   username    │        post         │       image_file        │              post_id              │            full_image_file            │
│    varchar    │       varchar       │         varchar         │              varchar              │                varchar                │
├───────────────┼─────────────────────┼─────────────────────────┼───────────────────────────────────┼───────────────────────────────────────┤
│ 00_rocketgirl │ 1188140434601337485 │ 1188140434601337485.jpg │ 00_rocketgirl-1188140434601337485 │ 00_rocketgirl-1188140434601337485.jpg │
│ 00_rocketgirl │ 1195378513372308151 │ 1195378513372308151.jpg │ 00_rocketgirl-1195378513372308151 │ 00_rocketgirl-1195378513372308151.jpg │
│ 00_rocketgirl │ 1198311457711775891 │ 1198311457711775891.jpg │ 00_rocketgirl-1198311457711775891 │ 00_rocketgirl-1198311457711775891.jpg │
│ 00_r

In [48]:
# Inspect how filename is stored in metadata, ahead of the join with the manifest
filename_sample = con.sql("""SELECT filename from metadata LIMIT 10""")
print(filename_sample)

┌─────────────────────────────────────────────────┐
│                    filename                     │
│                     varchar                     │
├─────────────────────────────────────────────────┤
│ christopher__williams__-309535128597189548.info │
│ corinnemunsch-294062078983787266.info           │
│ matt.leskopics-311034462970745983.info          │
│ mopofosho-313854271065802795.info               │
│ kallste-308061680557409132.info                 │
│ gi.bosha-314593465816336046.info                │
│ christopher__williams__-309511998042428110.info │
│ corinnemunsch-292548439395127822.info           │
│ matt.leskopics-305575320168600833.info          │
│ mopofosho-310773094553602201.info               │
├─────────────────────────────────────────────────┤
│                     10 rows                     │
└─────────────────────────────────────────────────┘



In [4]:
# Derive post_id by stripping the trailing ".info" from filename, so that it
# matches the post_id column of images_manifest ("<username>-<post>").
# IF NOT EXISTS keeps the cell re-runnable against a database where the column
# was already added by a previous run; the UPDATE then just recomputes it.
con.execute("""ALTER TABLE metadata ADD COLUMN IF NOT EXISTS post_id TEXT;

UPDATE metadata
SET post_id = REGEXP_REPLACE(filename, '\\.info$', '')
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1e19e06ff70>

In [5]:
# Compare filename with the derived post_id on 10 rows of metadata
metadata_post_id_preview = con.sql("""SELECT filename, post_id FROM metadata LIMIT 10""")
print(metadata_post_id_preview)

┌─────────────────────────────────────────────────┬────────────────────────────────────────────┐
│                    filename                     │                  post_id                   │
│                     varchar                     │                  varchar                   │
├─────────────────────────────────────────────────┼────────────────────────────────────────────┤
│ christopher__williams__-309535128597189548.info │ christopher__williams__-309535128597189548 │
│ corinnemunsch-294062078983787266.info           │ corinnemunsch-294062078983787266           │
│ matt.leskopics-311034462970745983.info          │ matt.leskopics-311034462970745983          │
│ mopofosho-313854271065802795.info               │ mopofosho-313854271065802795               │
│ kallste-308061680557409132.info                 │ kallste-308061680557409132                 │
│ gi.bosha-314593465816336046.info                │ gi.bosha-314593465816336046                │
│ christopher__williams__-3095

In [6]:
# Check posts not in the manifest: count the metadata rows whose post_id has no
# match in images_manifest.
# This has to be an anti-join (LEFT JOIN ... WHERE right side IS NULL): an inner
# JOIN only returns posts that ARE in the manifest, so it can never reveal a
# missing one regardless of the filter applied afterwards.
# NOTE(review): re-run this cell to confirm that all posts are in the manifest.
con.sql("""
SELECT COUNT(*) AS posts_notinthemanifest
FROM metadata AS m
LEFT JOIN images_manifest AS im
ON im.post_id = m.post_id
WHERE im.post_id IS NULL
""").fetchdf()

Unnamed: 0,posts_notinthemanifest
0,0


In [7]:
# Derive post_id by stripping the trailing ".info" from filename, so that it
# matches the post_id column of images_manifest ("<username>-<post>").
# IF NOT EXISTS keeps the cell re-runnable against a database where the column
# was already added by a previous run; the UPDATE then just recomputes it.
con.execute("""ALTER TABLE metadata1718 ADD COLUMN IF NOT EXISTS post_id TEXT;

UPDATE metadata1718
SET post_id = REGEXP_REPLACE(filename, '\\.info$', '')
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x1e19e06ff70>

In [8]:
# Compare filename with the derived post_id on 10 rows of metadata1718
metadata1718_post_id_preview = con.sql("""SELECT filename, post_id FROM metadata1718 LIMIT 10""")
print(metadata1718_post_id_preview)

┌───────────────────────────────────────────┬──────────────────────────────────────┐
│                 filename                  │               post_id                │
│                  varchar                  │               varchar                │
├───────────────────────────────────────────┼──────────────────────────────────────┤
│ momendeavors-1622822447027290024.info     │ momendeavors-1622822447027290024     │
│ momendeavors-1624382483813729721.info     │ momendeavors-1624382483813729721     │
│ momendeavors-1625846565628594234.info     │ momendeavors-1625846565628594234     │
│ momendeavors-1628770804958624054.info     │ momendeavors-1628770804958624054     │
│ momendeavors-1630139708478039074.info     │ momendeavors-1630139708478039074     │
│ momendeavors-1632373161072096880.info     │ momendeavors-1632373161072096880     │
│ momendeavors-1633095406497186653.info     │ momendeavors-1633095406497186653     │
│ momentsoffashion-1616230832582468314.info │ momentsoffashion-16

In [9]:
# Duplicate check on the images manifest: total rows vs. distinct
# full_image_file values; the difference is reported as `duplicates`
duplicate_summary_sql = """
SELECT COUNT(*) AS total_images_manifest, 
    COUNT(DISTINCT full_image_file) AS distinct_files_manifest,
    COUNT(*) - COUNT(DISTINCT full_image_file) AS duplicates
FROM images_manifest
"""
print(con.sql(duplicate_summary_sql))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌───────────────────────┬─────────────────────────┬────────────┐
│ total_images_manifest │ distinct_files_manifest │ duplicates │
│         int64         │          int64          │   int64    │
├───────────────────────┼─────────────────────────┼────────────┤
│              12788311 │                12788311 │          0 │
└───────────────────────┴─────────────────────────┴────────────┘



In [20]:
# List every full_image_file value that occurs more than once in the manifest
duplicated_files = con.sql("""
SELECT full_image_file
FROM images_manifest
GROUP BY full_image_file
HAVING COUNT(*) > 1
""")
print(duplicated_files)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────────────────────────────────────┐
│             full_image_file             │
│                 varchar                 │
├─────────────────────────────────────────┤
│ alifaa_official-1823855317402873435.jpg │
│ alifaa_official-1905663485423190964.jpg │
│ bbcbreakfast-1550561612795775428.jpg    │
│ bbcbreakfast-1550580032232861485.jpg    │
│ bbcbreakfast-1643760591976738239.jpg    │
│ bbcbreakfast-1720272291897831528.jpg    │
│ bbcbreakfast-1796244788253962545.jpg    │
│ bbcbreakfast-1809562876198030871.jpg    │
│ bbcbreakfast-1845549468913970241.jpg    │
│ bbcbreakfast-1891333173703746351.jpg    │
│                  ·                      │
│                  ·                      │
│                  ·                      │
│ whoisapneet-1763922393938282961.jpg     │
│ whoisapneet-1823448344446421476.jpg     │
│ estefi_varela-1765711360384478453.jpg   │
│ estefi_varela-1776346646105595896.jpg   │
│ estefi_varela-1824149912407090345.jpg   │
│ estefi_varela-1932069836722340

In [21]:
# Spot check one duplicated file: show all manifest rows for it to verify
# that the repeated rows are identical
duplicate_example = con.sql("""
SELECT *
FROM images_manifest
WHERE full_image_file = '561finestt-1734106947009198393.jpg'
""")
print(duplicate_example)

┌────────────┬─────────────────────┬─────────────────────────┬────────────────────────────────┬────────────────────────────────────┐
│  username  │        post         │       image_file        │            post_id             │          full_image_file           │
│  varchar   │       varchar       │         varchar         │            varchar             │              varchar               │
├────────────┼─────────────────────┼─────────────────────────┼────────────────────────────────┼────────────────────────────────────┤
│ 561finestt │ 1734106947009198393 │ 1734106947009198393.jpg │ 561finestt-1734106947009198393 │ 561finestt-1734106947009198393.jpg │
│ 561finestt │ 1734106947009198393 │ 1734106947009198393.jpg │ 561finestt-1734106947009198393 │ 561finestt-1734106947009198393.jpg │
└────────────┴─────────────────────┴─────────────────────────┴────────────────────────────────┴────────────────────────────────────┘



In [22]:
# New manifest version without duplicates: keep one row per full_image_file.
# QUALIFY filters on the window function directly, so the helper row number is
# never stored. The previous "SELECT * FROM (subquery with rn) WHERE rn = 1"
# left an extra `rn` column in images_manifest, which then propagated to
# images_manifest1718 (SELECT im.*) and to the exported Parquet files.
con.execute("""
CREATE OR REPLACE TABLE images_manifest AS
SELECT *
FROM images_manifest
QUALIFY ROW_NUMBER() OVER (PARTITION BY full_image_file ORDER BY full_image_file) = 1;
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x27075361e70>

In [10]:
# Verify the de-duplication: rows, distinct full_image_file values and their difference
dedupe_check = con.sql("""
SELECT COUNT(*) AS tot_rows, COUNT(DISTINCT full_image_file) AS distinct_files, COUNT(*)- COUNT(DISTINCT full_image_file) AS duplicates
FROM images_manifest
""")
print(dedupe_check)

┌──────────┬────────────────┬────────────┐
│ tot_rows │ distinct_files │ duplicates │
│  int64   │     int64      │   int64    │
├──────────┼────────────────┼────────────┤
│ 12788311 │       12788311 │          0 │
└──────────┴────────────────┴────────────┘



In [24]:
# Export the images_manifest table to a Parquet file
EXPORT_PATH = "D:/dataset/manifest.parquet"

export_manifest_sql = f"""
    COPY images_manifest TO '{EXPORT_PATH}' (FORMAT PARQUET);
"""
con.execute(export_manifest_sql)

print(f"Completed: {EXPORT_PATH}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Completed: D:/dataset/manifest.parquet


In [25]:
# Create the manifest for the years 2017-2018: keep the manifest rows whose
# post_id appears in metadata1718.
# EXISTS (a semi-join) returns each manifest row at most once, whereas a plain
# JOIN would emit one copy per matching metadata1718 row if a post_id were
# ever repeated there. With unique post_ids the result is the same.
con.execute("""
CREATE OR REPLACE TABLE images_manifest1718 AS
SELECT im.*
FROM images_manifest AS im
WHERE EXISTS (
    SELECT 1
    FROM metadata1718 AS m
    WHERE m.post_id = im.post_id
)
""")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<_duckdb.DuckDBPyConnection at 0x27075361e70>

In [11]:
# Duplicate check for the 2017-2018 manifest: rows, distinct full_image_file values and their difference
duplicates_1718 = con.sql("""
SELECT COUNT(*) AS tot_rows, COUNT(DISTINCT full_image_file) AS distinct_files, COUNT(*)- COUNT(DISTINCT full_image_file) AS duplicates
FROM images_manifest1718
""")
print(duplicates_1718)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────┬────────────────┬────────────┐
│ tot_rows │ distinct_files │ duplicates │
│  int64   │     int64      │   int64    │
├──────────┼────────────────┼────────────┤
│  8666384 │        8666384 │          0 │
└──────────┴────────────────┴────────────┘



In [27]:
# Export the images manifest table for years 2017 and 2018 to a Parquet file
EXPORT_PATH = "D:/dataset/manifest1718.parquet"

export_manifest1718_sql = f"""
    COPY images_manifest1718 TO '{EXPORT_PATH}' (FORMAT PARQUET);
"""
con.execute(export_manifest1718_sql)

print(f"Completed: {EXPORT_PATH}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Completed: D:/dataset/manifest1718.parquet


In [28]:
# Show up to 20 posts that have more than one image in the manifest
# (finding recorded by the author: yes, some posts have several images)
multi_image_posts = con.sql("""
SELECT post_id, COUNT(*) AS n_images 
FROM images_manifest
GROUP BY post_id
HAVING COUNT(*) > 1
LIMIT 20
""")
print(multi_image_posts)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌──────────────────────────────────────┬──────────┐
│               post_id                │ n_images │
│               varchar                │  int64   │
├──────────────────────────────────────┼──────────┤
│ nigerianmuslim-1929786633266030459   │        4 │
│ noguiltlife-1748039212744408371      │        5 │
│ nylonmag-1978095663916942074         │        6 │
│ o.snelly-1925238736008693243         │        4 │
│ ownbyfemme-1919275620778211886       │        3 │
│ palegirlrambling-1984162358607819088 │        4 │
│ parkeryorksmith-1973002269877882363  │        2 │
│ passionforbaking-1960414447883209100 │        9 │
│ pinkpeonyhome-2006211272940553643    │        9 │
│ pourmwabeauty-1909734534960479580    │        7 │
│ pralexsoares-1738961985967593646     │       10 │
│ julianaalvesiam-2027822269306141837  │        5 │
│ juliaroga-2003587493332540266        │        3 │
│ jungeun5869-2034110558933274797      │        9 │
│ k_ara7977-1779579682158955784        │        6 │
│ kalakleind

In [29]:
# Distribution of images per post: for each image count, the number of posts having it
images_per_post_sql = """
SELECT 
  n_images,
  COUNT(*) AS n_posts
FROM (
    SELECT post_id, COUNT(*) AS n_images
    FROM images_manifest
    GROUP BY post_id
)
GROUP BY n_images
ORDER BY n_images
"""
df_dist = con.sql(images_per_post_sql).fetchdf()

print(df_dist.head(30))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

    n_images  n_posts
0          1  8938584
1          2   523522
2          3   244834
3          4   127746
4          5    73780
5          6    46023
6          7    29842
7          8    22096
8          9    18687
9         10    35713
10        11       10
11        12       12
12        13       13
13        14        6
14        15        4
15        16        5
16        17        2
17        18        5
18        19        5
19        20        2
20        21        1
21        25        2
22        28        1
23        29        1
24        30        1
25        34        2
26        52        1


In [30]:
# Spot check on one post: look up the same username / post_id pair in
# metadata1718 and in images_manifest1718 to confirm that post_id has the
# form "<username>-<post>" in both tables

metadata_side = con.sql(""" SELECT username, post_id
from metadata1718
where username = 'beyanaverage_' and post_id = 'beyanaverage_-1656889677721269922'
""")
print(metadata_side)

manifest_side = con.sql(""" select username, post_id, image_file
from images_manifest1718
where username = 'beyanaverage_' and post_id = 'beyanaverage_-1656889677721269922'
""")
print(manifest_side)

┌───────────────┬───────────────────────────────────┐
│   username    │              post_id              │
│    varchar    │              varchar              │
├───────────────┼───────────────────────────────────┤
│ beyanaverage_ │ beyanaverage_-1656889677721269922 │
└───────────────┴───────────────────────────────────┘

┌───────────────┬───────────────────────────────────┬─────────────────────────┐
│   username    │              post_id              │       image_file        │
│    varchar    │              varchar              │         varchar         │
├───────────────┼───────────────────────────────────┼─────────────────────────┤
│ beyanaverage_ │ beyanaverage_-1656889677721269922 │ 1656889677721269922.jpg │
└───────────────┴───────────────────────────────────┴─────────────────────────┘



In [31]:
# Row count of the full manifest
total_images = con.sql("""SELECT COUNT(*) AS n FROM images_manifest""")
print(total_images)

# Row count of the 2017-2018 manifest
total_images_1718 = con.sql("""
SELECT COUNT(*) AS n_images_1718
FROM images_manifest1718""")
print(total_images_1718)

┌──────────┐
│    n     │
│  int64   │
├──────────┤
│ 12788311 │
└──────────┘

┌───────────────┐
│ n_images_1718 │
│     int64     │
├───────────────┤
│       8666384 │
└───────────────┘



In [32]:
# Count the rows of the exported 2017-2018 Parquet file by querying it directly
parquet_row_count = con.sql("""
SELECT COUNT(*) AS n FROM "D:/dataset/manifest1718.parquet"
""")
print(parquet_row_count)

┌─────────┐
│    n    │
│  int64  │
├─────────┤
│ 8666384 │
└─────────┘



In [4]:
# Count the manifest rows that match metadata1718_restricted on post_id
# (this cell only counts; it does not create any table)
restricted_join_sql = """
SELECT COUNT(*) AS new
FROM images_manifest AS im
JOIN metadata1718_restricted AS m 
ON im.post_id = m.post_id
"""
con.sql(restricted_join_sql).df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,new
0,7037456


In [7]:
# Show the column names and types of metadata1718_restricted
table_info_sql = """PRAGMA table_info('metadata1718_restricted')"""
con.sql(table_info_sql).df()

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,filename,VARCHAR,False,,False
1,1,username,VARCHAR,False,,False
2,2,like_count,INTEGER,False,,False
3,3,comment_count,INTEGER,False,,False
4,4,width,INTEGER,False,,False
5,5,height,INTEGER,False,,False
6,6,time_utc,TIMESTAMP,False,,False
7,7,caption,VARCHAR,False,,False
8,8,aspect_ratio,DOUBLE,False,,False
9,9,area,INTEGER,False,,False
