# Influencers.txt file cleaning and export

The script imports the influencers.txt file and cleans it.
Then selects the relevant columns and prepares a Parquet export.

In [2]:
# Import libraries
import os, duckdb

In [4]:
# Paths configuration
DB_PATH     = "D:/db/meta.duckdb"
INPUT_PATH  = "D:/dataset/influencers.txt" 
OUT_DIR     = "D:/dataset/dim" 
EXPORT_PARQUET = True
EXPORT_CSV     = False

# Start connection
con = duckdb.connect(DB_PATH)

# Set the threads, otherwise duckdb uses the maximum available by default
try:
    con.execute("PRAGMA threads=8;") 
except duckdb.InvalidInputException:
    print("PRAGMA threads non supportato in questa versione, continuo con default.")

print("Set up ready")

Set up ready


In [14]:
con.execute("""DROP TABLE influencers_raw""")

<duckdb.duckdb.DuckDBPyConnection at 0x26950a2b230>

In [16]:
# 1) Import raw data from the original file 
con.execute(f"""
CREATE OR REPLACE VIEW influencers_raw AS
SELECT * FROM read_csv(
  '{INPUT_PATH.replace("'", "''")}',
  delim='\t', header=true,
  columns={{'Username':'VARCHAR','Category':'VARCHAR','#Followers':'VARCHAR','#Followees':'VARCHAR','#Posts':'VARCHAR'}},
  quote='', escape='',
  comment='=',          
  null_padding=true,
  ignore_errors=true
);
""")
print("Step 1 completed", '\n')

Step 1 completed 



In [34]:
print(con.sql("""
SELECT COUNT(*) as null_username
FROM influencers_raw
WHERE Username IS NULL
"""))

┌───────────────┐
│ null_username │
│     int64     │
├───────────────┤
│             0 │
└───────────────┘



In [36]:
print(con.sql("""
SELECT COUNT(*) as null_username
FROM influencers_raw
WHERE Category = ''
"""))

┌───────────────┐
│ null_username │
│     int64     │
├───────────────┤
│             0 │
└───────────────┘



In [22]:
print(con.sql("""
SELECT COUNT(*) as duplicate_username
FROM influencers_raw
GROUP BY Username
HAVING COUNT(*) > 1
"""))

┌────────────────────┐
│ duplicate_username │
│       int64        │
├────────────────────┤
│       0 rows       │
└────────────────────┘



In [24]:
print(con.sql("""
SELECT *
FROM influencers_raw
LIMIT 10
"""))

┌───────────────────┬──────────┬────────────┬────────────┬─────────┐
│     Username      │ Category │ #Followers │ #Followees │ #Posts  │
│      varchar      │ varchar  │  varchar   │  varchar   │ varchar │
├───────────────────┼──────────┼────────────┼────────────┼─────────┤
│ makeupbynvs       │ beauty   │ 1432       │ 1089       │ 363     │
│ jaquelinevandoski │ beauty   │ 137600     │ 548        │ 569     │
│ anisaartistry     │ beauty   │ 64644      │ 289        │ 391     │
│ rubina_muartistry │ beauty   │ 496406     │ 742        │ 887     │
│ beautyxabbi       │ beauty   │ 2050       │ 1423       │ 751     │
│ vemakeup713       │ beauty   │ 99226      │ 613        │ 536     │
│ glaminfusion      │ beauty   │ 4717       │ 506        │ 287     │
│ paisleymattes     │ beauty   │ 108193     │ 1240       │ 1008    │
│ meghanmakeup_     │ beauty   │ 1430       │ 1160       │ 284     │
│ mariana.beautyy   │ beauty   │ 6460       │ 2330       │ 469     │
├───────────────────┴──────────┴──

In [63]:
# 3) Cleaning and normalizing features
con.execute(r"""
CREATE OR REPLACE TABLE influencers AS
WITH cleaned AS (
  SELECT
    lower(trim(regexp_replace(Username, '^@', ''))) AS username,
    lower(trim(Category)) AS category_raw,
    regexp_replace(lower("#Followers"), '\s+', '') AS followers_txt,
    regexp_replace("#Followees", '[^0-9]', '') AS followees_txt,
    regexp_replace("#Posts", '[^0-9]', '') AS posts_txt
  FROM influencers_raw
  WHERE Username IS NOT NULL
),
parsed AS (
  SELECT
    username,
    COALESCE(NULLIF(category_raw,''), 'other') AS category,
    CASE
      WHEN followers_txt ~ 'm$'
        THEN CAST(regexp_extract(followers_txt, '([0-9]+\.?[0-9]*)', 1) AS DOUBLE)*1e6
      WHEN followers_txt ~ 'k$'
        THEN CAST(regexp_extract(followers_txt, '([0-9]+\.?[0-9]*)', 1) AS DOUBLE)*1e3
      ELSE CAST(regexp_replace(followers_txt, '[^0-9]', '') AS BIGINT)
    END::BIGINT AS followers,
    TRY_CAST(followees_txt AS BIGINT) AS followees,
    TRY_CAST(posts_txt AS BIGINT)     AS posts
  FROM cleaned
)
SELECT
  username,
  any_value(category) AS category,
  max(followers)      AS followers,
  max(followees)      AS followees,
  max(posts)          AS posts
FROM parsed
WHERE username IS NOT NULL AND username <> ''
GROUP BY 1;
""")
print("Step 3 completed", '\n')

Step 1 completed 

Step 2 completed 

Step 3 completed 



In [40]:
# 2) Cleaning and normalizing features
con.execute(r"""
CREATE OR REPLACE TABLE influencers AS
SELECT
    lower(trim(Username)) AS username,
    lower(trim(Category)) AS category,
    TRY_CAST("#Followers" AS INTEGER) AS followers,
    TRY_CAST("#Followees" AS INTEGER) AS followees,
    TRY_CAST("#Posts" AS INTEGER) AS posts
FROM influencers_raw
""")
print("Step 2 completed", '\n')

Step 2 completed 



In [46]:
# CHECKS
print("Starting checks", "\n")

# 1) Check the number of rows and the presence of users with 0 or null #followers
print(con.execute("SELECT COUNT(*) AS n_rows, COUNT(DISTINCT username) AS n_users FROM influencers").fetchdf(), "\n")
print(con.execute("""
SELECT
  SUM(followers IS NULL) AS n_null_followers,
  SUM(followers = 0)     AS n_zero_followers
FROM influencers
""").fetchdf(), "\n")


# 2) Table schema
print(con.execute("DESCRIBE influencers;").fetchdf(), '\n')

# 3) Show first ten rows, and top 10 users by number of followers
print(con.execute("SELECT * FROM influencers LIMIT 10;").fetchdf(), '\n')
print(con.execute("""
    SELECT username, category, followers, followees, posts
    FROM influencers
    ORDER BY followers DESC NULLS LAST
    LIMIT 10;
""").fetchdf())

# 4) Check the categories and the number of users for each category
print(con.sql("""
SELECT category, COUNT(*) AS n_users_per_category
FROM influencers 
GROUP BY category
ORDER BY n_users_per_category DESC
"""))

Starting checks 

   n_rows  n_users
0   33935    33935 

   n_null_followers  n_zero_followers
0               0.0               0.0 

  column_name column_type null   key default extra
0    username     VARCHAR  YES  None    None  None
1    category     VARCHAR  YES  None    None  None
2   followers     INTEGER  YES  None    None  None
3   followees     INTEGER  YES  None    None  None
4       posts     INTEGER  YES  None    None  None 

            username category  followers  followees  posts
0        makeupbynvs   beauty       1432       1089    363
1  jaquelinevandoski   beauty     137600        548    569
2      anisaartistry   beauty      64644        289    391
3  rubina_muartistry   beauty     496406        742    887
4        beautyxabbi   beauty       2050       1423    751
5        vemakeup713   beauty      99226        613    536
6       glaminfusion   beauty       4717        506    287
7      paisleymattes   beauty     108193       1240   1008
8      meghanmakeup_   be

In [65]:
# Category correction

con.execute("""
UPDATE influencers
SET category = 'fashion'
WHERE category IN ('fasion', 'fashion 0.5')
""")
print("Table updated")

# Check
print(con.sql("""
SELECT category, COUNT(*) AS n_users_per_category
FROM influencers
GROUP BY category
ORDER BY n_users_per_category DESC
"""))

Table updated
┌──────────┬──────────────────────┐
│ category │ n_users_per_category │
│ varchar  │        int64         │
├──────────┼──────────────────────┤
│ fashion  │                11913 │
│ other    │                 5720 │
│ travel   │                 4210 │
│ family   │                 4070 │
│ food     │                 3565 │
│ beauty   │                 1542 │
│ interior │                 1195 │
│ fitness  │                 1133 │
│ pet      │                  587 │
└──────────┴──────────────────────┘



In [67]:
# Export either Parquet or CSV format
if EXPORT_PARQUET or EXPORT_CSV:
    os.makedirs(OUT_DIR, exist_ok=True)

if EXPORT_PARQUET:
    con.execute(f"""
    COPY (SELECT * FROM influencers)
    TO '{os.path.join(OUT_DIR, "influencers.parquet").replace("'", "''")}'
    (FORMAT PARQUET, COMPRESSION ZSTD);
    """)
    print("\n[Parquet salvato] ", os.path.join(OUT_DIR, "influencers.parquet"))

if EXPORT_CSV:
    con.execute(f"""
    COPY (SELECT * FROM influencers)
    TO '{os.path.join(OUT_DIR, "influencers.csv").replace("'", "''")}'
    (HEADER, DELIMITER ',');
    """)
    print("[CSV saved] ", os.path.join(OUT_DIR, "influencers.csv"))
print("The cleaning process is done.")


[Parquet salvato]  D:/dataset/dim\influencers.parquet
The cleaning process is done.


In [69]:
# Check
con.sql("""
CREATE OR REPLACE VIEW v_influencers AS
SELECT *
FROM "D:/dataset/dim/influencers.parquet";""")

print(con.sql("""SELECT category, COUNT(*) AS n_users_per_category
FROM v_influencers
GROUP BY category
ORDER BY n_users_per_category DESC
"""))

┌──────────┬──────────────────────┐
│ category │ n_users_per_category │
│ varchar  │        int64         │
├──────────┼──────────────────────┤
│ fashion  │                11913 │
│ other    │                 5720 │
│ travel   │                 4210 │
│ family   │                 4070 │
│ food     │                 3565 │
│ beauty   │                 1542 │
│ interior │                 1195 │
│ fitness  │                 1133 │
│ pet      │                  587 │
└──────────┴──────────────────────┘



In [71]:
# Close the connection
con.close()
print("Connection closed")

Connection closed
