In [1]:
import os
import random
import json

data_dir = "./info"
sample_size = 20
sampled_paths = []

# Reservoir sampling from a directory iterator
with os.scandir(data_dir) as entries:
    count = 0
    for entry in entries:
        if entry.is_file() and entry.name.endswith(".json"):
            count += 1
            if len(sampled_paths) < sample_size:
                sampled_paths.append(entry.path)
            else:
                s = random.randint(0, count - 1)
                if s < sample_size:
                    sampled_paths[s] = entry.path

# Load and print the sampled JSON contents
for path in sampled_paths:
    print(f"--- {os.path.basename(path)} ---")
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            print(json.dumps(data, indent=2))
    except json.JSONDecodeError:
        print(f"Error decoding {path}")

KeyboardInterrupt: 

In [8]:
import os
import json
data_dir = "./info"

max_files_to_check = 3

with os.scandir(data_dir) as entries:
    count = 0
    for entry in entries:
        if entry.is_file():
            print(f"\n--- {entry.name} ---")
            try:
                with open(entry.path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    try:
                        parsed = json.loads(content)
                        print(json.dumps(parsed, indent=2))
                    except json.JSONDecodeError:
                        print("(Not valid JSON)\n")
                        print(content[:500])  # Show first 500 characters
            except Exception as e:
                print(f"Error reading file: {e}")
            count += 1
        if count >= max_files_to_check:
            break




--- camilamayrink-1820701983913779268.info ---
{
  "gating_info": null,
  "viewer_can_reshare": true,
  "display_resources": [
    {
      "src": "https://scontent-lax3-1.cdninstagram.com/vp/6af7cf9a6c3e32272a1204da16f3b7e9/5DC14E8A/t51.2885-15/sh0.08/e35/p640x640/36159903_1045684415595469_1279112771954278400_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com",
      "config_width": 640,
      "config_height": 800
    },
    {
      "src": "https://scontent-lax3-1.cdninstagram.com/vp/5bf55880eeab2d5a58b20e24e8597832/5DA45E4E/t51.2885-15/sh0.08/e35/p750x750/36159903_1045684415595469_1279112771954278400_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com",
      "config_width": 750,
      "config_height": 937
    },
    {
      "src": "https://scontent-lax3-1.cdninstagram.com/vp/15998fdc05f3cd437479748966644145/5DBA61E0/t51.2885-15/e35/36159903_1045684415595469_1279112771954278400_n.jpg?_nc_ht=scontent-lax3-1.cdninstagram.com",
      "config_width": 1080,
      "config_height": 1350
    }
  ],
  "

In [10]:
import os
import json

def print_structure(obj, indent=0):
    prefix = "  " * indent
    if isinstance(obj, dict):
        for key, value in obj.items():
            value_type = type(value).__name__
            print(f"{prefix}- {key} ({value_type})")
            print_structure(value, indent + 1)
    elif isinstance(obj, list):
        print(f"{prefix}- [list of {len(obj)} items]")
        if obj:
            print_structure(obj[0], indent + 1)  # just inspect the first item
    else:
        pass  # leave it here, you already printed the type above

# === Main Logic ===

data_dir = "./info"
max_files_to_check = 3

with os.scandir(data_dir) as entries:
    count = 0
    for entry in entries:
        if entry.is_file():
            print(f"\n--- {entry.name} ---")
            try:
                with open(entry.path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    try:
                        parsed = json.loads(content)
                        print_structure(parsed)
                    except json.JSONDecodeError:
                        print("Not valid JSON.")
            except Exception as e:
                print(f"Error reading file: {e}")
            count += 1
        if count >= max_files_to_check:
            break



--- camilamayrink-1820701983913779268.info ---
- gating_info (NoneType)
- viewer_can_reshare (bool)
- display_resources (list)
  - [list of 3 items]
    - src (str)
    - config_width (int)
    - config_height (int)
- viewer_in_photo_of_you (bool)
- viewer_has_saved_to_collection (bool)
- viewer_has_saved (bool)
- owner (dict)
  - username (str)
  - is_unpublished (bool)
  - requested_by_viewer (bool)
  - followed_by_viewer (bool)
  - has_blocked_viewer (bool)
  - profile_pic_url (str)
  - full_name (str)
  - blocked_by_viewer (bool)
  - is_verified (bool)
  - id (str)
  - is_private (bool)
- viewer_has_liked (bool)
- id (str)
- should_log_client_event (bool)
- edge_media_preview_like (dict)
  - count (int)
  - edges (list)
    - [list of 0 items]
- edge_media_to_tagged_user (dict)
  - edges (list)
    - [list of 1 items]
      - node (dict)
        - y (float)
        - x (float)
        - user (dict)
          - username (str)
          - profile_pic_url (str)
          - is_verifie

In [13]:
import os
import json
import pandas as pd

def extract_parent_comments_with_replies(directory, max_files=10000):
    results = []
    file_count = 0

    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.name.endswith('.info') or not entry.is_file():
                continue

            if file_count >= max_files:
                break

            try:
                with open(entry.path, 'r') as f:
                    data = json.load(f)

                parent_edges = data.get("edge_media_to_parent_comment", {}).get("edges", [])
                for edge in parent_edges:
                    parent_node = edge.get("node", {})
                    replies = parent_node.get("edge_threaded_comments", {}).get("edges", [])
                    if not replies:
                        continue  # Skip if no replies

                    for reply in replies:
                        reply_node = reply.get("node", {})
                        results.append({
                            "post_file": entry.name,
                            "parent_id": parent_node.get("id"),
                            "parent_text": parent_node.get("text"),
                            "parent_username": parent_node.get("owner", {}).get("username"),
                            "reply_id": reply_node.get("id"),
                            "reply_text": reply_node.get("text"),
                            "reply_username": reply_node.get("owner", {}).get("username"),
                            "reply_created_at": reply_node.get("created_at"),
                        })

                file_count += 1

            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {entry.name}: {e}")

    return pd.DataFrame(results)

# === Usage ===
directory_path = "./info"  # Replace with your actual path
df = extract_parent_comments_with_replies(directory_path)

# Save or inspect the result
df.to_csv("parent_comments_with_replies.csv", index=False)
print(df.head())


                                post_file          parent_id  \
0  camilamayrink-1820701983913779268.info  17948873974106177   
1  camilamayrink-1820701983913779268.info  17948873974106177   
2   amy_greaves1-1999073510287538981.info  17874886636317849   
3   amy_greaves1-1999073510287538981.info  18013846654180946   
4   amy_greaves1-1999073510287538981.info  18043568035026146   

                                         parent_text  \
0  Gentem neh possível só eu acho ela idêntica a ...   
1  Gentem neh possível só eu acho ela idêntica a ...   
2                                        Love you ❤️   
3  Cannot wait for Friday!! Hope you’re feeling b...   
4                     Hope you’re ok @amy_greaves1 😘   

             parent_username           reply_id  \
0             itsabrinapires  17932118980162111   
1             itsabrinapires  17847289990301702   
2               staceyrose95  18046594462050034   
3  mummyandherbusylittlebees  17844791821380406   
4              kirsty_l

In [22]:
import os
import json
import pandas as pd
from langdetect import detect, LangDetectException

def is_english(text):
    try:
        language = detect(text)
        #print(f"Detected language for text: {text[:30]}... -> {language}")  # Print the first 30 chars
        return language == 'en'
    except LangDetectException:
        return False

def extract_parent_comments_with_replies(directory, max_files=2000):
    results = []
    file_count = 0

    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.name.endswith('.info') or not entry.is_file():
                continue

            if file_count >= max_files:
                break

            try:
                with open(entry.path, 'r') as f:
                    data = json.load(f)

                parent_edges = data.get("edge_media_to_parent_comment", {}).get("edges", [])
                #print(f"Found {len(parent_edges)} parent comments in {entry.name}")  # Debug print

                for edge in parent_edges:
                    parent_node = edge.get("node", {})
                    parent_text = parent_node.get("text", "")
                    replies = parent_node.get("edge_threaded_comments", {}).get("edges", [])

                    if not replies:
                        #print(f"Skipping parent comment {parent_node.get('id')} with no replies.")
                        continue  # Skip if no replies

                    # Filter out non-English parent comment
                    if not is_english(parent_text):
                        #print(f"Skipping non-English parent comment: {parent_text[:30]}...")
                        continue

                    for reply in replies:
                        reply_node = reply.get("node", {})
                        reply_text = reply_node.get("text", "")

                        # Filter out non-English replies
                        if not is_english(reply_text):
                            #print(f"Skipping non-English reply: {reply_text[:30]}...")
                            continue

                        results.append({
                            "post_file": entry.name,
                            "parent_id": parent_node.get("id"),
                            "parent_text": parent_text,
                            "parent_username": parent_node.get("owner", {}).get("username"),
                            "reply_id": reply_node.get("id"),
                            "reply_text": reply_text,
                            "reply_username": reply_node.get("owner", {}).get("username"),
                            "reply_created_at": reply_node.get("created_at"),
                        })

                file_count += 1

            except (json.JSONDecodeError, IOError) as e:
                print(f"Error reading {entry.name}: {e}")

    return pd.DataFrame(results)

# === Usage ===
directory_path = "./info"  # Replace with your actual path
df = extract_parent_comments_with_replies(directory_path,  max_files=20000)

# Save or inspect the result
df.to_csv("english_parent_comments_with_replies.csv", index=False)
print(df.head())


                                 post_file          parent_id  \
0    amy_greaves1-1999073510287538981.info  18013846654180946   
1    amy_greaves1-1999073510287538981.info  18043568035026146   
2    amy_greaves1-1999073510287538981.info  18043568035026146   
3  unswayedbeauty-1999456657348645708.info  17926718065301908   
4        nobiggie-1999149379567945592.info  17856514567362355   

                                         parent_text  \
0  Cannot wait for Friday!! Hope you’re feeling b...   
1                     Hope you’re ok @amy_greaves1 😘   
2                     Hope you’re ok @amy_greaves1 😘   
3                                           Lashes 😍   
4             And it makes them no slip! Great idea!   

             parent_username           reply_id  \
0  mummyandherbusylittlebees  17844791821380406   
1              kirsty_leigh3  18045575728012493   
2              kirsty_leigh3  17866566469341344   
3            just_lowkey_che  18013112428149339   
4                

In [6]:
import os
import json
import pandas as pd
from langdetect import detect, LangDetectException
import re
import spacy

nlp = spacy.load("en_core_web_sm")


def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def is_long_enough(text, min_words=5):

    return len(text.strip().split()) >= min_words

def count_sentences_spacy(text):
    doc = nlp(text)
    return len(list(doc.sents))

def contains_second_person(text):
    # \b ensures whole-word matching; re.I makes it case-insensitive
    return bool(re.search(r"\b(you|your|you're)\b", text, re.I))

def remove_hashtags_emojis(text):
    # Remove hashtags (words starting with #)
    text = re.sub(r'#\w+', '', text)
    
    # Remove emojis (using Unicode ranges for emojis)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U00002600-\U000026FF"  # Misc symbols
        "\U00002B50-\U00002BFF"  # Additional miscellaneous symbols
        "]+",
        flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    
    # Optionally, remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def extract_parent_comments_with_replies(directory, max_files=2000):
    results = []
    file_count = 0

    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.name.endswith('.info') or not entry.is_file():
                continue

            if file_count >= max_files:
                break

            try:
                with open(entry.path, 'r') as f:
                    data = json.load(f)

                # Get post caption and timestamp (if available)
                caption_edges = data.get("edge_media_to_caption", {}).get("edges", [])
                caption_text = caption_edges[0]["node"]["text"] if caption_edges else None
                post_timestamp = data.get("taken_at_timestamp")

                parent_edges = data.get("edge_media_to_parent_comment", {}).get("edges", [])

                for edge in parent_edges:
                    parent_node = edge.get("node", {})
                    parent_text = parent_node.get("text", "")
                    parent_timestamp = parent_node.get("created_at")
                    replies = parent_node.get("edge_threaded_comments", {}).get("edges", [])

                    if not replies:
                        continue

        

                    # Filter: parent must be English and long enough
                    if not (is_english(parent_text) and is_long_enough(parent_text, min_words=20)):
                        continue

                    for reply in replies:
                        reply_node = reply.get("node", {})
                        reply_text = reply_node.get("text", "")
                        reply_timestamp = reply_node.get("created_at")

                        preprocess_reply = remove_hashtags_emojis(reply_text)
                        
                        # Filter: reply must be English and long enough
                        if not (is_english(reply_text) and is_long_enough(reply_text, min_words=20) and contains_second_person(reply_text)):
                            continue
                        #print(reply_text, preprocess_reply)
                        results.append({
                            "post_file": entry.name,
                            "post_caption": caption_text,
                            "post_timestamp": post_timestamp,
                            "parent_id": parent_node.get("id"),
                            "parent_text": parent_text,
                            "parent_username": parent_node.get("owner", {}).get("username"),
                            "parent_timestamp": parent_timestamp,
                            "reply_id": reply_node.get("id"),
                            "reply_text": reply_text,
                            "pre-processed_reply_text": remove_hashtags_emojis(reply_text),
                            "reply_username": reply_node.get("owner", {}).get("username"),
                            "reply_timestamp": reply_timestamp,
                            "reply_sentences": count_sentences_spacy(reply_text),
                        })

                file_count += 1

            except (json.JSONDecodeError, IOError) as e:
                continue  # silently skip bad files

    return pd.DataFrame(results)

# === Usage ===
directory_path = "./info"  # Replace with your actual path
df = extract_parent_comments_with_replies(directory_path, max_files=500000)

# Save the final result
df.to_csv("lg_english_parent_comments_with_replies.csv", index=False)

# Preview
print(df.head())


                                    post_file  \
0       brogantatexo-1936310624556763513.info   
1        yuckylavado-1760855587001880768.info   
2  theshrinkingmomma-1918327642138974144.info   
3   alexandralee1016-1779172089367121793.info   
4       iloveandreea-1948890621332093120.info   

                                        post_caption  post_timestamp  \
0  So Brogan, what did you get up to this weekend...      1545046235   
1  Volendam, Netherlands is beautiful... and so a...      1524130365   
2  I’m not counting points today but that doesn’t...      1542902497   
3  Please VOTE for me!!! 🗳 This updo 👆🏼placed me ...      1526313862   
4  Talking about favourite bronzers ... ❤️ Mine h...      1546545888   

           parent_id                                        parent_text  \
0  18003253969078267  You should get a jigsaw puzzle app on the iPad...   
1  17925863074113423  Enjoy🇳🇱🧀❣️My favourite country & city is Londo...   
2  17928608815234893  The recalled packages had

In [45]:
import os
import json
import pandas as pd
from langdetect import detect, LangDetectException
import re
import spacy

nlp = spacy.load("en_core_web_sm")


def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def is_long_enough(text, min_words=5):

    return len(text.strip().split()) >= min_words

def count_sentences_spacy(text):
    doc = nlp(text)
    return len(list(doc.sents))

def contains_second_person(text):
    # \b ensures whole-word matching; re.I makes it case-insensitive
    return bool(re.search(r"\b(you|your|you're)\b", text, re.I))

def remove_hashtags_emojis(text):
    # Remove hashtags (words starting with #)
    text = re.sub(r'#\w+', '', text)
    
    # Remove emojis (using Unicode ranges for emojis)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U00002600-\U000026FF"  # Misc symbols
        "\U00002B50-\U00002BFF"  # Additional miscellaneous symbols
        "]+",
        flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)
    
    # Optionally, remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def extract_parent_comments_with_replies(directory, max_files=2000):
    results = []
    file_count = 0

    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.name.endswith('.info') or not entry.is_file():
                continue

            if file_count >= max_files:
                break

            try:
                with open(entry.path, 'r') as f:
                    data = json.load(f)

                # Get post caption and timestamp (if available)
                owner = data.get("owner", {}).get("username")
                caption_edges = data.get("edge_media_to_caption", {}).get("edges", [])
                caption_text = caption_edges[0]["node"]["text"] if caption_edges else None
                post_timestamp = data.get("taken_at_timestamp")

                parent_edges = data.get("edge_media_to_parent_comment", {}).get("edges", [])
                results.append({
                            "influence": owner,
                            "post_file": entry.name,
                            "post_caption": caption_text,
                            "post_timestamp": post_timestamp,
                        })
                
                
                file_count += 1

            except (json.JSONDecodeError, IOError) as e:
                continue  # silently skip bad files

    return pd.DataFrame(results)

# === Usage ===
directory_path = "./info"  # Replace with your actual path
df = extract_parent_comments_with_replies(directory_path, max_files=500000)

# Save the final result
df.to_csv("lg_english_parent_comments_with_influencers.csv", index=False)

# Preview
print(df.head())


         influence                                 post_file  \
0    camilamayrink    camilamayrink-1820701983913779268.info   
1     amy_greaves1     amy_greaves1-1999073510287538981.info   
2     patty.pepper     patty.pepper-1988498168844843264.info   
3  virginiasvieira  virginiasvieira-1807593278284415666.info   
4        katgraham        katgraham-1810877982156672389.info   

                                        post_caption  post_timestamp  
0  Poetry is what happens\nWhen your mind stops w...      1531264611  
1  YOU ARE EVERYTHING ❤ •\n•\nI'm so glad we're h...      1552528154  
2              Repeat after me... YOU ARE ENOUGH! ❤️      1551267475  
3  Um dos presentes que tenho cogitado em colocar...      1529701932  
4  Thank you @jay_art18 To my beautiful fans, fro...      1530093499  


In [28]:
influencers = pd.read_csv('/Users/amandacurry/Downloads/influencers.txt', sep='\t')

In [31]:
influencers

Unnamed: 0,Username,Category,#Followers,#Followees,#Posts
0,makeupbynvs,beauty,1432,1089,363
1,jaquelinevandoski,beauty,137600,548,569
2,anisaartistry,beauty,64644,289,391
3,rubina_muartistry,beauty,496406,742,887
4,beautyxabbi,beauty,2050,1423,751
...,...,...,...,...,...
33930,alingzhang,other,4597,881,365
33931,flower.jini,other,27093,1599,664
33932,ester_starling,other,20621,23,783
33933,lovely___yul,other,12381,811,223


In [36]:
df['influencer'] = df['post_file'].str.split('-').str[0]

In [39]:
df = df.merge(influencers, how='left', left_on='influencer', right_on='Username', indicator=True)
df.head()

Unnamed: 0,post_file,post_caption,post_timestamp,parent_id,parent_text,parent_username,parent_timestamp,reply_id,reply_text,reply_username,reply_timestamp,influencer,Username,Category,#Followers,#Followees,#Posts,_merge
0,amy_greaves1-1999073510287538981.info,YOU ARE EVERYTHING ❤ •\n•\nI'm so glad we're h...,1552528154,18013846654180946,Cannot wait for Friday!! Hope you’re feeling b...,mummyandherbusylittlebees,1552575409,17844791821380406,@mummyandherbusylittlebees thank you lovely 😘,amy_greaves1,1552578907,amy_greaves1,amy_greaves1,family,2782,2023,545,both
1,nobiggie-1999149379567945592.info,Here's a fun idea to add a little more green t...,1552537198,17856514567362355,And it makes them no slip! Great idea!,marciebpat,1552543000,17911468999289338,@marciebpat yes! Thanks so much!💚,nobiggie,1552588261,nobiggie,nobiggie,food,20076,338,2262,both
2,passion_art_photography-1722702416497304149.info,Another one from my trip to south tyrol. I had...,1519582152,17915305093116829,This place blew my mind,runawayrocio,1519669599,17911811977121752,@runawayrocio glad u like it...thanks a lot fo...,passion_art_photography,1519671559,passion_art_photography,passion_art_photography,travel,5145,659,643,both
3,passion_art_photography-1722702416497304149.info,Another one from my trip to south tyrol. I had...,1519582152,17927062030027443,This is ridiculously beautiful. Such an amazin...,thedronelad,1519727868,17912251591124802,@thedronelad thank u so much 😉✌🏼,passion_art_photography,1519937627,passion_art_photography,passion_art_photography,travel,5145,659,643,both
4,brookesouza-1760003657572814735.info,Throwing salt on my name will only season my s...,1524028807,17920524391083586,Yaaas! So great to meet you! You gonna come BI...,c_hirata_photography,1524030716,17861572000244890,@c_hirata_photography Yassss need to go there ...,brookesouza,1524031519,brookesouza,brookesouza,fashion,11296,571,235,both


In [41]:
df.Category.value_counts()


Category
fashion     7005
family      4282
food        3836
travel      2886
other       2193
interior    2092
fitness      928
beauty       704
pet          525
Name: count, dtype: int64

In [46]:
sampled_df = df.groupby(['Category'], group_keys=False).apply(lambda x: x.sample(n=110))


  sampled_df = df.groupby(['Category'], group_keys=False).apply(lambda x: x.sample(n=110))


### Youtube Sampling

In [68]:
youtube = pd.concat([pd.read_csv('/Users/amandacurry/gender bias in cvs/youtube_comments_with_replies.csv'), pd.read_csv('/Users/amandacurry/gender bias in cvs/youtube_comments_with_replies_pt2.csv')])

youtube = youtube[youtube['reply_text'].notna()]
youtube = youtube[youtube['comment_text'].apply(lambda x: isinstance(x, str) and len(x.strip().split()) >= 5)]

youtube.channel_title.value_counts()

channel_title
JennaMarbles                                6767
ContraPoints                                6049
Sam                                         4902
David Dobrik                                2175
LiamKyleSullivan                            1854
shakiraVEVO                                 1225
WIRED                                       1045
Bob Ross                                     621
Flashgitz                                    461
tasha                                        201
Elite Facts    1.8m views     2 days ago     172
Hannah Louise Poston                         150
Kendall Rae                                  135
Dude Perfect                                 109
Robert Welsh                                 107
producerdan                                  104
Nefertiti ASMR                                86
Adult Swim                                    35
Peter Bragiel                                 25
Team Coco                                      9
TeamFo

In [70]:
youtube_sample = youtube.sample(n=2000)
# Set number of samples per channel (or fewer if not enough)
n_samples = 100  # or any number suitable for your analysis

youtube_sample = youtube.groupby('channel_title', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), n_samples), random_state=42)
)
youtube_sample.channel_title.value_counts()


  youtube_sample = youtube.groupby('channel_title', group_keys=False).apply(


channel_title
LiamKyleSullivan                            100
JennaMarbles                                100
shakiraVEVO                                 100
producerdan                                 100
WIRED                                       100
Sam                                         100
Robert Welsh                                100
Kendall Rae                                 100
tasha                                       100
Hannah Louise Poston                        100
Flashgitz                                   100
Elite Facts    1.8m views     2 days ago    100
Dude Perfect                                100
David Dobrik                                100
ContraPoints                                100
Bob Ross                                    100
Nefertiti ASMR                               86
Adult Swim                                   35
Peter Bragiel                                25
Team Coco                                     9
Barnacules Nerdgasm       

In [73]:
youtube_sample.sample(frac=1).to_csv('youtube_comment_sample_with_replies_only.csv', index=False)

In [None]:
import re
from langdetect import detect, LangDetectException
from googleapiclient.discovery import build
import time
from googleapiclient.errors import HttpError


API_KEY = 'AIzaSyBjenBURlJbaJvME5JwjU3chM9HvLRwiCo'
REGION_CODE = "US"  # change as needed
MAX_VIDEOS = 200     # number of trending videos to process

youtube = build("youtube", "v3", developerKey=API_KEY)

def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False

def count_sentences(text):
    # Simple sentence splitter (can be improved)
    sentences = re.split(r'[.!?]+', text)
    return len([s for s in sentences if s.strip() != ''])

def count_words_excluding_emojis(text):
    # Remove emojis (Unicode ranges)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"
        "\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF"
        "\U0001F1E0-\U0001F1FF"
        "\U00002700-\U000027BF"
        "\U0001F900-\U0001F9FF"
        "\U00002600-\U000026FF"
        "\U00002B50-\U00002BFF"
        "]+",
        flags=re.UNICODE
    )
    text_no_emoji = emoji_pattern.sub(r'', text)
    words = text_no_emoji.strip().split()
    return len(words)

def contains_second_person(text):
    return bool(re.search(r"\b(you|your|you're)\b", text, re.I))

def get_trending_videos(region_code, max_results=200):
    request = youtube.videos().list(
        part="snippet",
        chart="mostPopular",
        regionCode=region_code,
        maxResults=max_results
    )
    response = request.execute()
    return response.get("items", [])

def get_comments_and_replies(video_id):
    results = []
    try:
        # Get top-level comments
        request = youtube.commentThreads().list(
            part="snippet,replies",
            videoId=video_id,
            maxResults=1000,
            textFormat="plainText"
        )
        while request:
            response = request.execute()

            for item in response.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]
                comment_text = comment["textDisplay"]

                # Replies may not always be included; fetch if present
                replies = item.get("replies", {}).get("comments", [])

                for reply in replies:
                    reply_text = reply["snippet"]["textDisplay"]

                    # Apply filters on reply
                    if not is_english(reply_text):
                        continue
                    if count_sentences(reply_text) <= 2:
                        continue
                    if count_words_excluding_emojis(reply_text) <= 20:
                        continue
                    if not contains_second_person(reply_text):
                        continue

                    results.append({
                        "comment_text": comment_text,
                        "reply_text": reply_text
                    })

            request = youtube.commentThreads().list_next(request, response)
    except HttpError as e:
        reason = e.error_details[0].get("reason", "")
        if reason == "commentsDisabled":
            print(f"[SKIPPED] Video {video_id}: Comments disabled.")
        else:
            print(f"[ERROR] Video {video_id}: {reason}")
    return results

def main():
    trending_videos = get_trending_videos(REGION_CODE, MAX_VIDEOS)

    all_data = []

    for video in trending_videos:
        time.sleep(1)  # To avoid hitting API rate limits
        vid_id = video["id"]
        snippet = video["snippet"]
        title = snippet["title"]
        channel_title = snippet["channelTitle"]
        category_id = snippet["categoryId"]
        description = snippet.get("description", "")

        print(f"Processing video: {title}")

        comment_replies = get_comments_and_replies(vid_id)

        for cr in comment_replies:
            all_data.append({
                "video_id": vid_id,
                "video_title": title,
                "channel_title": channel_title,
                "video_description": description,
                "category_id": category_id,
                "comment_text": cr["comment_text"],
                "reply_text": cr["reply_text"],
            })

    print(f"Total filtered replies collected: {len(all_data)}")

    # Example: save to CSV
    df = pd.DataFrame(all_data)
    df.to_csv("youtube_comments_replies_filtered.csv", index=False)

if __name__ == "__main__":
    main()


Processing video: I Made Millions by Openly Ripping Off Every Person Who Hired Me - Cash Cleaner Simulator
Processing video: ELDEN RING NIGHTREIGN | Bring Me to Life Trailer
Processing video: Yailin La Mas Viral - Todos Mienten (Video Oficial)
Processing video: Can you Break a Diamond with a Tank?
Processing video: Inter Miami CF vs. CF Montréal | Full Match Highlights | Messi and Luis Suárez Score 4 Goals!
Processing video: Is Elden Ring Nightreign ACTUALLY Good?
Processing video: 2025 State of Origin Match Highlights | Maroons v Blues | Game 1
Processing video: KAZE’S ORIGIN STORY! The first Ultra Legendary Brawler! (Brawl Stars Animation)
Processing video: BABYMETAL x Slaughter To Prevail - Song 3 (OFFICIAL MUSIC VIDEO)
Processing video: Jourdan Blue's "Breakeven" Cover Receives a GOLDEN BUZZER From Howie Mandel! | Auditions | AGT 2025
Processing video: JEFF PROBST ANNOUNCES THE CAST OF SURVIVOR 50!
Processing video: Scottie Scheffler VS Nelly Korda
Processing video: Manchester Uni

In [48]:
from googleapiclient.discovery import build

YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

def get_trending_video_ids(max_results=50, region_code='US'):
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=API_KEY)
    video_ids = []
    next_page_token = None

    while len(video_ids) < max_results:
        request = youtube.videos().list(
            part="id",
            chart="mostPopular",
            regionCode=region_code,
            maxResults=min(50, max_results - len(video_ids)),
            pageToken=next_page_token
        )
        response = request.execute()

        ids = [item['id'] for item in response['items']]
        video_ids.extend(ids)

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break  # No more pages

    return video_ids

# Usage
video_ids = get_trending_video_ids()
print(video_ids)


['jtemgoebPPE', 'rLprbptDcwI', 'ynrSkSYirB0', 'e6IO3Yuzkl4', 'Ot6j_qqYQaM', '4lXEyZR2UCg', 'jJUNWeyCXzM', 'xtSu7662jOQ', 'ZlFZtEzEQ1E', 'WjL0sp18T9c', 'B5vN0qCSg_4', 'xpVkt2iba4k', 'l-jALlnNNS0', 'KEMVgy51kPE', 'UXAdn8uwyaY', '2xLAXA1ViIo', 'OKtLPp7YYOE', 'NYS5HSUVdz8', 'hAdLlG9Rfd4', 'bvtfOWlSmzc', '-kEsQ_4ZliI', 'xbrrYy-5llg', 'v08YeRYPmKY', '7yrT5eImSMU', 'ht9fG0-PLVQ', 'G7AXRrKnUDE', '3PQLBCD1WxQ', '7nueUpy_xZY', 'qh2bUSFFEYw', 'nsqHCfO1ayQ', '_P8-zxZid4Y', 'scOooV7j8fk', 'V6y80kO4_Tc', 'hyGGFlfm-Pg', 'wDNg1pwrTVg', 'ZXc9_u6BYr8', 'OSCOQ6vnLwU', 'A1MdThqGarI', '93otzGCvijI', 'IeaOeyrfe7k', 'AWrXpJQBJF0', 'Ubx-bTFzC5g', 'By8onCWSO9c', 'IIYw0wDjSyw', 'DR4234R9spc', 'CVsrbSpJCX8', 'BIqvS6BpkLo', 'WYf9-xfm6t8', 'Rg_jppK1I2Q', 'ybktw-KV_s0']


In [49]:
len(video_ids)

50

In [32]:
from googleapiclient.discovery import build
API_KEY = 'AIzaSyBjenBURlJbaJvME5JwjU3chM9HvLRwiCo'

youtube = build("youtube", "v3", developerKey=API_KEY)

video_id = "xbrrYy-5llg"  # Replace with a confirmed public video ID

request = youtube.commentThreads().list(
    part="snippet,replies",
    videoId=video_id,
    maxResults=100,
    textFormat="plainText"
)

response = request.execute()

for item in response["items"]:
    top_comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
    print("Top comment:", top_comment)
    for reply in item.get("replies", {}).get("comments", []):
        print(" ↳ Reply:", reply["snippet"]["textDisplay"])


Top comment: 👏 👏 🔥
Top comment: She always sings the same songs🤮
Top comment: They tossing her around like a ragged doll - I’m sure what Diddy did back in the day as she was looking for her come up
Top comment: Question: At a 2025  music award show, if there's no Taylor Swift, is it really a legitimate award show?
Answer: No
Side note:  Jennifer Lopez??????
Top comment: Holy heck she looks old AF
Top comment: Her only talent is masking her lack of singing talent 😂
Top comment: Dance again ♥️
Top comment: WTF is happening
Top comment: Groooosssss 🤮
Top comment: 🔥🔥🔥🔥🔥🔥
Top comment: Janet Jackson and JLo can still make moves...
Top comment: This woman never disappoints! They don't make entertainers like this anymore - take notes👏🏽
Top comment: Overdanged Beyoncé, no choreography from her on this cow boy tour
Top comment: This JL was straight up TIGHT 👌
Top comment: Wow!!!!! Amazing!! And in her 50’s!
Top comment: La reyna jennifer lopez
Top comment: only here to see the apt part
Top comme

In [45]:
import pandas as pd

instagram = pd.read_csv('extracted data/instagram_both_dimensions.csv')

In [46]:
instagram

Unnamed: 0,post_file,post_caption,post_timestamp,parent_id,parent_text,parent_username,parent_timestamp,reply_id,reply_text,pre-processed_reply_text,reply_username,reply_timestamp,reply_sentences,dimension,comment_dimension
0,brogantatexo-1936310624556763513.info,"So Brogan, what did you get up to this weekend...",1545046235,18003253969078267,You should get a jigsaw puzzle app on the iPad...,happyhannahw,1545050943,17985144319191805,@brogantatexo I agree....there's nothing like ...,@brogantatexo I agree....there's nothing like ...,muchmore2explore,1545055543,4,respect,knowledge
1,yuckylavado-1760855587001880768.info,"Volendam, Netherlands is beautiful... and so a...",1524130365,17925863074113423,Enjoy🇳🇱🧀❣️My favourite country & city is Londo...,kinuko.0901,1524131001,17939869570051845,@kinuko.0901 I love London! One of my favorite...,@kinuko.0901 I love London! One of my favorite...,yuckylavado,1524132511,7,respect,fun
2,theshrinkingmomma-1918327642138974144.info,I’m not counting points today but that doesn’t...,1542902497,17928608815234893,The recalled packages had specific expiration ...,decicj,1542978005,17887230364303863,"@decicj well, with the Romain scare it seems l...","@decicj well, with the Romain scare it seems l...",theshrinkingmomma,1543530196,2,social_support,conflict
3,alexandralee1016-1779172089367121793.info,Please VOTE for me!!! 🗳 This updo 👆🏼placed me ...,1526313862,17944507045020289,I used your hair design on my page and my clie...,lauramakowska,1526389139,17937195760078503,@lauramakowska that is so awesome Laura! Thank...,@lauramakowska that is so awesome Laura! Thank...,alexandralee1016,1526389736,4,respect,romance
4,iloveandreea-1948890621332093120.info,Talking about favourite bronzers ... ❤️ Mine h...,1546545888,17989606480146922,Have you tried Laura Mercier Matte Radiance Ba...,dee.mahar,1546896546,17988652918152685,@dee.mahar Thank you so much for your recommen...,@dee.mahar Thank you so much for your recommen...,iloveandreea,1546896980,2,respect,knowledge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10051,freefromfarm-1817672650483650562.info,One of the positive things allergies has broug...,1530903487,17842155424304847,I enjoyed you talk about travelling with aller...,myallergykitchen,1530944614,17886329536228502,"@myallergykitchen Ha ha, if there's one thing ...","@myallergykitchen Ha ha, if there's one thing ...",freefromfarm,1530951070,4,social_support,social_support
10052,gemmaruthbrown-1888004040837493311.info,I’ve been spending a lot of time on the floor ...,1539287642,17870343604280490,I love this. I often ask myself what story I’m...,bethanyjoydawson,1539295311,17973077107104267,@bethanyjoydawson so glad it came at a good ti...,@bethanyjoydawson so glad it came at a good ti...,gemmaruthbrown,1539333888,4,romance,romance
10053,the.blonde.blog-1723096018550960195.info,🌱 Happy Monday - the start of a new week and o...,1519629073,17915485426097414,"Happy Monday love! Mix of cleaning, life admin...",mini_and_mabel,1519638047,17898372823163311,@mini_and_mabel Hope you had a fab Monday! Oh ...,@mini_and_mabel Hope you had a fab Monday! Oh ...,the.blonde.blog,1519716402,4,fun,fun
10054,jasxcharlotte-1957207065818891755.info,I choose happy ❤️ it’s been just over a year n...,1547537285,18020474908056637,"Gosh, I didn't realise it was only just over a...",lucyludreams,1547555276,17931472921253971,@lucyludreams aw thank you!! Yeah crazy really...,@lucyludreams aw thank you!! Yeah crazy really...,jasxcharlotte,1547588890,4,social_support,power


In [47]:
youtube = pd.concat([pd.read_csv('extracted data/youtube_with_labelled_dimensions.csv'), pd.read_csv('extracted data/comment_reply_pairs_with_dimensions_pt2.csv'), pd.read_csv('extracted data/comment_reply_pairs_with_dimensions_pt3.csv')])

youtube

Unnamed: 0,video_id,comment,reply,comment_date,reply_date,video_title,category_id_x,channel_name,category_id_y,category,reply_dimension,comment_dimension
0,SNE7-rirdQw,Is Ott o better\nThanks again Auto better is a...,I have to go get back right away so I'm just g...,2018-02-09T10:34:27Z,2018-02-09T10:38:31Z,ARMS Global Testpunch Gameplay Part 5 (Nintend...,20,ZackScottGames,20,Gaming,social_support,conflict
1,SNE7-rirdQw,"Hey Zack! If you see this, then I commented ab...",Nathan Coe Sub 4 sub? Just reply saying done. ...,2017-05-29T17:25:00Z,2017-05-29T19:59:01Z,ARMS Global Testpunch Gameplay Part 5 (Nintend...,20,ZackScottGames,20,Gaming,respect,social_support
2,PrZ8-TorUXA,"I'm pretty sure the ""A3000(T)"" line means ""A30...","yeah im dumb, the 4000 and 4000T should be sep...",2023-08-30T05:41:22Z,2023-08-30T10:57:33Z,Amiga 4000 tower i wouldn't do that if i was ...,28,Chris Edwards Restoration,28,Science & Technology,knowledge,conflict
3,PrZ8-TorUXA,For all those folks who want to get into Amiga...,I've learned all my patience from Amiga. Other...,2023-08-29T17:50:33Z,2023-08-29T21:26:00Z,Amiga 4000 tower i wouldn't do that if i was ...,28,Chris Edwards Restoration,28,Science & Technology,conflict,respect
4,PrZ8-TorUXA,"AFAIR, ""maprom file FORCE"" should go to a diff...",not on warm boot only a cold boot it remains r...,2023-08-29T15:18:06Z,2023-08-29T18:09:00Z,Amiga 4000 tower i wouldn't do that if i was ...,28,Chris Edwards Restoration,28,Science & Technology,knowledge,knowledge
...,...,...,...,...,...,...,...,...,...,...,...,...
480,a0H8aY-6Bg4,Discrimination against LGBTQ+ individuals is w...,Yeah we should act better and not act like ass...,2023-06-23T21:26:38Z,2023-06-25T16:43:56Z,Gay Republicans,24,Art Bezrukavenko,24,Entertainment,conflict,conflict
481,a0H8aY-6Bg4,Don't ask people to accept your beliefs if you...,But it's insane to vote for people who are out...,2023-06-23T20:08:24Z,2023-06-23T21:15:30Z,Gay Republicans,24,Art Bezrukavenko,24,Entertainment,conflict,knowledge
482,a0H8aY-6Bg4,"Well, that's sad that he of all people would j...",He is only saying what he thought was right to...,2023-06-23T19:31:53Z,2023-06-24T00:21:05Z,Gay Republicans,24,Art Bezrukavenko,24,Entertainment,conflict,conflict
483,a0H8aY-6Bg4,I think there needs to be more centrists in th...,I’m trying to see the negative in being far le...,2023-06-23T19:27:46Z,2023-06-23T21:00:17Z,Gay Republicans,24,Art Bezrukavenko,24,Entertainment,conflict,conflict


In [48]:
import pandas as pd
import re

reddit = pd.read_csv('reddit_sample_20k.csv')

import re
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # Ensures consistent language detection

import re
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # For consistent results

def clean_reddit_comments(df, author_col='author', comment_col='comment', subreddit_col='subreddit',
                          min_words=20, max_words=200, max_linebreaks=2):
    import re
    from langdetect import detect, DetectorFactory
    DetectorFactory.seed = 0

    # 1. Bot detection
    bot_author_pattern = r'(bot|automoderator)'
    bot_comment_pattern = r'(i am a bot|beep boop|performed automatically)'

    # 2. Sexual content filter
    sexual_terms = [
        r'\bsex\b', r'\bsexy\b', r'\bporn\b', r'\bnude\b', r'\bnaked\b', r'\bfuck\b',
        r'\bboobs?\b', r'\btits?\b', r'\bass\b', r'\bcock\b', r'\bdick\b', r'\bpussy\b',
        r'\bblowjob\b', r'\bhandjob\b', r'\bcum\b', r'\borgasm\b', r'\banal\b', r'\bxxx\b'
    ]
    sexual_pattern = '|'.join(sexual_terms)

    # 3. Spam patterns
    spam_patterns = [
        r'\b(add me|send gift|please follow|join us)\b',
        r'\b\d{6,}\b',  # long digit sequences
        r'[A-Za-z]+\d{2,}',  # usernames
    ]
    spam_pattern = '|'.join(spam_patterns)

    def looks_like_spam(text):
        letters = len(re.findall(r'[a-zA-Z]', text))
        digits = len(re.findall(r'\d', text))
        return digits > letters * 2 or re.search(spam_pattern, text, re.IGNORECASE)

    # 4. Meta/moderation comments
    def is_meta_comment(text, subreddit_name):
        meta_keywords = [
            r'community guidelines', r'read the rules', r'moderator', r'why is this in',
            r'rules', r'removed', r'violates', r'deleted'
        ]
        meta_pattern = '|'.join(meta_keywords)
        return (
            re.search(meta_pattern, text, re.IGNORECASE)
            or (subreddit_name and re.search(rf'/r/{re.escape(subreddit_name)}', text, re.IGNORECASE))
        )

    # 5. Link detection
    link_pattern = r'(https?://\S+|www\.\S+|\[.*?\]\(https?://.*?\)|\b\S+\.(com|org|net|io|co)\b)'

    # 6. Language detection
    def is_english(text):
        try:
            return detect(text) == 'en'
        except:
            return False

    # Apply checks
    df['is_english'] = df[comment_col].apply(is_english)
    df['is_spam'] = df[comment_col].apply(looks_like_spam)
    df['is_meta'] = df.apply(lambda x: is_meta_comment(x[comment_col], x[subreddit_col]), axis=1)
    df['has_link'] = df[comment_col].str.contains(link_pattern, case=False, na=False)
    df['linebreaks'] = df[comment_col].str.count(r'\n')

    # Word count
    word_counts = df[comment_col].str.split().str.len()

    # Masks
    mask_bots = (
        df[author_col].str.contains(bot_author_pattern, case=False, na=False) |
        df[comment_col].str.contains(bot_comment_pattern, case=False, na=False)
    )
    mask_sexual = df[comment_col].str.contains(sexual_pattern, case=False, na=False)
    mask_spam = df['is_spam']
    mask_meta = df['is_meta']
    mask_links = df['has_link']
    mask_linebreaks = df['linebreaks'] > max_linebreaks
    mask_short = word_counts < min_words
    mask_long = word_counts > max_words
    mask_non_english = ~df['is_english']

    # Combine all
    filtered_df = df[~(mask_bots | mask_sexual | mask_spam | mask_meta | mask_links |
                       mask_linebreaks | mask_short | mask_long | mask_non_english)].reset_index(drop=True)

    return filtered_df.drop(columns=['is_english', 'is_spam', 'is_meta', 'has_link', 'linebreaks'])


reddit = clean_reddit_comments(reddit, author_col='author', comment_col='body', min_words=20, max_words=40)
reddit


  df['has_link'] = df[comment_col].str.contains(link_pattern, case=False, na=False)
  df[author_col].str.contains(bot_author_pattern, case=False, na=False) |
  df[comment_col].str.contains(bot_comment_pattern, case=False, na=False)


Unnamed: 0,author,subreddit,id,hash,body,parent_id,link_id
0,gujsehambi,movies,k2r818x,-8262967982834420000,Dark Crystal: Age of Resistance tv show. It wa...,t3_16v8doo,t3_16v8doo
1,pengie9290,danganronpa,hoic4r5,8558485645609105000,"Yes. And in my case, my partner wouldn't even ...",t3_rg3qop,t3_rg3qop
2,davine05,playblackdesert,ghrqpb6,5785165072527266000,"I been having the same problem, think i’m goin...",t3_koht8a,t3_koht8a
3,VDKay,seduction,i4jd0kc,-2938326051424433000,Yep!\n\nAlso helps if you are regular to some ...,t3_u29nn9,t3_u29nn9
4,alexis_mightytravels,awardtravel,j6hlfko,1205449271875501000,Have you looked into booking flights with Sing...,t3_10nvkug,t3_10nvkug
...,...,...,...,...,...,...,...
4602,Un_Lucky,EscapefromTarkov,eq22ose,-5868884038573806000,I want my mil-sim crazy friend who clearly spe...,t3_bwvxxr,t3_bwvxxr
4603,grynch43,Metallica,j2xx0k1,8953608239843179000,Creeping Death-because it was the first one I ...,t3_1035r33,t3_1035r33
4604,GrantUsEyez,pumparum,d2jqrfq,4334055555251146000,I'll help cause I'm bored and would like some ...,t3_4gq8bj,t3_4gq8bj
4605,HurricaneLogic,MultipleSclerosis,hxkmain,-6336475575526161000,I had to go completely gluten free. If I have ...,t3_sso67e,t3_sso67e


In [16]:
reddit.to_csv('cleaned_reddit_sample.csv')

In [None]:
reddit.sample(100)

In [49]:
insta_sample = instagram.sample(2000)
instagram.columns
insta_sample = insta_sample[['reply_id', 'reply_text',  'reply_username']]
insta_sample.columns = ['id', 'text',  'author']
insta_sample['dataset'] = 'instagram'
insta_sample


Unnamed: 0,id,text,author,dataset
6024,17947762111223265,@mummytries oooh yes forgot you were going awa...,arthurwears,instagram
4517,17955211612047032,@shonawhittington many thanks for your advice!...,rachelsshoppe.co,instagram
9572,17940972940180666,@runnerrhea run early is huge! Strangely so is...,run_rhandi_run,instagram
2107,17910196465083673,@t.beaston i just started watching your clean ...,leblanc_mama,instagram
2831,17866915447170221,@zebrastripes8546 sometime they offer the 3.99...,frogtornado,instagram
...,...,...,...,...
1931,17980486468057831,@makeuncomfortablecomfortable please feel fre...,hannahjayne_1990,instagram
6708,17887359343019299,@thetezzyfiles I don't know why I thought you ...,ishitaunblogged,instagram
7372,17876284288233761,"@katherinecenter oh my gosh, so sweet of you t...",emilynwilkinson,instagram
2771,17940603640227456,@jonelleyoga I should be yes and then going to...,jennyeclark,instagram


In [50]:
youtube.columns
youtube = youtube[['video_id', 'reply', 'reply_date', 'category']]
youtube.columns = ['video_id', 'text', 'reply_date', 'category']
youtube['dataset'] = 'youtube'
youtube['id'] = [f"id_{i}" for i in range(len(youtube))]
youtube_sample = youtube.sample(2000)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  youtube['dataset'] = 'youtube'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  youtube['id'] = [f"id_{i}" for i in range(len(youtube))]


In [34]:
youtube_sample

Unnamed: 0,video_id,text,reply_date,category,dataset,id
343,JgncST-h55U,It is not PERSONAL information in the PUBLIC s...,2024-07-18T16:07:39Z,People & Blogs,youtube,id_343
708,xg0F-bWLmMk,Thank you so much! You can refer to this tikto...,2021-02-24T04:06:02Z,Howto & Style,youtube,id_1619
73,H1Kb-ODQ3lw,Do you also hold right when you are about to g...,2021-10-27T16:29:39Z,Gaming,youtube,id_984
657,7c-3YU-3u8M,"ts bs ik , I am very skinny (not underweight) ...",2024-07-20T18:41:03Z,People & Blogs,youtube,id_657
464,nHdx-ZqAotE,I'm sooo sorry but realrosa must be crying rn ...,2025-06-09T09:40:45Z,Gaming,youtube,id_2221
...,...,...,...,...,...,...
152,aqz-KE-bpKQ,"It's insane how far Blender's come, now it's e...",2021-11-07T21:01:47Z,Film & Animation,youtube,id_1909
270,Fk5F-u4r-Yk,"lol You are the one who doesn't know history, ...",2024-03-03T09:45:04Z,People & Blogs,youtube,id_2027
483,SQxu_-l_p2c,I agree 100%. People would cross oceans and co...,2024-01-01T19:10:09Z,Entertainment,youtube,id_1394
320,Fg7U-BhiZGE,Same here sober from alcohol and partying. Sin...,2025-05-22T12:14:34Z,People & Blogs,youtube,id_1231


In [51]:
reddit.columns
reddit = reddit[['author', 'subreddit', 'id',  'body']]
reddit.columns = ['author', 'subreddit', 'id',  'text']
reddit['dataset'] = 'reddit'
reddit_sample = reddit.sample(2000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit['dataset'] = 'reddit'


In [52]:
pd.concat([reddit_sample, youtube_sample, insta_sample]).to_csv('large_sample_annotations.csv')