In [None]:
!pip install praw

In [None]:
!pip install google-api-python-client

In [None]:
!pip install detoxify

In [None]:
from detoxify import Detoxify
tox_detector = Detoxify('unbiased-small')

In [None]:
import praw

reddit = praw.Reddit(
    client_id=...,
    client_secret=...,
    user_agent=...,
    username=...,
    password=...,
)

In [None]:
import json
def append_json_to_file(data, file_path):
    with open(file_path, 'a') as file:  # Open the file in append mode
        json.dump(data, file)           # Write the JSON data
        file.write('\n')                # Ensure a newline character after each JSON object

data_path = 'data.jsonl'

import re

def contains_no_link(text):
    """
    Returns True if there is no link in the string, False otherwise.
    
    Args:
    text (str): The string to check for links.
    
    Returns:
    bool: True if no links are found, False otherwise.
    """
    # Regular expression to identify most common forms of URLs
    url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+|www\.(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
    
    # Search for a URL in the text
    match = re.search(url_pattern, text)
    
    # Return False if a URL is found, True otherwise
    return match is None

import requests

def get_json_from_url(url):
    try:
        response = requests.get(url)  # Send a HTTP GET request to the URL
        response.raise_for_status()   # Raise an exception for HTTP errors
        data = response.json()        # Convert the JSON data to a Python dictionary
        return data
    except requests.RequestException as e:
        print(f"HTTP Request failed: {e}")
    except ValueError:
        print("Invalid JSON")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def tox_detect(text):
    return tox_detector.predict(text)['toxicity']
        
def process_comments(comment, depth=0, max_depth=3, top_k=5, tox_tolerance=0.3, char_limit=300):
    
    """ Recursively process comments and their replies into a nested dictionary up to a certain depth and limit to top k comments. """
    if depth >= max_depth or isinstance(comment, praw.models.MoreComments):
        return None # Skip processing if it's a MoreComments object or maximum depth is exceeded
    
    comment_data = {
        'id': comment.id,
        'author': str(comment.author),  # Handle deleted users safely
        'body': comment.body,
        'score': comment.score,
        'created_utc': comment.created_utc,
        'replies': []
    }
    
    # Process replies if within depth limit, and only take the top k replies
#     if depth < max_depth:
    replies = [reply for reply in comment.replies if not isinstance(reply, praw.models.MoreComments) and '[deleted]' not in reply.body]
    if char_limit:
        replies = [reply for reply in replies if len(reply.body) <= char_limit]
    replies = [reply for reply in replies if contains_no_link(reply.body)]

    # cutoff
    replies = sorted(replies, key=lambda r: r.score, reverse=True)[:int(np.floor(top_k * 2))]
    replies = [reply for reply in replies if tox_detect(reply.body) < tox_tolerance] # sum filtering
    replies = replies[:top_k]

    for reply in replies:
        processed_reply = process_comments(reply, depth + 1, max_depth, top_k)
        if processed_reply:
            comment_data['replies'].append(processed_reply)
            
#     print(comment_data)
    
    return comment_data


def get_comments_structure(submission, max_depth=3, top_k=5, tox_tolerance=0.3, char_limit=300):
    """ Takes a PRAW submission object and returns a structured data of comments that have more than 10 upvotes, limited by depth. """
#     submission.comments.replace_more(limit=None)  # Ensures all top-level comments are loaded

    # Filter out MoreComments and accept comments with more than 10 upvotes
    comments = [comment for comment in submission.comments
                          if not isinstance(comment, praw.models.MoreComments) and comment.score > 10 and '[deleted]' not in comment.body and and '[removed]' not in comment.body]
    
    if char_limit:
        replies = [reply for reply in comments if len(reply.body) <= char_limit]
    replies = [reply for reply in replies if contains_no_link(reply.body) and '[deleted]' not in reply.body and '[removed]' not in reply.body]
    replies = sorted(replies, key=lambda r: r.score, reverse=True)[:int(np.floor(top_k * 1.5))]
    replies = [reply for reply in replies if tox_detect(reply.body) < tox_tolerance] # sum filtering
    replies = replies[:top_k]
    
#     print(replies)
    
    comments_structure = [process_comments(comment, depth=0, max_depth=max_depth, top_k=top_k, tox_tolerance=tox_tolerance, char_limit=char_limit) for comment in replies]
#     print(comments_structure)
    return comments_structure

In [None]:
from tqdm.notebook import tqdm
import logging
import numpy as np

In [None]:
import numpy as np
import requests
from PIL import Image
from io import BytesIO

def load_image(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # Send an HTTP GET request to the URL with headers
        response = requests.get(url, headers=headers)
        # Check if the request was successful
        response.raise_for_status()

        # Open the image from the bytes of the response
        image = Image.open(BytesIO(response.content))

        # Convert the image to RGB format to ensure compatibility
        image = image.convert('RGB')

        # Convert the image to a NumPy array
        numpy_array = np.array(image)

        return numpy_array
    except:
        return None

In [None]:
time_filter = 'all'
subreddit = reddit.subreddit('pics')
MAX_POSTS_PER_KEYWORD = 100
ids = set()

top_posts_list = [
#     subreddit.top(limit=1_000, time_filter=time_filter),
#     subreddit.controversial(limit=1_000, time_filter=time_filter)
]

morality_keywords = []

# basic morality
# morality_keywords += ['ethical', 'unethical', 'ethics', 'bioethics', 'moral', 'immoral', 'amoral', 'racism', 'racist', 'anti-racism', 'gender', 'transgender', 'classism', 'classist', 'upperclass', 'middleclass', 'lowerclass', 'ethical dilemma', 'moral dilemma', 'war', 'warfare', 'warrior', 'antiwar', 'justice', 'unjust', 'justice system', 'criminal justice', 'equality', 'inequality', 'equitable', 'inequitable', 'rights', 'human rights', 'civil rights', 'righteous', 'integrity', 'honest', 'dishonest', 'fairness', 'fair', 'unfair', 'responsibility', 'responsible', 'irresponsible', 'respect', 'disrespect', 'respectful', 'disrespectful', 'compassion', 'compassionate', 'freedom', 'freedom of speech', 'freedom of expression', 'enslavement', 'discrimination', 'discriminatory', 'anti-discrimination', 'privilege', 'privileged', 'underprivileged', 'prejudice', 'prejudiced']

# social politics
# morality_keywords += ['Malcolm X', 'Malcolm Little', 'El-Hajj Malik El-Shabazz', 'Black nationalism', 'civil rights leader', 'Pan-Africanism', 'Martin Luther King Jr', 'MLK', 'Dr. King', 'Civil Rights leader', 'nonviolent protest', 'Southern Christian Leadership Conference', 'Civil Rights Movement', '1960s civil rights', 'desegregation', 'voting rights', 'freedom rides', 'March on Washington', 'Black Lives Matter', 'BLM', 'police brutality', 'racial justice', 'All Lives Matter', 'systemic racism', 'Ku Klux Klan', 'KKK', 'white robes', 'cross burning', 'racial hatred', 'white nationalism', 'White supremacy', 'white supremacist', 'racial superiority', 'alt-right', 'neo-Nazi', 'race war', 'Feminism', 'feminist', 'women’s rights', 'gender studies', 'equal rights', 'women’s liberation', 'MeToo', '#MeToo', 'sexual harassment', 'women’s movement', 'Harvey Weinstein', 'consent', 'Gender equality', 'gender equity', 'women empowerment', 'equal pay', 'gender roles', 'gender discrimination', 'Social justice', 'activism', 'social equity', 'social reform', 'equality', 'justice', 'Activist', 'activism', 'protester', 'social change', 'community organizing', 'grassroots', 'Protest', 'protester', 'demonstration', 'rally', 'public protest', 'civil disobedience', 'Ethical hacking', 'white hat', 'security testing', 'penetration testing', 'cybersecurity', 'hacker ethics', 'Cyberethics', 'internet ethics', 'online morality', 'digital ethics', 'data privacy', 'internet safety', 'Corporate ethics', 'business ethics', 'professional ethics', 'corporate responsibility', 'ethical business practices', 'corporate governance', 'Environmental ethics', 'eco-friendly', 'sustainability', 'sustainable', 'conservation', 'green living', 'Political ethics', 'political correctness', 'politically incorrect', 'ethical politics', 'political integrity', 'political conduct', 'Cultural appropriation', 'cultural sensitivity', 'inclusive', 'inclusivity', 'cultural respect', 'cultural exchange', 'Ageism', 'ableism', 'neurodiversity', 'ageism', 'disability rights', 'inclusive practices', 'Animal rights', 'veganism', 'vegetarianism', 'animal cruelty', 'cruelty-free', 'animal welfare', 'AI ethics', 'robot rights', 'machine ethics', 'artificial intelligence', 'ethical AI', 'robotics ethics', 'Bioethics', 'medical ethics', 'genetic engineering', 'euthanasia', 'medical research ethics', 'clinical ethics', 'Philosophical ethics', 'utilitarianism', 'deontology', 'virtue ethics', 'moral philosophy', 'ethical theories', 'Theology', 'religious ethics', 'biblical ethics', 'Islamic ethics', 'Jewish ethics', 'moral theology', 'Colonialism', 'postcolonialism', 'imperialism', 'decolonization', 'anti-colonial', 'colonial history', 'Truth and reconciliation', 'apartheid', 'genocide', 'Holocaust', 'reconciliation processes', 'historical justice', 'Surveillance', 'privacy', 'data protection', 'cyber security', 'personal privacy', 'surveillance state', 'Whistleblower', 'Edward Snowden', 'Julian Assange', 'WikiLeaks', 'leaker', 'disclosure', 'Corporate governance', 'financial ethics', 'anti-corruption', 'transparency', 'ethical leadership', 'corporate accountability', 'Conflict resolution', 'peace studies', 'diplomacy', 'international relations', 'mediation', 'peacebuilding', 'Humanitarian aid', 'international law', 'war crimes', 'human rights violations', 'aid distribution', 'global justice', 'Climate change', 'global warming', 'environmental protection', 'climate action', 'carbon footprint', 'greenhouse gases', 'Social media ethics', 'digital divide', 'net neutrality', 'online behavior', 'internet access', 'digital rights', 'Intellectual property', 'copyright infringement', 'patent law', 'IP law', 'copyright law', 'trademark protection', 'Homelessness', 'social inequality', 'wealth gap', 'universal basic income', 'poverty', 'income disparity', 'Addiction', 'drug abuse', 'mental health', 'psychotherapy', 'substance abuse', 'behavioral health', 'Education inequality', 'educational reform', 'student rights', 'access to education', 'school funding', 'educational equity', 'Reproductive rights', 'abortion', 'contraception', 'planned parenthood', 'women’s health', 'birth control']

#  international politics
# morality_keywords += ['Tiananmen Square', '1989 protests', 'Chinese government', 'human rights in China', 'student protests', 'tank man', 'Beijing', 'June Fourth', 'Chinese censorship', 'Mao Zedong', 'censorship', 'internet censorship', 'media control', 'freedom of speech', 'government surveillance', 'press freedom', 'social media bans', 'political repression', 'information control', 'authoritarianism', 'Xi Jinping', 'Chinese President', 'Communist Party of China', 'Belt and Road', 'U.S.-China relations', 'Chinese policy', 'South China Sea', 'Xi’s policies', 'Great Firewall', 'China’s economy', 'Vladimir Putin', 'Russian President', 'Kremlin', 'Ukraine conflict', 'Russian opposition', 'KGB', 'Navalny', 'Crimea annexation', 'Syrian intervention', 'Putin’s regime', 'Donald Trump', 'US President', 'Trump administration', 'impeachment', 'MAGA', 'Trump rallies', 'Trump tweets', '2020 election', 'Capitol riot', 'Trump’s policies', 'car crash', 'road safety', 'drunk driving', 'traffic fatalities', 'car safety features', 'automotive industry', 'driver distraction', 'insurance claims', 'emergency services', 'vehicle laws', 'protest', 'demonstration', 'public dissent', 'social movement', 'civil unrest', 'political protest', 'activism', 'strike', 'rally', 'march', 'killed', 'fatalities', 'murder', 'homicide', 'casualties', 'death toll', 'violent crime', 'shooting', 'armed conflict', 'manslaughter', 'Edward Snowden', 'NSA', 'government surveillance', 'whistleblower', 'privacy rights', 'data leak', 'Asylum', 'U.S. intelligence', 'Snowden leaks', 'global surveillance', 'WikiLeaks', 'Julian Assange', 'document leaks', 'classified information', 'government secrets', 'Chelsea Manning', 'data transparency', 'encryption', 'political asylum', 'leaks', 'police brutality', 'George Floyd', 'BLM', 'law enforcement', 'racial profiling', 'police reform', 'body cameras', 'stop and frisk', 'police accountability', 'civil rights', 'Black Lives Matter', 'racial justice', 'civil rights movement', 'systemic racism', 'police reform', 'social justice', 'racial equality', 'demonstrations', 'protests', 'activism', 'Syrian Civil War', 'Assad regime', 'refugees', 'Middle East conflict', 'ISIS', 'Aleppo', 'chemical weapons', 'humanitarian crisis', 'Russia’s involvement', 'peace talks', 'immigration', 'border control', 'migrants', 'asylum seekers', 'immigration policy', 'DACA', 'detention centers', 'ICE', 'border wall', 'refugee crisis', 'border wall', 'US-Mexico border', 'immigration policy', 'border security', 'illegal immigration', 'customs enforcement', 'border patrol', 'migration', 'human trafficking', 'border fence', 'climate change', 'global warming', 'carbon emissions', 'greenhouse gases', 'Paris Agreement', 'climate policy', 'renewable energy', 'sea level rise', 'climate activism', 'COP26', 'gun control', 'Second Amendment', 'NRA', 'mass shootings', 'firearm laws', 'background checks', 'gun violence', 'school shootings', 'assault weapons', 'gun rights', 'school shootings', 'Columbine', 'Sandy Hook', 'Parkland', 'gun control', 'student safety', 'mental health', 'active shooter', 'emergency response', 'gun laws', 'Parkland shooting', 'March for Our Lives', 'gun control', 'Nikolas Cruz', 'student activism', 'school safety', 'firearm regulation', 'mental health issues', 'national walkout', 'gun debate', 'Brexit', 'EU', 'UK politics', 'Article 50', 'EU referendum', 'trade agreements', 'immigration policy', 'British economy', 'European Union', 'No-deal Brexit', 'European Union', 'EU', 'Brexit', 'Schengen Area', 'Eurozone', 'EU policies', 'European Commission', 'EU Parliament', 'European Court of Justice', 'EU member states', 'NATO', 'North Atlantic Treaty', 'military alliance', 'Article 5', 'transatlantic relations', 'defense spending', 'NATO summits', 'Eastern Europe', 'NATO expansion', 'Russia-NATO relations', 'Ukraine invasion', 'Russia-Ukraine war', 'Crimea', 'Donbas', 'Eastern Ukraine', 'sanctions', 'NATO response', 'military conflict', 'peace negotiations', 'Minsk agreements', 'North Korea', 'Kim Jong Un', 'nuclear program', 'missile tests', 'DMZ', 'Korean Peninsula', 'Pyongyang', 'South Korea relations', 'human rights abuses', 'Korean War', 'Kim Jong Un', 'North Korea', 'Supreme Leader', 'nuclear weapons', 'hermit kingdom', 'Pyongyang', 'North Korean propaganda', 'human rights', 'Korean Peninsula', 'inter-Korean relations', 'Israel-Palestine conflict', 'Gaza', 'West Bank', 'Hamas', 'Israeli settlements', 'Two-State Solution', 'Jerusalem', 'Intifada', 'Middle East peace', 'security barrier', 'Hamas', 'Gaza Strip', 'Palestinian Authority', 'terrorist group', 'Israeli conflict', 'rocket attacks', 'ceasefire agreements', 'political Islam', 'Middle East politics', 'Palestinian nationalism', 'Benjamin Netanyahu', 'Israeli Prime Minister', 'Likud party', 'Israel politics', 'Gaza conflict', 'peace process', 'corruption charges', 'Israeli settlements', 'Middle East diplomacy', 'security policy', 'settlements', 'Israeli settlements', 'West Bank', 'international law', 'land disputes', 'Palestinian territories', 'peace talks', 'Two-State Solution', 'UN resolutions', 'occupation', 'West Bank', 'Israeli occupation', 'Palestinian territories', 'settlement expansion', 'security issues', 'Fatah', 'Palestinian statehood', 'Middle East conflict', 'checkpoint', 'Area C', 'Gaza Strip', 'Hamas', 'Israeli blockade', 'rocket fire', 'border clashes', 'humanitarian crisis', 'Great March of Return', 'UNRWA', 'economic blockade', 'tunnel smuggling', 'Saudi Arabia', 'Mohammed bin Salman', 'Khashoggi', 'Yemen war', 'oil economy', 'Middle East politics', 'Wahhabism', 'Vision 2030', 'human rights', 'Saudi-US relations', 'Mohammed bin Salman', 'Saudi crown prince', 'Vision 2030', 'Saudi reforms', 'Khashoggi case', 'oil policy', 'Saudi royal family', 'NEOM', 'G20 summit', 'Middle East leadership', 'Yemen conflict', 'Houthi rebels', 'Saudi-led coalition', 'civil war', 'humanitarian crisis', 'missile strikes', 'peace talks', 'Aden', 'Al Qaeda in the Arabian Peninsula', 'Iran-Saudi rivalry', 'Khashoggi', 'Jamal Khashoggi', 'Saudi consulate', 'journalist murder', 'international outrage', 'press freedom', 'UN investigation', 'human rights violations', 'global response', 'media coverage', 'Amazon rainforest', 'deforestation', 'Amazon basin', 'rainforest conservation', 'indigenous rights', 'climate impact', 'logging', 'biodiversity loss', 'Brazil', 'environmental policies', 'deforestation', 'rainforest destruction', 'logging industry', 'climate change', 'biodiversity', 'conservation efforts', 'tropical forests', 'carbon sink', 'illegal logging', 'forest management', 'Greta Thunberg', 'climate activism', 'Fridays for Future', 'youth movement', 'climate speeches', 'UN Climate Summit', 'environmental protest', 'global warming', 'media influence', 'eco-awareness', 'Extinction Rebellion', 'climate protests', 'environmental activism', 'civil disobedience', 'green policies', 'carbon neutrality', 'biodiversity', 'climate emergency', 'direct action', 'public awareness', 'Paris Agreement']

# social issues
# morality_keywords += ['court', 'legal', 'legislat', 'Paris Agreement', 'climate accord', 'CO2 emissions targets', 'global cooperation', 'environmental treaty', 'national commitments', 'climate finance', 'sustainable development', 'temperature goals', 'international climate policy', 'AI ethics', 'artificial intelligence', 'machine learning', 'robot rights', 'ethical AI', 'data bias', 'algorithmic transparency', 'AI surveillance', 'autonomous vehicles', 'tech regulation', 'facial recognition technology', 'privacy concerns', 'biometric data', 'surveillance tools', 'law enforcement', 'civil liberties', 'AI bias', 'public security', 'ethical issues', 'regulatory frameworks', 'surveillance state', 'government surveillance', 'privacy rights', 'data collection', 'national security', 'civil liberties', 'NSA', 'CCTV', 'electronic monitoring', 'privacy laws', 'data privacy', 'personal data', 'GDPR', 'data protection laws', 'cybersecurity', 'privacy policy', 'internet privacy', 'data breaches', 'user consent', 'digital footprint', 'gender equality', 'women’s rights', 'equal pay', 'gender discrimination', 'workplace equality', 'feminism', 'gender roles', 'gender bias', 'LGBTQ equality', 'gender policies', 'LGBTQ rights', 'same-sex marriage', 'trans rights', 'gender identity', 'non-discrimination', 'pride parade', 'gay rights', 'bisexual rights', 'queer communities', 'civil rights', 'same-sex marriage', 'marriage equality', 'civil unions', 'LGBTQ activism', 'legal rights', 'domestic partnerships', 'Supreme Court rulings', 'gay marriage', 'relationship recognition', 'civil rights', 'trans rights', 'transgender', 'gender identity', 'non-binary', 'gender confirmation surgery', 'bathroom bills', 'gender expression', 'legal recognition', 'gender marker', 'trans activism', 'Conversion therapy', 'LGBTQ therapy', 'ban conversion therapy', 'pseudoscience', 'mental health', 'human rights', 'ethical concerns', 'legal status', 'therapeutic practices', 'sexual orientation', 'abortion rights', 'Roe v. Wade', 'pro-choice', 'pro-life', 'reproductive health', 'women’s healthcare', 'Planned Parenthood', 'abortion laws', 'fetal rights', 'reproductive justice', 'opioid crisis', 'opioid epidemic', 'prescription drugs', 'drug addiction', 'overdose deaths', 'pain management', 'fentanyl', 'drug treatment', 'public health', 'Big Pharma', 'pharmaceutical industry', 'drug pricing', 'medication costs', 'healthcare ethics', 'clinical trials', 'FDA', 'prescription abuse', 'pharma lobbying', 'drug patents', 'Purdue Pharma', 'opioid litigation', 'Sackler family', 'bankruptcy', 'settlements', 'legal battles', 'opioid settlements', 'corporate responsibility', 'public backlash', 'pharmaceutical ethics', 'racial profiling', 'law enforcement', 'bias in policing', 'stop and frisk', 'civil rights', 'racial bias', 'minority communities', 'legal issues', 'police practices', 'discriminatory practices', 'stop and frisk', 'police tactic', 'Fourth Amendment', 'racial discrimination', 'New York City', 'police reform', 'public safety', 'legal challenges', 'community relations', 'law enforcement policy', 'affirmative action', 'college admissions', 'employment equality', 'diversity initiatives', 'reverse discrimination', 'Supreme Court', 'educational opportunities', 'race-based', 'social policies', 'equity in education', 'critical race theory', 'race studies', 'education curriculum', 'social justice', 'racial issues', 'academic theory', 'public debate', 'ideological conflict', 'education policy', 'diversity training', 'white privilege', 'social inequality', 'racial justice', 'systemic racism', 'white supremacy', 'educational disparity', 'cultural awareness', 'societal structures', 'racial dialogue', 'privilege awareness', 'Antifa', 'anti-fascist', 'political activism', 'street protests', 'far-left', 'political violence', 'anarchism', 'social movements', 'counter-protest', 'radical politics', 'Proud Boys', 'far-right', 'extremist group', 'political violence', 'alt-right', 'nationalism', 'street fights', 'hate group', 'white nationalism', 'right-wing extremism', 'political extremism', 'radical ideologies', 'extremist movements', 'far-right', 'far-left', 'political violence', 'ideological warfare', 'domestic terrorism', 'radicalization', 'extreme politics', 'Capitol riot', 'January 6', 'U.S. Capitol', 'election certification', 'Trump supporters', 'insurrection', 'domestic terrorism', 'political unrest', 'congressional response', 'national security', 'cryptocurrency', 'Bitcoin', 'blockchain', 'digital currency', 'crypto trading', 'Ethereum', 'financial technology', 'crypto market', 'decentralized finance', 'crypto regulation', 'Bitcoin', 'cryptocurrency', 'digital assets', 'blockchain technology', 'BTC', 'crypto investment', 'bitcoin mining', 'crypto wallets', 'bitcoin transactions', 'financial innovation', 'Elon Musk', 'Tesla', 'SpaceX', 'electric vehicles', 'entrepreneur', 'technology innovation', 'Starlink', 'Neuralink', 'clean energy', 'tech mogul', 'Tesla', 'electric cars', 'EVs', 'autonomous vehicles', 'Elon Musk', 'automotive industry', 'sustainable transportation', 'Model S', 'battery technology', 'auto innovation', 'SpaceX', 'space exploration', 'Elon Musk', 'rocket launches', 'Mars colonization', 'Falcon rockets', 'Starship', 'commercial spaceflight', 'space technology', 'space missions', 'deepfakes', 'artificial intelligence', 'video manipulation', 'AI ethics', 'misinformation', 'digital media', 'face swapping', 'synthetic media', 'content authenticity', 'media trust', 'misinformation', 'fake news', 'social media', 'disinformation', 'information warfare', 'media literacy', 'propaganda', 'fact-checking', 'online falsehoods', 'news credibility', 'fake news', 'disinformation', 'media bias', 'propaganda', 'social media', 'misleading information', 'news verification', 'clickbait', 'information disorder', 'media manipulation', 'social media regulation', 'online platforms', 'tech policy', 'internet governance', 'data privacy', 'content moderation', 'free speech online', 'digital policy', 'tech giants', 'platform responsibility', 'Facebook', 'social network', 'Mark Zuckerberg', 'data privacy', 'online ads', 'social media impact', 'content moderation', 'Facebook algorithm', 'user data', 'social media addiction', 'Twitter', 'social media platform', 'tweets', 'microblogging', 'online communication', 'hashtag activism', 'Twitter trends', 'social networking', 'tweetstorms', 'Twitter policy', 'universal basic income', 'UBI', 'social welfare', 'economic policy', 'income guarantee', 'poverty reduction', 'financial security', 'basic income trial', 'government benefits', 'social experiment', 'wealth tax', 'tax policy', 'economic inequality', 'progressive taxation', 'millionaire tax', 'capital gains tax', 'fiscal policy', 'wealth inequality', 'tax reform', 'estate tax', 'Bernie Sanders', 'Democratic socialism', '2020 presidential candidate', 'Medicare for All', 'political revolution', 'progressive politics', 'income inequality', 'campaign rallies', 'Green New Deal', 'Senate', 'Elizabeth Warren', 'consumer protection', 'wealth tax', 'Democratic senator', 'financial regulation', '2020 presidential candidate', 'progressive agenda', 'economic reform', 'banking oversight', 'political campaign', 'Jeff Bezos', 'Amazon', 'richest man', 'e-commerce', 'Blue Origin', 'Washington Post', 'tech entrepreneur', 'corporate power', 'antitrust concerns', 'wealth accumulation', 'homelessness', 'homeless population', 'shelter crisis', 'urban poverty', 'housing insecurity', 'social services', 'homeless shelters', 'street homelessness', 'homeless advocacy', 'housing policy', 'gentrification', 'urban redevelopment', 'displacement', 'neighborhood change', 'affordable housing', 'real estate development', 'community displacement', 'economic development', 'local businesses', 'urban planning', 'urban development', 'city planning', 'infrastructure', 'smart cities', 'urban growth', 'public spaces', 'zoning laws', 'urban regeneration', 'housing development', 'urban sustainability', 'housing crisis', 'affordable housing', 'rent control', 'real estate market', 'housing shortage', 'tenant rights', 'rental market', 'housing policy', 'home ownership', 'housing affordability', 'rent control', 'housing policy', 'tenant rights', 'landlord-tenant law', 'rent stabilization', 'rent increase', 'housing market', 'affordable rentals', 'lease agreements', 'housing regulations', 'mental health awareness', 'mental wellness', 'mental illness', 'mental health stigma', 'therapy', 'counseling', 'psychological support', 'mental health resources', 'depression', 'anxiety', 'suicide prevention', 'mental health', 'crisis intervention', 'suicide hotline', 'mental health support', 'awareness campaigns', 'suicidal thoughts', 'prevention programs', 'public health', 'mental wellness', 'therapy', 'counseling', 'psychotherapy', 'mental health care', 'therapeutic practices', 'cognitive behavioral therapy', 'mental health professionals', 'group therapy', 'counseling services', 'emotional healing', 'stigma', 'mental health stigma', 'social stigma', 'discrimination', 'awareness campaigns', 'public perception', 'mental illness misconceptions', 'cultural attitudes', 'stigma reduction', 'mental health education', 'World Mental Health Day', 'mental health awareness', 'global awareness', 'mental wellness', 'health campaigns', 'public education', 'mental health advocacy', 'mental health support', 'global health', 'mental health initiatives', 'animal rights', 'animal welfare', 'animal cruelty', 'veganism', 'animal protection', 'animal ethics', 'animal rescue', 'animal legislation', 'animal advocacy', 'animal treatment', 'factory farming', 'industrial agriculture', 'animal welfare', 'livestock conditions', 'animal rights', 'meat production', 'animal ethics', 'agricultural practices', 'humane treatment', 'food industry', 'veganism', 'plant-based diet', 'animal-free', 'ethical eating', 'vegan lifestyle', 'cruelty-free', 'environmental impact', 'health benefits', 'animal rights', 'vegan products', 'animal testing', 'vivisection', 'cruelty-free', 'laboratory animals', 'bioethics', 'cosmetic testing', 'animal welfare', 'research ethics', 'humane alternatives', 'scientific research', 'PETA', 'People for the Ethical Treatment of Animals', 'animal rights organization', 'anti-fur campaigns', 'animal activism', 'vegan advocacy', 'animal cruelty protests', 'ethical treatment', 'animal rescue', 'animal welfare laws', 'SeaWorld', 'marine life', 'animal captivity', 'orca shows', 'animal welfare', 'marine parks', 'animal ethics', 'Blackfish', 'public backlash', 'conservation efforts']

# affective orientations and descriptions
# morality_keywords += ['anguish', 'sympathy', 'empathy', 'heartbreak', 'outrage', 'disgust', 'delight', 'pride', 'shame', 'guilt', 'joy', 'sorrow', 'remorse', 'indignation', 'contentment', 'pleasure', 'grief', 'fear', 'terror', 'horror', 'excitement', 'thrill', 'euphoria', 'anxiety', 'panic', 'hope', 'despair', 'confidence', 'insecurity', 'jealousy', 'envy', 'admiration', 'contempt', 'resentment', 'bitterness', 'satisfaction', 'fulfillment', 'loneliness', 'nostalgia', 'melancholy', 'bliss', 'rage', 'fury', 'hatred', 'love', 'affection', 'attraction', 'repulsion', 'calmness', 'stress', 'relief', 'eagerness', 'apathy', 'curiosity', 'bewilderment', 'astonishment', 'humiliation', 'dignity', 'violation', 'honor', 'betrayal', 'trust', 'mistrust', 'skepticism', 'faith', 'doubt', 'optimism', 'pessimism', 'cynicism', 'solitude', 'rejection', 'acceptance', 'compassion', 'cruelty', 'kindness', 'malice', 'generosity', 'greed', 'patience', 'impatience', 'tolerance', 'intolerance', 'fascination', 'obsession', 'disinterest', 'anticipation', 'hesitation', 'willingness', 'unwillingness', 'freedom', 'constraint', 'liberation']
# morality_keywords += ['ignorance', 'enlightenment', 'bewilderment', 'cooperation', 'conflict', 'unity', 'division', 'inclusion', 'exclusion', 'synergy', 'friction', 'harmony', 'discord', 'resilience', 'fragility', 'strength', 'weakness', 'capability', 'incapability', 'dependence', 'independence', 'necessity', 'luxury', 'practicality', 'impracticality', 'reality', 'fantasy', 'truth', 'deception', 'honesty', 'dishonesty', 'sincerity', 'insincerity', 'loyalty', 'disloyalty', 'faithfulness', 'unfaithfulness', 'devotion', 'neglect', 'care', 'carelessness', 'responsibility', 'irresponsibility', 'accountability', 'unaccountability', 'moral', 'immoral', 'ethical', 'unethical', 'righteous', 'wicked', 'virtuous', 'nefarious', 'humble', 'arrogant', 'modest', 'pretentious', 'genuine', 'fake', 'real', 'unreal', 'authentic', 'counterfeit', 'original', 'imitation', 'permanent', 'temporary', 'persistent', 'ephemeral', 'lasting', 'fleeting', 'enduring', 'transient', 'infinite', 'finite', 'boundless', 'limited', 'unlimited', 'restricted', 'unconditional', 'conditional', 'absolute', 'relative', 'total', 'partial', 'complete']
# morality_keywords += ['unproductive', 'effective', 'ineffective', 'efficient', 'inefficient', 'competent', 'incompetent', 'capable', 'incapable', 'skilled', 'unskilled', 'qualified', 'unqualified', 'prepared', 'unprepared', 'ready', 'unready', 'prompt', 'delayed', 'timely', 'untimely', 'seasonable', 'unseasonable', 'expedient', 'inexpedient', 'suitable', 'unsuitable', 'appropriate', 'inappropriate', 'fitting']
# morality_keywords += ['misfitting', 'proper', 'improper', 'correct', 'incorrect', 'right', 'wrong', 'true', 'false', 'accurate', 'inaccurate', 'precise', 'imprecise', 'exact', 'inexact', 'detailed', 'vague', 'specific', 'nonspecific', 'particular', 'general', 'special', 'ordinary', 'unique', 'common', 'rare', 'normal', 'abnormal', 'typical']
# morality_keywords += ['atypical', 'conventional', 'unconventional', 'traditional', 'nontraditional', 'orthodox', 'unorthodox', 'regular', 'irregular', 'habitual', 'unhabitual', 'routine', 'nonroutine', 'customary', 'unusual', 'expected', 'unexpected', 'confinement', 'security', 'vulnerability', 'comfort', 'discomfort', 'easiness', 'difficulty', 'simplicity', 'complexity', 'clarity', 'confusion', 'awareness']
# morality_keywords += ['incomplete', 'pure', 'impure', 'clean', 'dirty', 'clear', 'obscure', 'visible', 'invisible', 'audible', 'inaudible', 'spoken', 'unspoken', 'said', 'unsaid', 'expressed', 'unexpressed', 'manifested', 'hidden', 'obvious', 'ambiguous', 'certain', 'uncertain', 'definite', 'indefinite', 'decisive', 'indecisive', 'resolute', 'wavering', 'firm', 'yielding', 'steady', 'unsteady', 'stable', 'unstable']
# morality_keywords += ['balanced', 'unbalanced', 'equitable', 'inequitable', 'just', 'unjust', 'fair', 'unfair', 'equal', 'unequal', 'biased', 'unbiased', 'partial', 'impartial', 'favorable', 'unfavorable', 'advantageous', 'disadvantageous', 'beneficial', 'harmful', 'helpful', 'detrimental', 'constructive', 'destructive', 'productive']


# morality_keywords = list(set(morality_keywords))


keywords = []
for keyword in morality_keywords:
    top_posts_list.append(subreddit.search(keyword, limit=None, sort='top',)) # time_filter=time_filter))
    top_posts_list.append(subreddit.search(keyword, limit=None, sort='controversial',)) # time_filter=time_filter))
    keywords.append(keyword + '-top')
    keywords.append(keyword + '-controversial')

packets_added = 0

logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

import time
start = time.time()
BREAK = False

for top_posts, id_ in zip(top_posts_list, keywords):
    
    counter = 0
    
    print(id_)

    for post in tqdm(top_posts):
        
        if time.time() - start >= 11 * 3600 + 55 * 60:
            BREAK = True
            break
        
        if post.id in ids or post.num_comments < 50: continue
            
        if load_image(post.url) is None: continue

        full_post_url = 'https://www.reddit.com' + post.permalink  # Create full URL from permalink

        ids.add(post.id)

        new_data = {
            'id': post.id,
            'post_url': full_post_url,
            'img_url': post.url,
            'title': post.title,
            'score': post.score,
            'n_comments': post.num_comments,
            'selftext': post.selftext,
            'comments': get_comments_structure(post, max_depth=3, top_k=3, tox_tolerance=0.3, char_limit=300)
        }
        packets_added += 1

        append_json_to_file(new_data, data_path)
        
        counter += 1
        
        if counter > MAX_POSTS_PER_KEYWORD: break
        
    logging.info(f'Running packets added for {id_}: {packets_added}')
    print(f'Running packets added for {id_}: {packets_added}')
    
    if BREAK:
        break

In [None]:
# import multiprocessing
# import os
# import json
# from tqdm import tqdm
# import logging

# def process_posts(top_posts, id_, temp_file_path):
#     with open(temp_file_path, 'a') as temp_file:
#         for post in tqdm(top_posts):
#             if post.id in ids or post.num_comments < 50:
#                 continue
#             full_post_url = f'https://www.reddit.com{post.permalink}'
#             ids.add(post.id)
#             new_data = {
#                 'id': post.id,
#                 'post_url': full_post_url,
#                 'img_url': post.url,
#                 'title': post.title,
#                 'score': post.score,
#                 'n_comments': post.num_comments,
#                 'selftext': post.selftext,
#                 'comments': get_comments_structure(post, max_depth=3, top_k=3, tox_tolerance=0.3, char_limit=300)
#             }
#             append_json_to_file(new_data, temp_file)

# def append_json_to_file(data, file):
#     json.dump(data, file)
#     file.write('\n')

# manager = multiprocessing.Manager()
# pool = multiprocessing.Pool()
# jobs = []

# temp_files = [f'temp_data_{i}.jsonl' for i in range(len(keywords))]

# for i, (top_posts, id_) in tqdm(enumerate(zip(top_posts_list, keywords))):
#     temp_file_path = temp_files[i]
#     job = pool.apply_async(process_posts, (top_posts, id_, temp_file_path))
#     jobs.append(job)

# pool.close()
# pool.join()

# # Combine all temporary files into the main data file
# with open(data_path, 'w') as main_file:
#     for temp_file in temp_files:
#         with open(temp_file, 'r') as f:
#             main_file.write(f.read())
#         main_file.write('\n')