In [33]:
import os
import sys

PARENT_PATH = os.path.abspath(os.path.dirname(os.getcwd()))
sys.path.append(PARENT_PATH)

from utils import flat_subreddits, subreddits

from tqdm import tqdm
import pandas as pd
import re



In [34]:
posts = pd.read_csv(f"{PARENT_PATH}/data/processed/all_posts.csv")

In [35]:
keywords = [
    "generative ai", "aigc", "chatgpt", "gpt", "openai", "bard", "llm", "large language model",
    "midjourney", "diffusion model", "stability ai", "ai", "artificial intelligence",
    "artificial intelligence generated content", "dalle 2", "chatgpt", "chat gpt", "bing chat",
    "bingchat", "perplexity ai", "perplexityai", "perplexity.ai", "perplexityask", "perplexity ask",
    "dall-e-2", "dall-e 2", "dall-e2", "dall·e 2", "dall·e2", "dalle2", "dalle2", "(dalle 2)",
    "dall-e", "dall·e", "(dalle)", "stable diffusion", "stablediffusion", "midjourney", "(mid journey)",
    "imagen", "crayon", "dall-e-mini", "dall-e mini", "dall·e mini", "dall·e-mini", "dalle-mini",
    "dallemini", "dalle mini", "dreamstudio", "copilot", "co-pilot", "gpt-4", "gpt4", "gpt 4",
    "gpt-3.5", "gpt3.5", "gpt 3.5", "gpt-3", "gpt3", "gpt 3", "gpt-2", "gpt2", "(gpt 2)", "gemini"
]


In [36]:
posts["all_text"] = posts["title"].str.lower() + " " + posts["selftext"].str.lower()

In [None]:
# Add a column indicating if post contains any AI-related keyword
# Compile regex pattern once for better performance
pattern = re.compile(' | '.join(map(lambda x: f' {x} ', keywords)), re.IGNORECASE)

# Create ai_related column using vectorized operations
posts['ai_related'] = posts['all_text'].fillna('').apply(
    lambda text: any(f' {kw} ' in f' {text.lower()} ' for kw in keywords)
)

# Print count of AI-related posts 
n_ai_posts = posts['ai_related'].sum()
print(f"Number of AI-related posts: {n_ai_posts}")

Number of AI-related posts: 179


In [38]:
# Count number of unique subreddits with AI-related posts before/after ChatGPT
chatgpt_date = pd.to_datetime('2022-11-30')
posts['created_utc'] = pd.to_datetime(posts['created_utc'])

# Before ChatGPT
before_gpt = posts[
    (posts['ai_related'] == True) & 
    (posts['created_utc'] < chatgpt_date)
]
before_subreddits = before_gpt['subreddit'].nunique()
print(f"Number of subreddits with AI-related posts before ChatGPT: {before_subreddits}")

# After ChatGPT 
after_gpt = posts[
    (posts['ai_related'] == True) & 
    (posts['created_utc'] >= chatgpt_date)
]
after_subreddits = after_gpt['subreddit'].nunique()
print(f"Number of subreddits with AI-related posts after ChatGPT: {after_subreddits}")

# Show distribution before ChatGPT
before_counts = before_gpt['subreddit'].value_counts()
print("\nDistribution of AI-related posts across subreddits before ChatGPT:")
print(before_counts)

# Show distribution after ChatGPT
after_counts = after_gpt['subreddit'].value_counts()
print("\nDistribution of AI-related posts across subreddits after ChatGPT:")
print(after_counts)

Number of subreddits with AI-related posts before ChatGPT: 4
Number of subreddits with AI-related posts after ChatGPT: 5

Distribution of AI-related posts across subreddits before ChatGPT:
subreddit
AskALiberal         16
centrist             8
AskConservatives     7
SocialDemocracy      6
Name: count, dtype: int64

Distribution of AI-related posts across subreddits after ChatGPT:
subreddit
AskALiberal         55
AskConservatives    46
centrist            28
SocialDemocracy     12
conservatives        1
Name: count, dtype: int64
