# Social Media Analysis

### Developing your Code


### Doing a full dataset run of your code

## Install and Import Dependencies

Run this code only if you are running this notebook on Google Colab.

In [None]:
!pip install praw
!pip install peft
!pip install trl
!pip install bitsandbytes
!pip install accelerate

Run this code regardless of your environment.

In [1]:
from typing import Iterable
import matplotlib.pyplot as plt
import pandas as pd
# import googleapiclient.discovery
import praw
import praw.models
from getpass import getpass
import itertools
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedModel, PreTrainedTokenizer, AutoConfig
from peft import AutoPeftModelForCausalLM
from getpass import getpass
import os
import torch
import time
import gc
import json

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

  from .autonotebook import tqdm as notebook_tqdm


## Data Collection (MODIFY THIS)

### Connect to API Clients

In [2]:
def connect_to_reddit() -> "praw.Reddit":
    """Returns an API client for the Reddit API

    e.g. reddit_client = connect_to_reddit()

    Pass this to any function that has a parameter called reddit_client.

    The client ID and client secret are read from the REDDIT_CLIENT_ID and
    REDDIT_CLIENT_SECRET environment variables. If either one is unset, the
    user is prompted for it via prompt_unset_env().
    """
    # SECURITY: never paste real credentials (or account names) into this cell,
    # not even inside comments. A notebook is shared and committed as-is, so
    # anything written here leaks. Any secret that has ever been committed
    # should be revoked and regenerated.
    reddit = praw.Reddit(
        client_id=prompt_unset_env("REDDIT_CLIENT_ID", "Enter your Reddit client ID:"),
        client_secret=prompt_unset_env("REDDIT_CLIENT_SECRET", "Enter your Reddit client secret:"),
        user_agent="COMP 631",
        # username / password are intentionally not supplied here.
    )
    return reddit

def prompt_unset_env(env_var: str, prompt: str):
    """
    Return the value of the environment variable env_var.

    When the variable is not present in os.environ, the user is asked for it
    with the text prompt (input is not echoed), and the answer is stored in
    os.environ so later calls do not prompt again.

    You should not need to call this function directly.
    """
    existing = os.environ.get(env_var)
    if existing is not None:
        # Already set (possibly to an empty string): use it untouched.
        return existing

    os.environ[env_var] = getpass(prompt)
    return os.environ[env_var]


#### Get posts

In [3]:
def search_reddit(reddit_client: praw.Reddit, query: str, n_posts: int=25, subreddit: str="all", time_filter="day"):
    """
    Searches Reddit for query, like typing query into the search bar at the top of the site,
    and returns the matching posts as a list.

    Inputs:
    - reddit_client: The API client returned from connect_to_reddit()
    - query: The string to search the site for
    - n_posts: Number of posts to return. Defaults to 25
    - subreddit: The subreddit to search, without the /r/ prefix (e.g. to search /r/politics, should be "politics").
        Defaults to /r/all, which is a fake subreddit that displays all posts across the site.
    - time_filter: Limit your search to posts created within a certain time range.
        Can be one of: "all", "day", "hour", "month", "week", or "year" (default: "day")
    """
    target_subreddit = reddit_client.subreddit(subreddit)
    matches = target_subreddit.search(query=query, time_filter=time_filter, limit=n_posts)
    # Materialise the lazy listing so callers get a plain list.
    return [post for post in matches]

In [None]:
# def get_most_popular_reddit_posts(reddit_client: praw.Reddit, n_posts: int=100, subreddit: str="all", time_filter="year"):
#     """
#     Gets the most upvoted posts on Reddit.

#     Inputs:
#     - reddit_client: The API client returned from connect_to_reddit()
#     - n_posts: Number of posts to return. Defaults to 25
#     - subreddit: The subreddit to return posts from, without the /r/ prefix (e.g. to search /r/politics, should be "politics").
#         Defaults to /r/all, which is a fake subreddit that displays all posts across the site.
#     - time_filter: Limit to posts created within a certain time range.
#         Can be one of: "all", "day", "hour", "month", "week", or "year" (default: "day")
#     """
#     return list(reddit_client.subreddit(subreddit).top(limit=n_posts, time_filter=time_filter))

In [4]:
import praw
import datetime

def get_most_popular_reddit_posts(reddit_client, n_posts=100, subreddit="all", start_date=None, end_date=None):
    """
    Gets the most upvoted posts on Reddit within a custom date range.

    The subreddit's "top" listing is walked in the order the API returns it,
    and posts whose created_utc falls outside [start_date, end_date] are
    skipped. Iteration stops as soon as n_posts matching posts were found.

    Inputs:
    - reddit_client: The API client returned from connect_to_reddit()
    - n_posts: Maximum number of posts to return. Defaults to 100.
        0 (or a negative number) returns an empty list; None means no cap.
    - subreddit: The subreddit to return posts from, without the /r/ prefix.
    - start_date: The start date (datetime object), inclusive. None = no lower bound.
    - end_date: The end date (datetime object), inclusive. None = no upper bound.

    Both dates are converted with datetime.timestamp(), so a naive datetime is
    interpreted in the local timezone of the machine. Pass timezone-aware
    datetimes for results that do not depend on where the notebook runs.

    Returns:
    - A list of at most n_posts submissions.

    NOTE(review): the listing is requested with limit=None; Reddit listings are
    presumably capped on the server side, so fewer than n_posts posts may be
    returned for narrow date ranges - confirm against the PRAW documentation.
    """
    # The loop below appends before it checks the cap, so without this guard a
    # request for zero posts would still return one.
    if n_posts is not None and n_posts <= 0:
        return []

    start_timestamp = int(start_date.timestamp()) if start_date is not None else None
    end_timestamp = int(end_date.timestamp()) if end_date is not None else None

    posts = []
    # Iterate over the subreddit's top-ranked posts.
    for submission in reddit_client.subreddit(subreddit).top(limit=None):
        created = submission.created_utc
        # Compare against None explicitly: a bound at the Unix epoch is the
        # timestamp 0, which a plain truthiness test would silently ignore.
        if start_timestamp is not None and created < start_timestamp:
            continue
        if end_timestamp is not None and created > end_timestamp:
            continue
        posts.append(submission)
        if n_posts is not None and len(posts) >= n_posts:
            break

    return posts


#### Format Reddit data to Pandas

In [2]:
def create_reddit_posts_df(posts: "Iterable[praw.models.Submission]") -> pd.DataFrame:
    """
    Takes in an iterable (list, generator, set, etc) of Reddit posts and returns
    a single DataFrame with one row per post.

    Columns:
        id: Post ID (index)
        likes: Upvote / downvote count of post
        title: Title of post
        body: Text of post
        subreddit: Subreddit where post was posted
        author: Username that wrote post, as a plain string
            (missing when the post has no author)
        published_timestamp: Time when the post was published, taken from
            submission.created. NOTE(review): presumably Unix epoch seconds,
            since a later cell converts this column with unit="s" - confirm.

    An empty iterable yields an empty DataFrame that still has these columns.
    """
    # Passing the column list explicitly fixes the column order and keeps the
    # schema intact even when there are no posts at all.
    columns = ["id", "likes", "title", "body", "subreddit", "author", "published_timestamp"]

    records = []
    for submission in posts:
        author = submission.author
        records.append({
            "id": submission.id,
            "likes": submission.score,
            "title": submission.title,
            "body": submission.selftext,
            "subreddit": submission.subreddit.display_name,
            # Store the username text rather than the author object itself so
            # the frame holds only plain data.
            "author": None if author is None else str(author),
            "published_timestamp": submission.created,
        })

    return pd.DataFrame(records, columns=columns).set_index("id")

def create_reddit_comments_df(posts: "Iterable[praw.models.Submission]", comment_limit=10) -> pd.DataFrame:
    """
    Takes in an iterable (list, generator, set, etc) of Reddit posts and returns a DataFrame
    containing data about the comments, one row per comment.

    At most comment_limit comments are taken from each post, in the order in
    which iterating submission.comments yields them.

    Columns:
        id: Comment ID (index)
        post_id: ID of the post this comment is a reply to
        likes: Upvote / downvote count of comment
        author: Username that wrote comment, as a plain string
            (missing when the comment has no author)
        body: Text of comment
        order_returned: 0 if 1st comment retrieved from the post, 1 if 2nd comment, etc

    The subreddit is not stored here; it is available on the posts DataFrame
    and reaches the combined table through join_reddit_dfs().

    An empty iterable yields an empty DataFrame that still has these columns.
    """
    # Passing the column list explicitly fixes the column order and keeps the
    # schema intact even when no comment was collected.
    columns = ["id", "post_id", "likes", "author", "body", "order_returned"]

    records = []
    for submission in posts:
        # limit=0 is passed through to replace_more() exactly as before.
        # NOTE(review): presumably this discards the "load more comments"
        # placeholders without extra requests - confirm in the PRAW docs.
        submission.comments.replace_more(limit=0)
        for order, comment in enumerate(submission.comments):
            if order >= comment_limit:
                break
            author = comment.author
            records.append({
                "id": comment.id,
                "post_id": submission.id,
                "likes": comment.score,
                # Store the username text rather than the author object itself.
                "author": None if author is None else str(author),
                "body": comment.body,
                "order_returned": order,
            })

    return pd.DataFrame(records, columns=columns).set_index("id")

def join_reddit_dfs(reddit_posts_df, reddit_comments_df):
    """
    Attaches the data of the parent post to every comment row.

    The comments frame is the left side: its post_id column is matched against
    the index of the posts frame. Post columns whose names collide with a
    comment column get the suffix "_post".
    """
    combined = reddit_comments_df.join(
        reddit_posts_df,
        on="post_id",
        rsuffix="_post",
    )
    return combined


### Put it all together and collect the data (MODIFY THIS)

In [6]:
# Suppress warnings of category UserWarning so they do not clutter cell output.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [36]:
# Open the Reddit API client (credentials come from the environment or a prompt)
reddit_client = connect_to_reddit()

# Subreddits to sample, each paired with its member count written as text
# ("K" = thousands, "M" = millions).
# Earlier selection, kept for reference:
#     'Subreddit': ['cscareerquestions', 'jobs', 'resumes', 'forhire', 'recruitinghell', 'jobsearchhacks', 'careerguidance', 'careeradvice', 'antiwork'],
#     'Members': ['2.2 M', '1.8 M', '1.2 M', '412 K', '787 K', '210 K', '4.6 M', '640 K', '2.9 M']
sub_reddits = dict(
    Subreddit=['Salary', 'AskHR', 'WorkReform'],
    Members=['295 K', '1.8 M', '750 K'],
)

# One row per subreddit
df_sub_reddits = pd.DataFrame(data=sub_reddits)

# Function to convert members to numbers
def convert_members(members):
    """
    Converts a member count written as text into a float.

    Accepted forms:
    - a number followed by "K" (thousands) or "M" (millions), with or without
      a space before the suffix and in either letter case: "295 K", "1.8M", "412 k"
    - a plain number, optionally with thousands separators: "5000", "1,234"
    - surrounding whitespace is ignored

    Raises ValueError when the numeric part cannot be parsed.
    """
    multipliers = {"K": 1e3, "M": 1e6}

    # str() lets already-numeric input pass through; commas are only ever
    # thousands separators in this notation.
    text = str(members).strip().replace(",", "")

    suffix = text[-1:].upper()
    if suffix in multipliers:
        return float(text[:-1].strip()) * multipliers[suffix]
    return float(text)

# Replace the textual member counts with their numeric values
members_as_text = df_sub_reddits['Members']
df_sub_reddits['Members'] = members_as_text.map(convert_members)


In [37]:
from tqdm import tqdm

# Overall target: about 1000 posts, shared out between the subreddits in
# proportion to their member counts. astype(int) truncates every share, so the
# per-subreddit quotas can add up to slightly fewer than n_posts.
n_posts = 1000
df_sub_reddits["n_posts"] = ((df_sub_reddits['Members'] / df_sub_reddits['Members'].sum()) * n_posts).astype(int)

# Collection window. The bounds are timezone-aware (UTC) so that the epoch
# values compared against submission.created_utc do not depend on the local
# timezone of the machine that runs the notebook.
start_date = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)
end_date = datetime.datetime(2025, 2, 24, tzinfo=datetime.timezone.utc)

posts_combined = []
for idx, row in tqdm(df_sub_reddits.iterrows(), total=len(df_sub_reddits), desc="Fetching Reddit posts"):
    # extend() grows the list in place instead of rebuilding it on every pass
    posts_combined.extend(
        get_most_popular_reddit_posts(
            reddit_client,
            n_posts=row["n_posts"],
            subreddit=row["Subreddit"],
            start_date=start_date,
            end_date=end_date,
        )
    )

# A bare len(...) in the middle of a cell displays nothing, so print the count
print(f"Collected {len(posts_combined)} posts")

# END MODIFY THIS PART OF THE CODE
reddit_posts_df = create_reddit_posts_df(posts_combined)
# Up to 20 comments are kept per post
reddit_comments_df = create_reddit_comments_df(posts_combined, comment_limit=20) # do we want to fetch more comments?
reddit_df = join_reddit_dfs(reddit_posts_df, reddit_comments_df)


Fetching Reddit posts: 100%|██████████| 3/3 [00:27<00:00,  9.06s/it]


## Outcome


In [38]:
# Show the joined table: one row per comment, followed by the columns of its parent post
reddit_df

Unnamed: 0_level_0,post_id,likes,author,body,order_returned,likes_post,title,body_post,subreddit,author_post,published_timestamp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
lz35t2m,1h0ej7f,1444,logicflow123,What a dream,0,46057,Radiologist. I work 17-18 weeks a year.,Hi everyone I'm 3 years out from training. 34 ...,Salary,Radiant_Hovercraft93,1.732635e+09
lz36ut5,1h0ej7f,1097,Improvcommodore,I have two immediate family members who are bo...,1,46057,Radiologist. I work 17-18 weeks a year.,Hi everyone I'm 3 years out from training. 34 ...,Salary,Radiant_Hovercraft93,1.732635e+09
lz37me6,1h0ej7f,101,bigtome2120,How many RVUs annually?,2,46057,Radiologist. I work 17-18 weeks a year.,Hi everyone I'm 3 years out from training. 34 ...,Salary,Radiant_Hovercraft93,1.732635e+09
lz3eynz,1h0ej7f,101,Independent-Pie3588,"Dude how do you do it. I’m rads too, did night...",3,46057,Radiologist. I work 17-18 weeks a year.,Hi everyone I'm 3 years out from training. 34 ...,Salary,Radiant_Hovercraft93,1.732635e+09
lz3ej31,1h0ej7f,197,seajayacas,My impression is that the ability to be a top ...,4,46057,Radiologist. I work 17-18 weeks a year.,Hi everyone I'm 3 years out from training. 34 ...,Salary,Radiant_Hovercraft93,1.732635e+09
...,...,...,...,...,...,...,...,...,...,...,...
l0e9hes,1c81wcm,7,,Considering right wing politics are defined by...,15,11899,It's Not About Left Or Right; It's The Have-No...,,WorkReform,zzill6,1.713546e+09
l0c8wh1,1c81wcm,11,alficles,"And as a bonus, the slogan doubles as a Grindr...",16,11899,It's Not About Left Or Right; It's The Have-No...,,WorkReform,zzill6,1.713546e+09
l0dlerc,1c81wcm,9,eliteharvest15,the left is literally the side that supports w...,17,11899,It's Not About Left Or Right; It's The Have-No...,,WorkReform,zzill6,1.713546e+09
l0dl0gv,1c81wcm,4,Tookoofox,"Ah, yes, the ever popular strategy of bellowin...",18,11899,It's Not About Left Or Right; It's The Have-No...,,WorkReform,zzill6,1.713546e+09


In [40]:
# reddit_df is indexed by comment id (create_reddit_comments_df sets "id" as
# the index and the join keeps it). Writing with index=False would drop that
# key from the file, so the index is written out as an "id" column.
# NOTE(review): the execution counts suggest this cell was last run after the
# cell that converts published_timestamp to datetimes; on a top-to-bottom run
# the file will contain the raw numeric timestamps instead - confirm which
# representation is wanted.
reddit_df.to_csv("reddit_data_2024_S.csv", index=True)
# reddit_df.to_csv("/content/reddit_data.csv", index=False)

In [29]:
# Number of posts collected across all subreddits
len(posts_combined)

1948

In [39]:
# Interpret the stored numbers as seconds and turn them into pandas datetimes,
# overwriting the column in place, then display the converted column.
published_seconds = reddit_df["published_timestamp"]
reddit_df["published_timestamp"] = pd.to_datetime(published_seconds, unit="s")
reddit_df["published_timestamp"]

id
lz35t2m   2024-11-26 15:30:33
lz36ut5   2024-11-26 15:30:33
lz37me6   2024-11-26 15:30:33
lz3eynz   2024-11-26 15:30:33
lz3ej31   2024-11-26 15:30:33
                  ...        
l0e9hes   2024-04-19 16:57:06
l0c8wh1   2024-04-19 16:57:06
l0dlerc   2024-04-19 16:57:06
l0dl0gv   2024-04-19 16:57:06
l0cieo0   2024-04-19 16:57:06
Name: published_timestamp, Length: 13030, dtype: datetime64[ns]

In [20]:
# Per-subreddit post quota derived from each subreddit's share of the total member count
df_sub_reddits["n_posts"]

0    1491
1    1220
2     813
3     279
4     533
5     142
6    3118
7     433
8    1966
Name: n_posts, dtype: int32

### Preprocessing

In [3]:
import os
import pandas as pd

# 1. List the CSV exports in the data folder that are to be read and merged
folder_path = "reddit_data"

# Files this notebook itself writes into the same folder. They must not be
# treated as inputs, otherwise re-running the merge would count rows twice.
generated_files = {"reddit_data_TTL.csv", "reddit_data_Prepocessed.csv"}

# os.listdir() returns names in arbitrary order; sorting makes the merge order
# (and therefore which duplicate counts as "first") reproducible.
all_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith(".csv") and f not in generated_files
)
all_files

['reddit_data_2023.csv',
 'reddit_data_2023_S.csv',
 'reddit_data_2024.csv',
 'reddit_data_2024_1.csv',
 'reddit_data_2024_9.csv',
 'reddit_data_2024_S.csv',
 'reddit_data_2025_1.csv']

In [None]:
# NOTE(review): this cell is commented out, but a later cell reads
# "reddit_data/reddit_data_TTL.csv", so on a fresh run that file must already exist.
# # Read and merge all CSV files
# df_list = [pd.read_csv(os.path.join(folder_path, f)) for f in all_files]
# df = pd.concat(df_list, ignore_index=True)

# # 2. Save the merged data
# output_path = os.path.join(folder_path, "reddit_data_TTL.csv")
# df.to_csv(output_path, index=False)

In [48]:
# 3. Basic EDA
import os
import pandas as pd

df = pd.read_csv("reddit_data/reddit_data_TTL.csv")
print(f"数据量: {df.shape[0]} 行, {df.shape[1]} 列")
print("列名:", df.columns.tolist())

# 4. Filter out low-quality rows (comments with fewer than min_likes likes).
# The boolean selection already returns a new frame, so no deep copy of df is
# needed beforehand.
min_likes = 3
df_filtered = df[df["likes"] >= min_likes]
# distinct_type_count = df["subreddit"].nunique() #23
df_filtered.shape[0]

数据量: 164903 行, 11 列
列名: ['post_id', 'likes', 'author', 'body', 'order_returned', 'likes_post', 'title', 'body_post', 'subreddit', 'author_post', 'published_timestamp']


132724

In [None]:
# Find rows that repeat on the key columns below. keep=False flags every
# member of a duplicate group, not only the later occurrences.
# Original author's note, kept as written: "first 1705 false 2217"
# (presumably row counts seen with keep='first' vs keep=False - unverified).
duplicate_key = ['post_id', 'likes', 'published_timestamp', 'title', 'body']
is_repeated = df.duplicated(subset=duplicate_key, keep=False)
duplicate_rows = df.loc[is_repeated]

# Print the comment text of the repeated rows, if there are any
has_duplicates = not duplicate_rows.empty
if has_duplicates:
    print("发现重复项:")
    print(duplicate_rows['body'])

发现重复项:
6037                                              Congrats!
6039                                              Congrats!
8878                                                     No
8881                                                     No
8991                                              Congrats!
                                ...                        
159244                                      Congratulations
162234    Arizona here.  This sounds much more reasonabl...
162235    Arizona here.  This sounds much more reasonabl...
163885                                            [deleted]
163889                                            [deleted]
Name: body, Length: 2117, dtype: object


In [45]:
# Row count before filtering (printed) and after filtering (cell result)
print(len(df))
len(df_filtered)

164903


132724

In [50]:
# Remove duplicates, keeping the first row seen for each distinct comment text.
# NOTE(review): the key here is the body alone, which is much broader than the
# (post_id, likes, published_timestamp, title, body) key inspected earlier -
# identical short comments on different posts are collapsed into one row.
# Confirm that this is intended.
print(df_filtered.shape[0])
repeated_body = df_filtered.duplicated(subset=['body'], keep='first')
df_filtered = df_filtered[~repeated_body]

# Report how many rows are left after de-duplication
print(f"去重后剩余数据: {df_filtered.shape[0]} 行")

132724
去重后剩余数据: 121773 行


In [51]:
# Write the filtered, de-duplicated comments next to the raw exports
preprocessed_name = "reddit_data_Prepocessed.csv"
output_path = os.path.join(folder_path, preprocessed_name)
df_filtered.to_csv(path_or_buf=output_path, index=False)