# About
In this notebook we'll be building an Alpaca-based dataset containing every comment scraped from r/Hasan_Piker, specifically prepared for finetuning Llama3.1.

Theoretically, this should be able to turn into a script, where attributes are flags

WIP

## 1.0 - Preparation

### 1.1 - Import Packages

In [1]:
# import libs
import json
import pandas as pd
from loguru import logger

### 1.2 - Set Configuration Variables
Change these to build different datasets

In [2]:
SUBREDDIT = "Destiny"
COUNT = 25000
CLEANED_DATA = f'../../json/local/cleaned_data_DEVELOPMENT_{COUNT}.json'

class AlpacaPrompt:
    def __init__(self, instruction: str, input: str, response: str):
        self.instruction = instruction
        self.input = input
        self.response = response

    def to_dict(self):
        return {
            "instruction": self.instruction,
            "input": self.input,
            "response": self.response
        }

## 2.0 - Filter & Transform

### 2.1 - Load Cleaned Data

In [3]:
# load data into pandas dataframe
with open(CLEANED_DATA, 'r') as f:
  data = json.load(f)

df = pd.DataFrame(data)

# logging
prefilter_len = len(df)
logger.info(f"Data loaded: {df.shape}")
df.head(1)

[32m2024-11-25 11:46:34.055[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mData loaded: (23859, 25)[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,score,author_premium,edited,total_awards_received,suggested_sort,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,121,False,False,0,confidence,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


### 2.2 - Drop Unwanted Subreddits

In [4]:
# only keep the rows where the subreddit is in SUBREDDITS
df = df[df['subreddit'] == SUBREDDIT]

# logging
posts_removed = prefilter_len - len(df)
logger.info(f"Data filtered: {df.shape}")
logger.info(f"{posts_removed} post{'s' if posts_removed > 1 else ''} removed")
df.head(1)

[32m2024-11-25 11:46:34.086[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mData filtered: (2647, 25)[0m
[32m2024-11-25 11:46:34.087[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1m21212 posts removed[0m


Unnamed: 0,id,subreddit,selftext,title,downs,name,upvote_ratio,ups,removed_by_category,link_flair_text,score,author_premium,edited,total_awards_received,suggested_sort,no_follow,created_utc,author_flair_text,author,num_comments,subreddit_subscribers,send_replies,is_video,deleted,comments
0,1dx1b0z,Destiny,,New Vegan,0,t3_1dx1b0z,0.95,121,,Shitpost,121,False,False,0,confidence,False,1720304607,,TuningsGaming,2,248289,True,False,False,"[{'id': 'lbyv8mn', 'total_awards_received': 0,..."


## 3.0 - Bucket Data by Instruction Type

### 3.1 - Define instructions

In [5]:
instruction_types = [
  { 
    "instruction": f"Write a response to this post from r/{SUBREDDIT}",
    "response": []
  },
  { 
    "instruction": f"Write a response to this comment from r/{SUBREDDIT}",
    "responses": []
  }
]

# TODO: MAKE INSTRUCTIONS MORE SPECIFIC LIKE:
# instructions = [
#   { 
#     "instruction": f"Write a controverial response to this comment from r/{SUBREDDIT}",
#     "response": []
#   },
#   { 
#     "instruction": f"Write a likely upvoted response to this comment from r/{SUBREDDIT}",
#     "responses": []
#   },
#   { 
#     "instruction": f"Write a likely downvoted response to this comment from r/{SUBREDDIT}",
#     "responses": []
#   },
#   { 
#     "instruction": f"Write a response that is likely to be removed by the moderators to this comment from r/{SUBREDDIT}",
#     "responses": []
#   }
# ]

## 3.2 - ROUGH DRAFT

In [6]:
def is_comment_valid(comment) -> tuple[bool, str]:
  if (comment.startswith('!') and comment != "!") or 'http:' in comment or 'https:' in comment:
    return False, f"Comment is a bot call or contains a link: {comment}"
  match comment:
    case '[deleted]':
      return False, "Comment was deleted"
    case '[removed]':
      return False, "Comment was removed"
    case _:
      return True, ""
    
def is_post_valid(post_row) -> tuple[bool, str]:
  if post_row['selftext'].startswith('#'):
    return False, f"Post is a bot: {post_row['selftext']}"
  if post_row['num_comments'] == 0:
    return False, "Post has no comments"
  return True, ""
  

In [7]:
prompts: AlpacaPrompt = []

for i, row in df.iterrows():
  # check if post is valid
  valid, reason = is_post_valid(row)
  if not valid:
    logger.error(f"Post {row['name']} is not valid: {reason}, skipping")
    continue

  # post is valid but if it has no selftext, use title instead
  if row['selftext'] == '':
    logger.warning(f"Post {row['name']} has no selftext, using title instead")
    if row['title'] == '':
      logger.error(f"Post {row['name']} has no title, skipping")
      continue
    row['selftext'] = row['title']

  post_row = row
  # go down the comment chain in the post
  for comments in post_row['comments']:
    valid, reason = is_comment_valid(comments['body'])
    if not valid:
      logger.error(f"Comment {comments['name']} is not valid: {reason}, skipping")
      continue
    if comments['body'] == '':
      logger.warning(f"Comment {comments['name']} has no body, skipping")
      continue
    # input is the post body, response is the comment body
    prompts.append(AlpacaPrompt(
      instruction=instruction_types[0]['instruction'],
      input=post_row['selftext'],
      response=comments['body'],
    ))

    if comments['replies'] == []:
      # logger.info(f"Comment {comments['name']} has no replies, end of chain")
      continue

    # go down the reply chain in the comment
    for reply in comments['replies']:
      if reply['body'] == '':
        logger.warning(f"Reply {reply['name']} has no body, skipping")
        continue
      # input is the comment body, response is the reply body
      prompts.append(AlpacaPrompt(
        instruction=instruction_types[1]['instruction'],
        input=comments['body'],
        response=reply['body'],
      ))

[32m2024-11-25 11:46:34.128[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbyv8mn has no replies, end of chain[0m
[32m2024-11-25 11:46:34.129[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbyw648 has no replies, end of chain[0m
[32m2024-11-25 11:46:34.129[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbycefd has no replies, end of chain[0m
[32m2024-11-25 11:46:34.130[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbypxa3 has no replies, end of chain[0m
[32m2024-11-25 11:46:34.130[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbz6gxk has no replies, end of chain[0m
[32m2024-11-25 11:46:34.131[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m36[0m - [1mComment t1_lbyjcc6 has no replies, end of chain[0m
[32m2024-11-25 11:46:34.132[0m | [1mI

In [8]:
# write prompts to json file for alpaca
with open(f'datasets/{SUBREDDIT}_prompts_{COUNT}.jsonl'.lower(), 'w') as f:
  json.dump([prompt.to_dict() for prompt in prompts], f)

logger.info(f"Prompts written to file: {len(prompts)}")
from datetime import datetime
logger.info(f"Finished at: {datetime.now()}")

[32m2024-11-25 11:46:52.382[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mPrompts written to file: 54403[0m
[32m2024-11-25 11:46:52.383[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mFinished at: 2024-11-25 11:46:52.383850[0m


## 4.0 - Huggingface
Push the dataset to HuggingFace

In [None]:
from datasets import load_dataset
dataset = load_dataset('json', data_files=f'datasets/local/{SUBREDDIT}_prompts_{COUNT}.jsonl'.lower())

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

def count_to_str(count: int) -> str:
  if count < 1000:
    return str(count)
  if count < 1000000:
    return f"{int(count/1000)}k"
  return f"{int(count/1000000)}m"

if not os.getenv('HF_TOKEN'):
  logger.error("No Hugging Face token found, not pushing to hub")
else:
  dataset.push_to_hub(f"brianmatzelle/alpaca-{SUBREDDIT}-{count_to_str(COUNT)}".lower(), token=os.getenv('HF_TOKEN'))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/55 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/brianmatzelle/alpaca-destiny-25k/commit/d7b09f917b911c590f031c4c44b4c292d1f93027', commit_message='Upload dataset', commit_description='', oid='d7b09f917b911c590f031c4c44b4c292d1f93027', pr_url=None, pr_revision=None, pr_num=None)