In [18]:
import os
import datetime as dt
import json
import textwrap

import polars as pl
from praw import Reddit
from pydantic import BaseModel, TypeAdapter

# 1. Extract data from Reddit

We use PRAW https://pypi.org/project/praw/, "Python Reddit API Wrapper".

Login to Reddit and create an app at https://www.reddit.com/prefs/apps

![Reddit app](img/reddit-app.png)

(Image from https://www.jcchouinard.com/get-reddit-api-credentials-with-praw/)

In [19]:
%load_ext dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [20]:
%dotenv

In [21]:
reddit_client_id = os.environ["REDDIT_CLIENT_ID"]
reddit_client_secret = os.environ["REDDIT_SECRET"]
reddit_username = os.environ["REDDIT_USERNAME"]  # Just for assembling the user agent

minio_endpoint_url = os.environ["MINIO_ENDPOINT_URL"]
minio_access_key = os.environ["MINIO_KEY"]
minio_secret_id = os.environ["MINIO_SECRET"]

In [22]:
# Read-only Reddit connection https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#read-only-reddit-instances
reddit = Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_client_secret,
    user_agent=f"social summarizer by u/{reddit_username}",
)
reddit.user.me(), reddit.read_only

(None, True)

In [23]:
selected_subreddit_names = [
    "r/jokes",
    "r/askscience",
    "r/explainlikeimfive",
    "r/LifeProTips",
    "r/GetMotivated",
    "r/AmItheAsshole"
]
target_subreddit_name = "r/AmItheAsshole"

In [24]:
from IPython.display import Markdown

In [25]:
subreddit = reddit.subreddit(target_subreddit_name.removeprefix("r/"))
print(subreddit.title)
Markdown(subreddit.description[:1_000] + "...")

Am I the Asshole? 


#Welcome to r/AmITheAsshole!

A catharsis for the frustrated moral philosopher in all of us, and a place to finally find out if you were wrong in a real-world argument that's been bothering you. Tell us about any non-violent conflict you have experienced; give us both sides of the story, and find out if you're right, or you're the asshole.

This is the sub to lay out your  actions and conflicts and get impartial judgment rendered against you.  Were you the asshole in that situation or not? Post should reflect real situations, and abide by the rules below.

After 18 hours, your post will be given a flair representing the final judgment on your matter.  This flair is determined by the subscribers who have both rendered judgment and voted on which judgment is best.  ***The power of the crowd will judge you***.  If your top level comment has the highest number of upvotes in a thread, you will get a flair point. More details are listed in [our FAQ](https://www.reddit.com/r/AmItheAssho...

In [26]:
# Possible parameters https://old.reddit.com/dev/api#GET_new
submissions = []
for submission in subreddit.new(limit=2):
    submissions.append({
        "title": submission.title,
        "author_name": submission.author.name,
        "creation_datetime": dt.datetime.utcfromtimestamp(submission.created_utc).isoformat(),
        "subreddit_name": submission.subreddit_name_prefixed,
        "num_comments": submission.num_comments,
        "sfw": not submission.over_18,
        "score": submission.score,
        "upvote_ratio": submission.upvote_ratio,
        "is_self": submission.is_self,
        "permalink": submission.permalink,
        "selftext": submission.selftext,
        "flair_text": submission.link_flair_text,
    })

In [27]:
print("\n".join(textwrap.wrap((json.dumps(submissions)[:1_000] + "..."), width=70)))

[{"title": "AITA for saying my mom isn't my mom anymore ",
"author_name": "Every-Disaster1853", "creation_datetime":
"2024-03-24T13:07:01", "subreddit_name": "r/AmItheAsshole",
"num_comments": 2, "sfw": true, "score": 0, "upvote_ratio": 0.5,
"is_self": true, "permalink": "/r/AmItheAsshole/comments/1bmkbpj/aita_
for_saying_my_mom_isnt_my_mom_anymore/", "selftext": "I (15F) my mom
(52F) never had a good relationship with eachother. She was also a
drunk \n\nWe were a ok family before we moved to another country. She
was beginning to drink more and more and when we were fighting she
began screaming my oof attamt should have worked.at that moment my
rage was so big i screamed at her that she needs to Figure out what
she wants but that im not her daughter anymore. At that moment my aunt
came with the police and she got sent to a psychiatric hospital \n\nSo
am i the asshole \n\nPs im safe now my sister is also safe my mom is
out of the facility .\n", "flair_text": null}, {"title": "AITA:
Bath

Notice the `creation_datetime` is `str`, for easier serialization!

In [28]:
submissions[-1]["creation_datetime"]

'2024-03-24T13:05:45'

# 2. Model the data

We use Pydantic https://pypi.org/project/pydantic/, a popular Python library for data validation

In [29]:
class RedditSubmission(BaseModel):
    title: str
    author_name: str
    creation_datetime: dt.datetime
    subreddit_name: str
    num_comments: int
    sfw: bool
    score: int
    upvote_ratio: float
    is_self: bool
    permalink: str
    selftext: str | None
    flair_text: str | None

In [30]:
adapter = TypeAdapter(list[RedditSubmission])

In [31]:
objects = adapter.validate_python(submissions)
objects

[RedditSubmission(title="AITA for saying my mom isn't my mom anymore ", author_name='Every-Disaster1853', creation_datetime=datetime.datetime(2024, 3, 24, 13, 7, 1), subreddit_name='r/AmItheAsshole', num_comments=2, sfw=True, score=0, upvote_ratio=0.5, is_self=True, permalink='/r/AmItheAsshole/comments/1bmkbpj/aita_for_saying_my_mom_isnt_my_mom_anymore/', selftext='I (15F) my mom (52F) never had a good relationship with eachother. She was also a drunk \n\nWe were a ok family before we moved to another country. She was beginning to drink more and more and when we were fighting she began screaming my oof attamt should have worked.at that moment my rage was so big i screamed at her that she needs to Figure out what she wants but that im not her daughter anymore. At that moment my aunt came with the police and she got sent to a psychiatric hospital \n\nSo am i the asshole \n\nPs im safe now my sister is also safe my mom is out of the facility .\n', flair_text=None),

Pydantic automatically converted the str datetime to an actual `datetime.datatime` object, as specified in the model:

In [32]:
objects[-1].creation_datetime

datetime.datetime(2024, 3, 24, 13, 5, 45)

# 3. Store data

We use Polars https://pypi.org/project/polars/, a nascent dataframe library with an expressive API and blazing fast performance

In [33]:
df = pl.from_dicts(objects)
df.head()

title,author_name,creation_datetime,subreddit_name,num_comments,sfw,score,upvote_ratio,is_self,permalink,selftext,flair_text
str,str,datetime[μs],str,i64,bool,i64,f64,bool,str,str,str
"""AITA for sayin…","""Every-Disaster…",2024-03-24 13:07:01,"""r/AmItheAsshol…",2,True,0,0.5,True,"""/r/AmItheAssho…","""I (15F) my mom…",
"""AITA: Bathroom…","""Used_Warning_2…",2024-03-24 13:05:45,"""r/AmItheAsshol…",2,True,1,1.0,True,"""/r/AmItheAssho…","""My little brot…",


In [34]:
df.write_delta("submissions", mode="overwrite")

Writing to a local Delta table is okay, but doing so to an object-storage is even better! Enter MinIO:

In [35]:
minio_storage_options = {
    "AWS_ENDPOINT_URL": minio_endpoint_url,
    "AWS_ACCESS_KEY_ID": minio_access_key,
    "AWS_SECRET_ACCESS_KEY": minio_secret_id,
    "AWS_REGION": "<localhost>",  # Boilerplate
    "AWS_ALLOW_HTTP": "true",  # Boilerplate
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",  # Boilerplate
}

In [36]:
df.write_delta("s3://reddit-submissions/submissions-raw", mode="overwrite", storage_options=minio_storage_options)

[90m[[0m2024-03-24T13:10:07Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind: "HTTP connect", duration: 1s }), connection: Unknown } }) }))
[90m[[0m2024-03-24T13:10:08Z [33mWARN [0m aws_config::imds::region[90m][0m failed to load region from IMDS err=failed to load IMDS session token: dispatch failure: timeout: error trying to connect: HTTP connect timeout occurred after 1s: HTTP connect timeout occurred after 1s: timed out (FailedToLoadToken(FailedToLoadToken { source: DispatchFailure(DispatchFailure { source: ConnectorError { kind: Timeout, source: hyper::Error(Connect, HttpTimeoutError { kind

And the icing in the cake is doing an `UPSERT`-like operation:

In [37]:
(
    df.write_delta(
        "s3://reddit-submissions/submissions-raw",
        mode="merge",
        storage_options=minio_storage_options,
        delta_merge_options={
            "predicate": "s.permalink = t.permalink",
            "source_alias": "s",
            "target_alias": "t",
        },
    )
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute()
)

{'num_source_rows': 2,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 2,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 40,
 'scan_time_ms': 0,
 'rewrite_time_ms': 7}

# 4. Load data and apply AI

We load the data again from the same MinIO object:

In [38]:
df = pl.read_delta("s3://reddit-submissions/submissions-raw", storage_options=minio_storage_options)
df.head()

title,author_name,creation_datetime,subreddit_name,num_comments,sfw,score,upvote_ratio,is_self,permalink,selftext,flair_text
str,str,datetime[μs],str,i64,bool,i64,f64,bool,str,str,str
"""AITA: Bathroom…","""Used_Warning_2…",2024-03-24 13:05:45,"""r/AmItheAsshol…",2,True,1,1.0,True,"""/r/AmItheAssho…","""My little brot…",
"""AITA for sayin…","""Every-Disaster…",2024-03-24 13:07:01,"""r/AmItheAsshol…",2,True,0,0.5,True,"""/r/AmItheAssho…","""I (15F) my mom…",


In [39]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [40]:
summarizer = pipeline("summarization")  # model="sshleifer/distilbart-cnn-12-6"

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [41]:
s0 = df.item(0, "selftext")

In [42]:
summary_s0 = summarizer(s0)[0]["summary_text"]

In [43]:
print("\n".join(textwrap.wrap(summary_s0, width=70)).strip())

My little brother got mad when I opened the bathroom door to see if
he was inside and when he saw he was using the sink I closed the door
. He then got mad at me and told me “why would you open the door, if
you saw the light was on!”


In [44]:
mask_filler = pipeline("fill-mask", model="Twitter/twhin-bert-base")

In [45]:
mask_filler(summary_s0 + " #<mask>")

[{'score': 0.08455858379602432,
  'token': 12409,
  'token_str': 'dad',
  'sequence': 'My little brother got mad when I opened the bathroom door to see if he was inside and when he saw he was using the sink I closed the door. He then got mad at me and told me “why would you open the door, if you saw the light was on!” #dad'},
 {'score': 0.03174099326133728,
  'token': 73866,
  'token_str': 'help',
  'sequence': 'My little brother got mad when I opened the bathroom door to see if he was inside and when he saw he was using the sink I closed the door. He then got mad at me and told me “why would you open the door, if you saw the light was on!” #help'},
 {'score': 0.027666352689266205,
  'token': 17457,
  'token_str': 'sad',
  'sequence': 'My little brother got mad when I opened the bathroom door to see if he was inside and when he saw he was using the sink I closed the door. He then got mad at me and told me “why would you open the door, if you saw the light was on!” #sad'},
 {'score': 0.

In [46]:
def get_hashtags(text, mask_filler_model) -> list[str]:
    results = mask_filler_model(text.strip() + " #<mask>")

    hashtags = set()
    for ht_dict in results:
        hashtags.add(ht_dict["token_str"])

    if max(len(ht) for ht in hashtags) == 1:
        # Discard list of ugly hashtags
        return []

    return list(hashtags)

In [47]:
def get_summary(text, summarizer_model) -> str:
    summary = summarizer_model(text)[0]["summary_text"].strip()
    return summary

In [48]:
get_hashtags(get_summary(df.item(0, "selftext"), summarizer_model=summarizer), mask_filler)

['sad', 'mama', 'help', 'life', 'dad']

In [49]:
concise_df = df.with_columns(
    pl.col("selftext").map_elements(lambda text: get_summary(text, summarizer_model=summarizer)).alias("summary"),
).with_columns(
    pl.col("summary").map_elements(lambda text: get_hashtags(text, mask_filler_model=mask_filler)).alias("hashtags")
).with_columns(
    (pl.col("summary").str.len_chars() / pl.col("selftext").str.len_chars()).alias("summarization_pct")
).select(
    pl.col("author_name", "creation_datetime", "permalink", "title", "selftext", "summary", "hashtags", "summarization_pct")
)
concise_df.head()

author_name,creation_datetime,permalink,title,selftext,summary,hashtags,summarization_pct
str,datetime[μs],str,str,str,str,list[str],f64
"""Used_Warning_2…",2024-03-24 13:05:45,"""/r/AmItheAssho…","""AITA: Bathroom…","""My little brot…","""My little brot…","[""sad"", ""mama"", … ""dad""]",0.360124
"""Every-Disaster…",2024-03-24 13:07:01,"""/r/AmItheAssho…","""AITA for sayin…","""I (15F) my mom…","""I (15F) my mom…","[""gh"", ""RIP"", … ""dad""]",0.576592


# 5. Store the manipulated data

In [50]:
concise_df.write_delta("s3://reddit-submissions/submissions-concise", mode="overwrite", storage_options=minio_storage_options)

In [51]:
(
    concise_df.write_delta(
        "s3://reddit-submissions/submissions-concise",
        mode="merge",
        storage_options=minio_storage_options,
        delta_merge_options={
            "predicate": "s.permalink = t.permalink",
            "source_alias": "s",
            "target_alias": "t",
        },
    )
    .when_matched_update_all()
    .when_not_matched_insert_all()
    .execute()
)

{'num_source_rows': 2,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 2,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 39,
 'scan_time_ms': 0,
 'rewrite_time_ms': 7}

# Extra: Using LLMs through APIs

Ollama is an open source tool that allows you to access pre-trained open models with an API that is compatible with OpenAI.

<div class="alert alert-danger"><em>⚠️ Local LLMs require a properly configured GPU to perform inference in reasonable times. If you don't have one, try purchasing OpenAI tokens instead. Since this workshop is about not using ChatGPT and not all participants may have adequate hardware or drivers, this part is optional.</em></div>

In [52]:
import ollama

ollama.pull("llama2")

{'status': 'success'}

In [53]:
from openai import OpenAI

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)

response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Say this is a test"}],
    # model="gpt-3.5-turbo",
    model="llama2"
)
print(response.choices[0].message.content)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Of course! Here is the information you requested:

This is a test.
