In [22]:
import requests
import pandas as pd
import boto3
from io import StringIO
import json
import s3fs
from io import BytesIO
from pyspark.sql import SparkSession
from time import time
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, udf
import re
from pyspark.sql.types import StringType

In [2]:
# Name of the S3 bucket that every read and write in this notebook targets.
s3_bucket = 'reddit-tifu'
# Object key of the JSON source file that read_json_from_s3 loads
# (it is parsed with lines=True, i.e. one JSON record per line).
filename = "tifu_all_tokenized_and_filtered.json"

In [3]:
# Read the AWS credentials from the environment so real keys never have to be
# written into (and committed with) the notebook. The '**' placeholder is kept
# only as a fallback so both names are always defined as strings, which the
# boto3 helpers and the Spark configuration below expect.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '**')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '**')

In [4]:
def read_json_from_s3(s3_bucket, filename, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    """Download a JSON-lines object from S3 and parse it into a pandas DataFrame.

    Args:
        s3_bucket: Name of the bucket holding the object.
        filename: Key of the object inside the bucket.
        AWS_ACCESS_KEY_ID: Access key id passed to the boto3 client.
        AWS_SECRET_ACCESS_KEY: Secret access key passed to the boto3 client.

    Returns:
        pandas.DataFrame built from the object's content, one record per line.
    """
    client = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = client.get_object(Bucket=s3_bucket, Key=filename)
    # The whole object body is read into memory before parsing.
    payload = response['Body'].read()
    return pd.read_json(BytesIO(payload), lines=True)

In [113]:
def save_to_s3_as_parquet(df, bucket_name, object_name, access_key_id, secret_access_key):
    """Serialize a pandas DataFrame to snappy-compressed Parquet and upload it to S3.

    Args:
        df: pandas DataFrame to write (its index is not stored).
        bucket_name: Destination bucket.
        object_name: Destination object key.
        access_key_id: Access key id passed to the boto3 resource.
        secret_access_key: Secret access key passed to the boto3 resource.
    """
    # Build the Parquet file fully in memory, then upload the bytes in one put.
    buffer = BytesIO()
    df.to_parquet(buffer, index=False, compression='snappy')
    payload = buffer.getvalue()

    s3 = boto3.resource(
        's3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
    )
    target = s3.Object(bucket_name, object_name)
    target.put(Body=payload)

In [5]:
def process_and_save_data():
    """Convert the JSON source object in S3 into a snappy Parquet object.

    Reads ``filename`` from ``s3_bucket`` with the module-level credentials and
    writes the result back to the same bucket under the same base name with a
    ``.snappy.parquet`` suffix.
    """
    posts = read_json_from_s3(s3_bucket, filename, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    # Derive the output key from the source key instead of hard-coding a
    # different ("subset") name: the Spark read further down loads
    # 'tifu_all_tokenized_and_filtered.snappy.parquet', so the object written
    # here has to carry exactly that name to be the one that gets read.
    parquet_filename = os.path.splitext(filename)[0] + '.snappy.parquet'
    save_to_s3_as_parquet(posts, s3_bucket, parquet_filename, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

In [4]:
# Spark configuration: fetch the hadoop-aws package at start-up and hand the
# AWS credentials to the s3a filesystem so s3a:// paths can be accessed.
conf = (
    SparkConf()
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
)

# Local session using all available cores; getOrCreate() reuses a session
# that is already running in this kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config(conf=conf)
    .appName("read-s3-with-spark")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/ec2-user/.local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0743534b-ae37-4649-ab13-848a72def655;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 369ms :: artifacts dl 9ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-----------------------------

In [6]:
# Load the Parquet dataset from the bucket into a Spark DataFrame via the s3a connector.
df = spark.read.parquet(f"s3a://{s3_bucket}/tifu_all_tokenized_and_filtered.snappy.parquet")

25/04/12 20:24:14 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [7]:
# List the column names of the loaded DataFrame.
df.columns

['title_tokenized',
 'permalink',
 'title',
 'url',
 'num_comments',
 'tldr',
 'created_utc',
 'trimmed_title_tokenized',
 'id',
 'selftext_html',
 'score',
 'upvote_ratio',
 'selftext',
 'trimmed_title',
 'selftext_without_tldr_tokenized',
 'ups',
 'selftext_without_tldr',
 'tldr_tokenized']

In [8]:
# Preview the first row of the 'selftext' column without truncating the value.
df.select('selftext').show(1, truncate=False)

                                                                                

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|selftext                                                                                                                                                                                                                                                                                     |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|I was on Skype on my tablet as I went to the toilet IMing a friend. I don't multitask very well, so I forgot one of the most important 

In [9]:
# Preview the first row of the 'selftext_html' column (the input to clean_text) untruncated.
df.select('selftext_html').show(1, truncate=False)



+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|selftext_html                                                                                                                                                                                                                                                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [63]:
def clean_text(df, column='selftext_html'):
    """Strip the wrapping HTML and decode numeric punctuation entities in a text column.

    Args:
        df: Spark DataFrame that contains ``column``.
        column: Name of the string column to clean. Defaults to
            ``'selftext_html'``, the column this function always operated on.

    Returns:
        A new DataFrame in which ``column`` is replaced by the cleaned text;
        all other columns are unchanged.
    """
    # (entity, character) pairs, applied in this order. '&#38;' (the ampersand)
    # is deliberately decoded LAST: decoding it earlier would turn a
    # double-encoded sequence such as '&#38;#39;' into '&#39;', which a later
    # pass would then wrongly unescape a second time.
    entity_replacements = (
        ("&#33;", "!"),
        ("&#34;", "\""),
        ("&#35;", "#"),
        ("&#37;", "%"),
        ("&#39;", "'"),
        ("&#40;", "("),
        ("&#41;", ")"),
        ("&#42;", "*"),
        ("&#44;", ","),
        ("&#46;", "."),
        ("&#47;", "/"),
        ("&#58;", ":"),
        ("&#59;", ";"),
        ("&#63;", "?"),
        ("&#64;", "@"),
        ("&#38;", "&"),
    )

    # Remove everything up to and including an opening <p>. Without the DOTALL
    # flag '.' does not cross line terminators, so this is applied per line.
    cleaned = regexp_replace(column, ".*<p>", "")
    # Remove the first </p> and everything after it ('(?s)' lets '.*' span
    # newlines from that point on).
    # NOTE(review): for multi-paragraph HTML this appears to keep only the
    # first paragraph, and inline tags inside it are left in place - confirm
    # that this is the intended clean-up.
    cleaned = regexp_replace(cleaned, "</p>(?s).*", "")

    for entity, character in entity_replacements:
        cleaned = regexp_replace(cleaned, entity, character)

    # Build one nested expression and apply it with a single withColumn call;
    # chaining many withColumn calls adds one projection per call to the plan.
    return df.withColumn(column, cleaned)

In [71]:
# Run the HTML clean-up; the result is bound to a new name so df keeps the raw column.
df_cleaned = clean_text(df)

In [72]:
# Check the cleaned 'selftext_html' value of the first row, untruncated.
df_cleaned.select('selftext_html').show(1, truncate=False)



+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|selftext_html                                                                                                                                                                                                                                                                                |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|I was on Skype on my tablet as I went to the toilet IMing a friend. I don't multitask very well, so I forgot one of the most important 

                                                                                

In [69]:
# Preview the first row of 'trimmed_title', the column text_summary appends after "Title: ".
df.select('trimmed_title').show(1, truncate=False)

+-----------------------------------------------------+
|trimmed_title                                        |
+-----------------------------------------------------+
|forgetting to pull my underwear down before i pooped.|
+-----------------------------------------------------+
only showing top 1 row



In [78]:
from pyspark.sql.functions import col, concat, lit

In [96]:
def text_summary(df):
    """Build a one-column DataFrame of prompt strings from post text and title.

    Every ``Prompt`` value is the concatenation of the literal prefix
    "Text: ", the row's ``selftext_html``, a separator that ends the text line
    and starts the "Title: " line, the row's ``trimmed_title`` and the
    terminating marker " ####".

    Args:
        df: Spark DataFrame with ``selftext_html`` and ``trimmed_title`` columns.

    Returns:
        Spark DataFrame containing only the ``Prompt`` column.
    """
    prompt_parts = [
        lit("Text: "),
        col("selftext_html"),
        lit(" \nTitle: "),
        col("trimmed_title"),
        lit(" ####"),
    ]
    prompt_expr = concat(*prompt_parts)
    return df.withColumn("Prompt", prompt_expr).select("Prompt")

In [97]:
# Build the prompt-only DataFrame from the cleaned posts.
df_llm = text_summary(df_cleaned)

In [107]:
# Inspect the first generated prompt in full.
df_llm.show(1, truncate=False)



+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Prompt                                                                                                                                                                                                                                                                                                                                                                 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



In [119]:
# Convert the prompt DataFrame to pandas with toPandas() and upload it to the
# bucket as a single Parquet object named 'reddit_tifu_llm.parquet'.
# NOTE(review): the recorded output of this cell shows a
# java.net.SocketTimeoutException ("Accept timed out") in the "serve-DataFrame"
# thread - verify that the object was actually written; writing directly from
# Spark may be a more reliable alternative for a dataset of this size.
save_to_s3_as_parquet(df_llm.toPandas(), s3_bucket, 'reddit_tifu_llm.parquet', AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:65)
                                                                                