In [1]:
import requests
import pandas as pd
import boto3
from io import StringIO
import json
import s3fs
from io import BytesIO
from pyspark.sql import SparkSession
from time import time
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, udf
import re
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, concat, lit

In [2]:
# AWS configuration.
# Secrets should not be stored in the notebook itself: each value is taken from
# the environment when the variable is set. The redacted '**' placeholders are
# kept as the fallback so the names and their default values are unchanged.
s3_bucket = os.environ.get('S3_BUCKET', '**')
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '**')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '**')

In [3]:
# Key of the raw JSON object that the cells below read from the S3 bucket.
filename = "subreddits_train_data_2.json"

In [4]:
def read_json_from_s3(s3_bucket, filename, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY):
    '''
    Download a JSON object from S3 and load it into pandas.

    The object `filename` is fetched from bucket `s3_bucket` with the supplied
    key pair, parsed by `pd.read_json`, and the transposed frame is returned.
    '''
    client = boto3.client(
        's3',
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )
    response = client.get_object(Bucket=s3_bucket, Key=filename)
    payload = response['Body'].read()
    parsed = pd.read_json(BytesIO(payload))

    # Callers receive the transpose of what read_json produced.
    return parsed.T

In [5]:
def save_to_s3_as_parquet(df, bucket_name, object_name, access_key_id, secret_access_key):
    '''
    Serialize a pandas DataFrame to snappy-compressed parquet and upload it.

    The frame is written (without its index) into an in-memory buffer, and the
    buffer's bytes are stored as `object_name` in bucket `bucket_name`.
    '''
    s3 = boto3.resource(
        's3',
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
    )

    buffer = BytesIO()
    df.to_parquet(buffer, index=False, compression='snappy')

    target = s3.Object(bucket_name, object_name)
    target.put(Body=buffer.getvalue())

In [6]:
def process_and_save_data(parquet_filename):
    '''
    Convert the notebook's source JSON object into a parquet object in S3.

    Reads the module-level `filename` from `s3_bucket` and writes the resulting
    frame back to the same bucket under `parquet_filename`.
    '''
    credentials = (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    frame = read_json_from_s3(s3_bucket, filename, *credentials)
    save_to_s3_as_parquet(frame, s3_bucket, parquet_filename, *credentials)

In [7]:
# Spark configuration: request the hadoop-aws package and give the S3A
# filesystem the same key pair that the boto3 helpers use.
conf = SparkConf().setAll([
    ("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4"),
    ("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID),
    ("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY),
])

# Create (or reuse) the local session with that configuration.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config(conf=conf)
    .appName("read-s3-with-spark")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/home/ec2-user/.local/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ab61d924-22c8-4ed1-b80e-7196d1b101d6;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 390ms :: artifacts dl 18ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	----------------------------

In [8]:
# Load the raw JSON object from S3 into a pandas DataFrame and preview it.
df = read_json_from_s3(
    s3_bucket=s3_bucket,
    filename=filename,
    AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY,
)
df.head()

Unnamed: 0,text
0,Another home on the eastern plains of Colorado...
1,Colorado stopped using state Medicaid funds on...
2,Here’s one of many that are now abandoned-
3,More Blossoms & Bandos.
4,This house has a very interesting history.


In [9]:
# Cast every column to str before the frame is written to parquet.
# NOTE(review): astype(str) presumably also turns missing values into literal
# strings such as 'nan'/'None' — confirm that is acceptable downstream.
df = df.astype(str)

In [10]:
# Upload the stringified frame to the bucket as a snappy-compressed parquet object.
object_name = 'subreddits_train_data_2.snappy.parquet'
save_to_s3_as_parquet(
    df=df,
    bucket_name=s3_bucket,
    object_name=object_name,
    access_key_id=AWS_ACCESS_KEY_ID,
    secret_access_key=AWS_SECRET_ACCESS_KEY,
)

In [11]:
# Re-read the parquet object uploaded in the previous cell, this time through Spark.
# The key comes from `object_name` rather than a repeated literal, so the
# write and the read cannot drift apart.
# NOTE: `df` is rebound here from the pandas frame to a Spark DataFrame.
df = spark.read.parquet(f"s3a://{s3_bucket}/{object_name}")

25/05/09 17:20:11 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [12]:
# Fetch one row to confirm the Spark read returned data.
df.take(1)

                                                                                

[Row(text='Another home on the eastern plains of Colorado left to time- ')]

In [13]:
def clean_text(df, col):
    '''
    Clean the string column `col` of a Spark DataFrame and return the new frame.

    Substitutions are applied in this order:
      1. drop URLs (any run of non-whitespace characters starting with "http");
      2. decode numeric HTML punctuation entities (e.g. "&#33;" -> "!");
      3. strip asterisks and straight/curly double quotes, turn newlines into
         a space, and replace "/" with "or";
      4. drop "(...)" and "[...]" spans;
      5. collapse runs of spaces into a single space.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame containing the column to clean.
    col : str
        Name of the column to clean; it is overwritten in the returned frame.

    Returns
    -------
    pyspark.sql.DataFrame
        `df` with `col` replaced by its cleaned version.
    '''
    # Numeric HTML entities and the punctuation they stand for.
    html_entities = [
        ("&#33;", "!"),
        ("&#34;", "\""),
        ("&#35;", "#"),
        ("&#37;", "%"),
        ("&#38;", "&"),
        ("&#39;", "'"),
        ("&#40;", "("),
        ("&#41;", ")"),
        ("&#42;", "*"),
        ("&#44;", ","),
        ("&#46;", "."),
        ("&#47;", "/"),
        ("&#58;", ":"),
        ("&#59;", ";"),
        ("&#63;", "?"),
        ("&#64;", "@"),
    ]

    # Raw strings keep the regex escapes out of Python's own escape handling.
    # The earlier "'" -> "'" substitution was a no-op and is not repeated here.
    special_characters = [
        (r"\*", ""),
        (r"\n+", " "),
        ("/", "or"),
        ("\"", ""),
        ("”", ""),
        ("“", ""),
        # Match one bracketed span at a time. A greedy ".+" would delete
        # everything from the first opener to the last closer, including the
        # ordinary text between two separate parentheticals. For nested spans
        # only the innermost one is removed.
        (r"\([^()]*\)", ""),
        (r"\[[^\[\]]*\]", ""),
    ]

    substitutions = (
        [(r"http\S+", "")]
        + html_entities
        + special_characters
        + [(" +", " ")]
    )

    # Compose all substitutions into one column expression and project once,
    # rather than stacking one withColumn projection per substitution.
    cleaned = col
    for pattern, replacement in substitutions:
        cleaned = regexp_replace(cleaned, pattern, replacement)

    return df.withColumn(col, cleaned)

In [14]:
# Clean the 'text' column, then discard rows whose cleaned text begins with ".".
df_cleaned = clean_text(df, 'text').filter(~col("text").startswith("."))

In [15]:
# Spot-check: fetch two cleaned rows and display the first.
df_cleaned.take(2)[0]

                                                                                

Row(text='Another home on the eastern plains of Colorado left to time- ')

In [16]:
def text_summary(df):
    '''
    Build the single-column frame that is handed to the LLM.

    Each row's `text` value is prefixed with the literal "Text: " and exposed
    as `PostText`; no other column is kept.
    '''
    post_text = concat(lit("Text: "), col("text")).alias("PostText")
    return df.select(post_text)

In [17]:
# Build the single-column PostText frame from the cleaned data.
df_llm = text_summary(df_cleaned)

In [18]:
# Spot-check: fetch two PostText rows and display the first.
df_llm.take(2)[0]

Row(PostText='Text: Another home on the eastern plains of Colorado left to time- ')

In [19]:
# Collect the PostText frame to pandas and upload it as a single parquet object.
save_to_s3_as_parquet(
    df=df_llm.toPandas(),
    bucket_name=s3_bucket,
    object_name='subreddits_train_data_2.parquet',
    access_key_id=AWS_ACCESS_KEY_ID,
    secret_access_key=AWS_SECRET_ACCESS_KEY,
)

                                                                                

In [20]:
# Number of rows in the PostText frame.
df_llm.count()

                                                                                

85762