In [1]:
import time
# Wall-clock start of the notebook run; the last cell takes a second reading
# and subtracts this value to report the elapsed seconds.
a = time.time()
import os
# Tell PySpark which Python interpreter to launch (a path relative to the
# process working directory). Set here, before the SparkSession is built below.
# NOTE(review): "./environment" presumably refers to a packed environment
# shipped to the YARN containers — confirm how it is distributed.
os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"

In [2]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from langdetect import detect
import pyspark as ps
import datetime
from pyspark.sql import functions as f
from pyspark.sql import types as t
from sparknlp.pretrained import PretrainedPipeline
import re
# from transformers import pipeline

In [3]:
# Helper functions
# Raw string, so the regex escape \S is not read as an (invalid) Python string
# escape. Alternatives are tried left to right at each position:
#   @\S+            an @mention
#   https?:\S+      an http/https URL
#   [^A-Za-z0-9@]+  a run of other non-alphanumeric characters; '@' is excluded
#                   so that whitespace/punctuation in front of a mention cannot
#                   swallow the '@' and leave the user name behind
#   @               a lone '@' that does not start a mention
TEXT_CLEANING_RE = r"@\S+|https?:\S+|[^A-Za-z0-9@]+|@"


def cleanText(text):
    """Lower-case ``text`` and blank out mentions, URLs and punctuation.

    Every match of ``TEXT_CLEANING_RE`` is replaced by a single space and the
    result is stripped of leading/trailing whitespace. Consecutive matches are
    not merged, so several spaces in a row can remain inside the result.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``None`` when ``text`` is ``None`` (so a missing
        value stays missing instead of becoming the literal text "none").
    """
    if text is None:
        return None
    return re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
# Expose cleanText as a Spark UDF producing a string column
# (StringType is also what f.udf uses when no return type is given).
cleanTextDF = f.udf(cleanText, returnType=t.StringType())

In [4]:
# Create the SparkSession for this notebook (or reuse one that already exists),
# targeting the YARN master with the memory/result-size/serializer-buffer
# settings listed below.
spark = (
    SparkSession.builder
    .appName("Redit Summarization App")
    .master("yarn")
    .config("spark.executor.memoryOverhead", "2048")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2022-05-01 15:29:37,917 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2022-05-01 15:29:40,894 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [5]:
# Show the SparkSession object: as the last expression of the cell, the
# notebook renders its repr.
spark

In [6]:
# Load the raw Reddit comment dump from HDFS as CSV, using the first line as
# the header. No schema is supplied, so the columns are read as strings.
df = (
    spark.read
    .option("header", "true")
    .format("csv")
    .load("hdfs://namenode:9000/dis_materials/data_reddit.csv")
)

                                                                                

In [7]:
# Keep one copy of each fully identical row; df itself stays untouched and the
# cleaned data continues as df2.
df2 = df.distinct()

In [8]:
# Inspect the column types of the raw frame (df, not df2). The reader was given
# no schema, and the output below shows every column as string.
df.dtypes

[('created_utc', 'string'),
 ('ups', 'string'),
 ('subreddit_id', 'string'),
 ('link_id', 'string'),
 ('name', 'string'),
 ('score_hidden', 'string'),
 ('author_flair_css_class', 'string'),
 ('author_flair_text', 'string'),
 ('subreddit', 'string'),
 ('id', 'string'),
 ('removal_reason', 'string'),
 ('gilded', 'string'),
 ('downs', 'string'),
 ('archived', 'string'),
 ('author', 'string'),
 ('score', 'string'),
 ('retrieved_on', 'string'),
 ('body', 'string'),
 ('distinguished', 'string'),
 ('edited', 'string'),
 ('controversiality', 'string'),
 ('parent_id', 'string')]

In [9]:
# Replace the epoch value stored as a string in created_utc with a date.
# NOTE(review): from_unixtime formats the timestamp in the Spark session time
# zone, so the resulting date is only a UTC date if the session runs in UTC —
# confirm.
df2 = df2.withColumn(
    'created_utc',
    f.from_unixtime(f.col('created_utc')).cast(t.DateType()),
)

In [10]:
# Discard every row that has a null in any of the listed columns.
df2 = df2.na.drop(
    how="any",
    subset=[
        "subreddit",
        "subreddit_id",
        "body",
        "created_utc",
        "ups",
        "parent_id",
        "link_id",
    ],
)

In [11]:
# Turn the string column ups into an integer column.
# NOTE(review): this runs after the null-drop on ups; values that cannot be
# parsed as integers presumably become null here and stay in the data — confirm.
df2 = df2.withColumn("ups", f.col("ups").cast(t.IntegerType()))

In [12]:
# Remove comments written by moderators and comments whose body is "[deleted]".
#
# Both conditions must hold for a row to be kept, hence `&`: the previous `|`
# kept moderator comments whenever their body was not "[deleted]".
# `distinguished` is compared with eqNullSafe because a plain `!=` against a
# null value yields null, and null combined with `&` would drop every comment
# that has no distinguished value at all.
df2 = df2.filter(
    ~df2.distinguished.eqNullSafe("moderator")
    & (df2.body != "[deleted]")
)

In [13]:
# Add a clean_body column holding the result of the cleanTextDF UDF applied to
# the body column.
df2 = df2.withColumn("clean_body", cleanTextDF(df2["body"]))

In [14]:
# Drop rows whose clean_body is null.
df2 = df2.na.drop(how="any", subset=["clean_body"])

In [15]:
# Remove the columns that are not carried into the output, including the raw
# body now that clean_body exists.
df2 = df2.drop(
    "name",
    "author_flair_css_class",
    "author_flair_text",
    "score_hidden",
    "id",
    "distinguished",
    "body",
    "removal_reason",
    "downs",
    "archived",
    "gilded",
    "retrieved_on",
    "edited",
    "controversiality",
    "author",
    "score",
)

In [16]:
# Keep only rows whose parent_id is at most 12 characters long.
# NOTE(review): the reason for the 12-character limit is not stated here —
# presumably it weeds out rows whose fields were shifted by CSV parsing; confirm.
df2 = df2.filter(f.length(df2["parent_id"]) <= 12)

In [17]:
# df2.show(truncate=False)

In [18]:
# Mark the cleaned frame for caching with the DISK_ONLY storage level.
df2 = df2.persist(storageLevel=ps.StorageLevel.DISK_ONLY)

In [None]:
# Write the cleaned frame to HDFS as CSV with a header line.
# NOTE(review): no save mode is set, so re-running this cell presumably fails
# once the target path exists — confirm whether that protection is intended.
(
    df2.write
    .option("header", "true")
    .csv("hdfs://namenode:9000/cleaned_data_01.csv")
)

In [None]:
# Second wall-clock reading; the difference to `a` (taken in the first cell) is
# the elapsed time in seconds, displayed as the cell's result.
b = time.time()
b-a