In [1]:
### Importing required libraries
import json
import re

import findspark
findspark.init()  # kept ahead of the pyspark imports, matching the original ordering

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, lit


In [2]:
# Install the Spark NLP Python package, pinned to 4.2.1 (the same version as the
# spark-nlp_2.12 jar requested through spark.jars.packages when the session is built).
!/mnt/miniconda/bin/pip install spark-nlp==4.2.1 --force
# NOTE(review): `sparknlp` is a different, unpinned package name; the saved output of this
# cell only shows spark-nlp being installed -- confirm this second install is needed.
!/mnt/miniconda/bin/pip install sparknlp

Collecting spark-nlp==4.2.1
  Using cached spark_nlp-4.2.1-py2.py3-none-any.whl (643 kB)
Installing collected packages: spark-nlp
  Attempting uninstall: spark-nlp
    Found existing installation: spark-nlp 4.2.1
    Uninstalling spark-nlp-4.2.1:
      Successfully uninstalled spark-nlp-4.2.1
Successfully installed spark-nlp-4.2.1


In [3]:
# Create (or reuse) the Spark session for this notebook: runs on YARN, uses Kryo
# serialization, and pulls the Spark NLP 4.2.1 jar via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("SparkNLP")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.1")
    .master('yarn')
    .getOrCreate()
)

Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f142f47d-8e24-491d-98f3-c03b08b3b0af;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	foun

In [4]:
# Bare expression: echoes the session object as this cell's output.
spark

In [5]:
### Importing spark nlp libraries
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [6]:
### Loading the cleaned comments and submissions (parquet on S3)
comments_path = "s3a://ppol567-llj40-bucket-3/cleaned_comments"
submissions_path = "s3a://ppol567-llj40-bucket-3/cleaned_submissions"

df_com = spark.read.parquet(comments_path)
df_sub = spark.read.parquet(submissions_path)

                                                                                

In [7]:
### Row count followed by schema, first for comments and then for submissions
for frame in (df_com, df_sub):
    print(frame.count())
    frame.printSchema()

                                                                                

16171595
root
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: double (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- body_length: integer (nullable = true)
 |-- created_date: string (nullable = true)
 |-- date_clean: date (nullable = true)
 |-- submission_id: string (nullable = true)





146327
root
 |-- author: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- edited: double (nullable = true)
 |-- id: string (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- title: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title_length: integer (nullable = true)
 |-- created_date: string (nullable = true)
 |-- date_clean: date (nullable = true)
 |-- Top_100: integer (nullable = true)
 |-- Live_Thread: boolean (nullable = true)
 |-- War_Dummy: boolean (nullable = true)



                                                                                

In [8]:
### Small two-column frame (id + title) used to try out the preprocessing pipeline
title_cols = ["id", "title"]
df_title = df_sub.select(*title_cols)
df_title.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)



In [7]:
### Text-cleaning stages (assembled into a Pipeline in a later cell)

#### Document assembler: reads the raw text from the `body` column
documentAssembler = (
    DocumentAssembler()
    .setInputCol("body")
    .setOutputCol("document")
)

### Tokenizer: document -> token
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

#### Pretrained English stop-word remover: token -> sw_rem
stop_words = (
    StopWordsCleaner.pretrained("stopwords_en", "en")
    .setInputCols(["token"])
    .setOutputCol("sw_rem")
)

### Normalizer: lower-cases and applies the cleanup pattern below, which matches
### every character that is not an ASCII letter, a digit or a space
cleanUpPatterns = ["[^A-Za-z0-9 ]"]

normalizer = (
    Normalizer()
    .setInputCols(["sw_rem"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns(cleanUpPatterns)
)

#### Default pretrained lemmatizer: normalized -> clean
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["normalized"])
    .setOutputCol("clean")
)

### Finisher: converts the `clean` annotations into a plain column
### (later cells read it as `finished_clean`)
finisher = Finisher().setInputCols(['clean'])


stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ | ]stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]


In [8]:
# Preprocessing pipeline: assemble -> tokenize -> remove stop words -> normalize
# -> lemmatize -> finish. Its document assembler reads the `body` column.
pre_pipeline = Pipeline(
      stages = [
          documentAssembler,
          tokenizer,
          stop_words,
          normalizer,
          lemmatizer,
          finisher
      ])

### Testing the pipeline on our test dataframe
# df_title only contains `id` and `title`, so running pre_pipeline on it cannot resolve
# the `body` input column. The smoke test therefore uses a pipeline with the same
# downstream stages but an assembler bound to `title`; pre_pipeline itself is untouched.
title_assembler = DocumentAssembler().setInputCol("title").setOutputCol("document")
title_pipeline = Pipeline(
      stages = [
          title_assembler,
          tokenizer,
          stop_words,
          normalizer,
          lemmatizer,
          finisher
      ])
preprocessed_df = title_pipeline.fit(df_title).transform(df_title)
preprocessed_df.show(5, truncate = False)

In [9]:
### Keeping only the columns needed downstream from each dataset
sub_required_cols = ["id", "title", "created_date", "date_clean", "Live_Thread", "War_Dummy"]
com_required_cols = ["id", "submission_id", "created_date", "date_clean", "body", "controversiality"]

df_sub_req = df_sub.select(sub_required_cols)
df_com_req = df_com.select(com_required_cols)


In [11]:
### Preprocessing the dataframes above
# Submissions carry their text in `title`, not `body`, so they need an assembler bound to
# `title`; the remaining stages are the same objects used by pre_pipeline.
sub_assembler = DocumentAssembler().setInputCol("title").setOutputCol("document")
sub_pipeline = Pipeline(
      stages = [
          sub_assembler,
          tokenizer,
          stop_words,
          normalizer,
          lemmatizer,
          finisher
      ])

# Keep the transformed DataFrames in their own variables: `.write.parquet(...)` returns
# None, so chaining it onto the assignment would leave these names bound to None and
# break the later cells that fit models on df_sub_preprocessed.
df_sub_preprocessed = sub_pipeline.fit(df_sub_req).transform(df_sub_req)
df_com_preprocessed = pre_pipeline.fit(df_com_req).transform(df_com_req)

### Persisting the preprocessed data
df_sub_preprocessed.write.parquet("s3a://ppol567-llj40-bucket-3/preprocessed_submissions")
df_com_preprocessed.write.parquet("s3a://ppol567-llj40-bucket-3/preprocessed_comments")

                                                                                

In [None]:
### Word Length Graphs
bin_width_sub = 10
bin_width_com = 5


def _with_word_count(frame, text_col):
    """Return `frame` with `no_of_words`: the number of pieces after splitting `text_col` on a single space."""
    return frame.withColumn('no_of_words', f.size(f.split(col(text_col), ' ')))


def _bucket_histogram(frame, bin_width):
    """Bucket `no_of_words` by `bin_width` and return the per-bucket row counts as a pandas frame, ordered by bucket."""
    bucket_expr = (col("no_of_words") / bin_width).cast('int')
    bucketed = frame.withColumn("bucket", bucket_expr).select("no_of_words", "bucket")
    return bucketed.groupby("bucket").count().orderBy("bucket").toPandas()


# Word counts per submission title and per comment body
df_sub = _with_word_count(df_sub, 'title')
df_com = _with_word_count(df_com, 'body')

# Bucketed distributions collected to pandas
df_sub_words = _bucket_histogram(df_sub, bin_width_sub)
df_com_words = _bucket_histogram(df_com, bin_width_com)

# CSV exports
df_sub_words.to_csv("Word_Distribution_Submissions.csv")
df_com_words.to_csv("Word_Distribution_Comments.csv")

In [None]:
### Most Common Words
### Training a count vectorizer
cv = CountVectorizer(inputCol = "finished_clean", outputCol = "features", minDF = 1.0)
cv_model = cv.fit(df_sub_preprocessed)
result = cv_model.transform(df_sub_preprocessed)

# The vector is cast to its string form so the bracketed index and value lists can be
# pulled out with a regex below.
cv_result = result.withColumn("features", col("features").cast('string')).select("finished_clean", "features")

#Function definition to get first parameter
def first_params(row):
    """Return the contents of the first [...] group in `row` (the term indices)."""
    match1 = re.findall(r"\[(.*?)\]", row)
    return match1[0]

#Function definition to get second parameter
def second_params(row):
    """Return the contents of the second [...] group in `row` (the per-term values)."""
    match2 = re.findall(r"\[(.*?)\]", row)
    return match2[1]

#Registering as udf (also reused by the TF-IDF cell)
udf_1 = f.udf(lambda z: first_params(z))
udf_2 = f.udf(lambda z: second_params(z))

### Extracting features through udfs
cv_result_final = cv_result.withColumn("param1", udf_1(f.col("features"))).withColumn("param2", udf_2(f.col("features")))

### Converting in a form to enable explode
cv_result_final = cv_result_final.withColumn("features", f.split(f.col("features"),",")) \
                                 .withColumn("param1", f.split(f.col("param1"), ",")) \
                                 .withColumn("param2", f.split(f.col("param2"), ","))

### Vocabulary lookup: position in cv_model.vocabulary -> word
vocab_df = spark.createDataFrame(
    [(idx, word) for idx, word in enumerate(cv_model.vocabulary)],
    "word_index int, word string",
)

### Implementing the explode
# Only param1 (indices) and param2 (counts) are zipped: they are positionally aligned with
# each other, whereas `finished_clean` lists the tokens in document order (with repeats)
# and must not be zipped against them. Words are recovered by joining on the vocabulary.
final_cv = (
    cv_result_final
    .select(f.explode(f.arrays_zip(f.col("param1"), f.col("param2"))).alias("Params"))
    # an empty document yields a single empty-string entry; drop it before casting
    .where(f.col("Params.param1") != "")
    .select(
        f.col("Params.param1").cast("int").alias("word_index"),
        f.col("Params.param2").cast("double").alias("frequency"),
    )
    .join(f.broadcast(vocab_df), on="word_index")
)

### Viewing the results
final_cv = final_cv.select('word', 'word_index', 'frequency')

final_cv.show(3)

### Filtering the top 15 words
# Total occurrences of a word = sum of its per-document counts.
top_words_df = (
    final_cv.groupby('word')
    .agg(f.sum('frequency').cast('long').alias('Total Occurence'))
    .orderBy(f.col('Total Occurence').desc())
    .limit(15)
    .toPandas()
)

### Saving as csv file
top_words_df.to_csv('top_words_df.csv', index=False)

In [None]:
### Most Important Words
### Obtaining TF-IDF
h_t = HashingTF(inputCol = "finished_clean", outputCol = "features")
tf = h_t.transform(df_sub_preprocessed)

idf = IDF(inputCol="features", outputCol="TF_IDF_features", minDocFreq=10)
idf_model = idf.fit(tf)
tfidf = idf_model.transform(tf)

tfidf.select("finished_clean", "TF_IDF_features").show(5, truncate=False)

# String form of the vector, so the bracketed index / score lists can be extracted
tfidf_result = tfidf.withColumn("TF_IDF_features", col("TF_IDF_features").cast('string')).select("finished_clean", "TF_IDF_features")

### Extracting features through udfs
tfidf_result_final = tfidf_result.withColumn("param1", udf_1(f.col("TF_IDF_features"))).withColumn("param2", udf_2(f.col("TF_IDF_features")))
### Converting in a form to enable explode
tfidf_result_final = tfidf_result_final.withColumn("TF_IDF_features", f.split(f.col("TF_IDF_features"),",")) \
                                       .withColumn("param1", f.split(f.col("param1"), ",")) \
                                       .withColumn("param2", f.split(f.col("param2"), ","))

### Word -> hashed index lookup
# The hashed indices in param1 are not in token order, so zipping them against
# `finished_clean` attaches scores to the wrong words. Instead, each distinct word is
# hashed on its own with the same HashingTF to learn which index it maps to.
word_lookup = (
    df_sub_preprocessed
    .select(f.explode("finished_clean").alias("word"))
    .distinct()
    .withColumn("finished_clean", f.array("word"))
)
word_lookup = h_t.transform(word_lookup).select(
    "word",
    udf_1(f.col("features").cast('string')).alias("word_index"),
)

### Implementing the explode
# One row per token occurrence; the score is read from the row's own vector at the
# position of the word's hashed index. Words that collide on an index share a score.
final_tfidf = (
    tfidf_result_final
    .select("TF_IDF_features", "param1", "param2", f.explode("finished_clean").alias("word"))
    .join(word_lookup, on="word")
    .withColumn("position", f.expr("array_position(param1, word_index)"))
    # array_position returns 0 when the index is absent; skip those rows
    .where(f.col("position") > 0)
    .select(
        "TF_IDF_features",
        "word",
        "word_index",
        f.expr("element_at(param2, cast(position as int))").alias("TF_IDF_Score"),
    )
)

### Viewing the results
final_tfidf.show(5)

# String type name instead of FloatType(), which this notebook never imports by name
final_tfidf = final_tfidf.withColumn("TF_IDF_Score", col("TF_IDF_Score").cast("float"))

### Filtering the top 12 words
tf_idf_df = final_tfidf.orderBy(f.col('TF_IDF_Score').desc()).select('word', 'TF_IDF_Score').limit(12).toPandas()
### Saving as csv file
tf_idf_df.to_csv('tf_idf_sub.csv', index=False)


In [12]:
# Shut down the Spark session now that the notebook's work is done.
spark.stop()