In [15]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from nltk.classify import NaiveBayesClassifier

from toxic_funcs import Cleaner

In [None]:
# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName('tokenizer').getOrCreate()
# SQLContext is documented to take a SparkContext as its first argument (not a
# SparkSession) and is deprecated in favour of SparkSession. It is kept here,
# constructed with the documented arguments, only so that any later cell that
# still refers to `sqlCtx` keeps working.
sqlCtx = SQLContext(spark.sparkContext, spark)
# Reading the CSV directly with Spark gave unusable results, so the file is
# loaded with pandas first and then converted to a Spark DataFrame.
panda_df = pd.read_csv("data/train.csv")
# Convert through the session itself; no SQLContext is needed for this.
pyspark_dataframe = spark.createDataFrame(panda_df)
# Preview the first 100 comments, then the default preview of every column.
pyspark_dataframe.select("comment_text").show(100)
pyspark_dataframe.show()



In [None]:
# Configure a Spark ML Tokenizer that reads `comment_text` and writes its
# result to a new `tokenized_text` column.
tokenizer = (
    Tokenizer()
    .setInputCol("comment_text")
    .setOutputCol("tokenized_text")
)
# Apply it to the loaded comments and preview the resulting DataFrame.
tokenized = tokenizer.transform(pyspark_dataframe)
tokenized.show()

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Only fetch the VADER lexicon when it is not already installed locally, so
# re-running the notebook does not contact the NLTK download server each time.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

In [15]:
# Create the VADER analyzer once; it is reused below to score every comment.
sid = SentimentIntensityAnalyzer()

In [16]:
# Bring the `comment_text` column to the driver once, as a list of Row objects.
comments = pyspark_dataframe.select("comment_text").collect()
# Build the plain Python list of comment strings from the rows already
# collected, instead of running a second full collection via toPandas().
comment_list = [row["comment_text"] for row in comments]

159571


In [19]:
# Score the first 100 comments with VADER and print one line per comment, so
# the scores of different comments no longer run together on a single line.
for comment in comment_list[:100]:
    # Cleaner comes from toxic_funcs; presumably it returns the cleaned
    # comment text as a string -- TODO confirm against that module.
    cleaned = Cleaner(comment)
    ss = sid.polarity_scores(cleaned)
    # Keys are printed in sorted order, as "key: value" pairs joined by ", ".
    print(', '.join('{0}: {1}'.format(k, ss[k]) for k in sorted(ss)))

compound: 0.5574, neg: 0.0, neu: 0.897, pos: 0.103, compound: 0.2942, neg: 0.099, neu: 0.743, pos: 0.158, compound: -0.1779, neg: 0.083, neu: 0.849, pos: 0.068, compound: 0.5106, neg: 0.022, neu: 0.916, pos: 0.062, compound: 0.6808, neg: 0.0, neu: 0.663, pos: 0.337, compound: 0.7964, neg: 0.0, neu: 0.464, pos: 0.536, compound: -0.7783, neg: 0.531, neu: 0.469, pos: 0.0, compound: -0.1779, neg: 0.129, neu: 0.773, pos: 0.099, compound: -0.802, neg: 0.109, neu: 0.891, pos: 0.0, compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, compound: 0.991, neg: 0.019, neu: 0.877, pos: 0.104, compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, compound: 0.3034, neg: 0.13, neu: 0.673, pos: 0.197, compound: -0.4015, neg: 0.12, neu: 0.783, pos: 0.097, compound: 0.128, neg: 0.16, neu: 0.668, pos: 0.171, compound: -0.5672, neg: 0.071, neu: 0.888, pos: 0.041, compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, compound: -0.0423, neg: 0.099, neu: 0.809, pos: 0.092, compound: 0.9188, n