In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from nltk.classify import NaiveBayesClassifier
from toxic_funcs import Cleaner

In [2]:
# Start Spark session
spark = SparkSession.builder.appName('tokenizer').getOrCreate()
# Using directly pyspark is giving rubbish so creating pandas df and converting to pyspark dataframe
sqlCtx = SQLContext(spark)
# Pandas dataframe
panda_df = pd.read_csv("data/train.csv")
# Converting pandas to pyspark
pyspark_dataframe = sqlCtx.createDataFrame(panda_df)
pyspark_dataframe.select("comment_text").show(100)
pyspark_dataframe.show()



+--------------------+
|        comment_text|
+--------------------+
|Explanation
Why t...|
|D'aww! He matches...|
|Hey man, I'm real...|
|"
More
I can't ma...|
|You, sir, are my ...|
|"

Congratulation...|
|COCKSUCKER BEFORE...|
|Your vandalism to...|
|Sorry if the word...|
|alignment on this...|
|"
Fair use ration...|
|bbq 

be a man an...|
|Hey... what is it...|
|Before you start ...|
|Oh, and the girl ...|
|"

Juelz Santanas...|
|Bye! 

Don't look...|
|REDIRECT Talk:Voy...|
|The Mitsurugi poi...|
|Don't mean to bot...|
|"

 Regarding you...|
|"
Good to know. A...|
|"

 Snowflakes ar...|
|"

 The Signpost:...|
|"

Re-considering...|
|Radial symmetry 
...|
|There's no need t...|
|Yes, because the ...|
|"
Ok. But it will...|
|"== A barnstar fo...|
|How could I post ...|
|Not sure about a ...|
|Praise 

looked a...|
|I was able to pos...|
|"
Well, not ""bef...|
|"

Not at all, yo...|
|"

 ""Mainland As...|
|pretty much every...|
|Hi Explicit, can ...|
|Notability of Rur...|
|"
 Sure, b

In [7]:
# Tokenizing the comments
tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized_text")
tokenized = tokenizer.transform(pyspark_dataframe)
tokenized.show()

+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|      tokenized_text|
+----------------+--------------------+-----+------------+-------+------+------+-------------+--------------------+
|0000997932d777bf|Explanation
Why t...|    0|           0|      0|     0|     0|            0|[explanation, why...|
|000103f0d9cfb60f|D'aww! He matches...|    0|           0|      0|     0|     0|            0|[d'aww!, he, matc...|
|000113f07ec002fd|Hey man, I'm real...|    0|           0|      0|     0|     0|            0|[hey, man,, i'm, ...|
|0001b41b1c6bb37e|"
More
I can't ma...|    0|           0|      0|     0|     0|            0|[", more, i, can'...|
|0001d958c54c6e35|You, sir, are my ...|    0|           0|      0|     0|     0|            0|[you,, sir,, are,...|
|00025465d4725e87|"

Congratulation...|    0|           0|      0|     0

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')



[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:749)>


False

In [9]:
sid = SentimentIntensityAnalyzer()

In [10]:
comments = pyspark_dataframe.select("comment_text").collect()
#Convert the dataframe column to list
comment_list = list(
    pyspark_dataframe.select('comment_text').toPandas()['comment_text']
)

In [21]:
negatives = []
positives = []
neutral = []
for comment in comment_list:
    cleaned = Cleaner(comment)
    ss = sid.polarity_scores(cleaned)
    for k in sorted(ss):
        if(k == "neg"):
            negatives.append(ss[k])
        elif(k == 'pos'):
            positives.append(ss[k])
        elif(k == 'neu'):
            neutral.append(ss[k])
        #negatives.append()
        #print('{0}: {1}, '.format(k, ss[k]), end='')

In [24]:
negative_col = pd.Series(negatives)
positive_col = pd.Series(positives)
neutral_col = pd.Series(neutral)

In [25]:
panda_df['overall_negative_sent'] = negative_col.values
panda_df['overall_positive_sent'] = positive_col.values
panda_df['overall_neutral_sent'] = neutral_col.values

In [26]:
panda_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,overall_negative_sent,overall_positive_sent,overall_neutral_sent
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0.0,0.103,0.897
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0.101,0.146,0.754
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0.083,0.068,0.849
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0.043,0.062,0.895
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0.0,0.337,0.663
