In [7]:
from pyspark.sql import SparkSession

# Build the session used by every cell below. getOrCreate() is called on a
# builder configured with the application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

# Corpus

In [3]:
# Toy corpus: four (user_id, review) rows used to demonstrate each
# text-preprocessing step before moving to the real dataset.
df = spark.createDataFrame(
    data=[
        (1, 'I really liked this movie'),
        (2, 'I would recommend this movie to my friends'),
        (3, 'movie was alright but acting was horrible'),
        (4, 'I am never watching that movie ever again'),
    ],
    schema=['user_id', 'review'],
)

In [6]:
# Print the toy corpus (long review strings are truncated in the display).
df.show()

+-------+--------------------+
|user_id|              review|
+-------+--------------------+
|      1|I really liked th...|
|      2|I would recommend...|
|      3|movie was alright...|
|      4|I am never watchi...|
+-------+--------------------+



# Tokenize

In [5]:
from pyspark.ml.feature import Tokenizer

# Turn each 'review' string into a 'tokens' array column. As the output
# shows, the resulting tokens are lower-cased.
tokenization = Tokenizer().setInputCol('review').setOutputCol('tokens')
tokenized_df = tokenization.transform(df)
tokenized_df.show()

+-------+--------------------+--------------------+
|user_id|              review|              tokens|
+-------+--------------------+--------------------+
|      1|I really liked th...|[i, really, liked...|
|      2|I would recommend...|[i, would, recomm...|
|      3|movie was alright...|[movie, was, alri...|
|      4|I am never watchi...|[i, am, never, wa...|
+-------+--------------------+--------------------+



# Stopwords Removal

In [9]:
from pyspark.ml.feature import StopWordsRemover

# Drop stop words from 'tokens', writing the survivors to 'refined_tokens'.
stopword_removal = StopWordsRemover(outputCol='refined_tokens', inputCol='tokens')
refined_df = stopword_removal.transform(tokenized_df)

# Show the token lists before and after removal side by side.
refined_df.select('user_id', 'tokens', 'refined_tokens').show()

+-------+--------------------+--------------------+
|user_id|              tokens|      refined_tokens|
+-------+--------------------+--------------------+
|      1|[i, really, liked...|[really, liked, m...|
|      2|[i, would, recomm...|[recommend, movie...|
|      3|[movie, was, alri...|[movie, alright, ...|
|      4|[i, am, never, wa...|[never, watching,...|
+-------+--------------------+--------------------+



# Bag of Words

# Count Vectorizer

In [11]:
from pyspark.ml.feature import CountVectorizer

# Bag-of-words: fit a vocabulary on the refined tokens, then encode every
# row as a sparse count vector in the 'features' column.
count_vec = CountVectorizer(outputCol='features', inputCol='refined_tokens')
toy_cv_model = count_vec.fit(refined_df)
cv_df = toy_cv_model.transform(refined_df)
cv_df.select('user_id', 'refined_tokens', 'features').show()

+-------+--------------------+--------------------+
|user_id|      refined_tokens|            features|
+-------+--------------------+--------------------+
|      1|[really, liked, m...|(11,[0,1,8],[1.0,...|
|      2|[recommend, movie...|(11,[0,3,10],[1.0...|
|      3|[movie, alright, ...|(11,[0,4,7,9],[1....|
|      4|[never, watching,...|(11,[0,2,5,6],[1....|
+-------+--------------------+--------------------+



In [12]:
# Display the vocabulary learned by the count vectorizer.
# NOTE(review): this calls fit() a second time on refined_df just to read
# the vocabulary; the model fitted in the previous cell is not reused.
count_vec.fit(refined_df).vocabulary

['movie',
 'horrible',
 'liked',
 'alright',
 'friends',
 'really',
 'watching',
 'ever',
 'recommend',
 'acting',
 'never']

# TF-IDF

In [14]:
from pyspark.ml.feature import HashingTF, IDF

# Term frequencies via HashingTF: 'refined_tokens' is transformed into the
# sparse 'tf_features' vector (no fit step is needed, only transform).
hashing_vec = HashingTF(outputCol='tf_features', inputCol='refined_tokens')
hashing_df = hashing_vec.transform(refined_df)
hashing_df.select('user_id', 'refined_tokens', 'tf_features').show(n=4)

+-------+--------------------+--------------------+
|user_id|      refined_tokens|         tf_features|
+-------+--------------------+--------------------+
|      1|[really, liked, m...|(262144,[14,32675...|
|      2|[recommend, movie...|(262144,[129613,1...|
|      3|[movie, alright, ...|(262144,[80824,15...|
|      4|[never, watching,...|(262144,[63139,15...|
+-------+--------------------+--------------------+



In [15]:
# Fit IDF on the hashed term frequencies, then apply it to the same frame
# to obtain the 'tf_idf_features' column.
tf_idf_vec = IDF(outputCol='tf_idf_features', inputCol='tf_features')
toy_idf_model = tf_idf_vec.fit(hashing_df)
tf_idf_df = toy_idf_model.transform(hashing_df)
tf_idf_df.select('user_id', 'tf_idf_features').show()

+-------+--------------------+
|user_id|     tf_idf_features|
+-------+--------------------+
|      1|(262144,[14,32675...|
|      2|(262144,[129613,1...|
|      3|(262144,[80824,15...|
|      4|(262144,[63139,15...|
+-------+--------------------+



# Text Classification using Machine Learning

In [17]:
# Load the labeled reviews. The first line of the file is the header and
# column types are inferred from the data.
text_df = spark.read.csv(
    path='Movie_reviews.csv',
    sep=',',
    header=True,
    inferSchema=True,
)
text_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [18]:
# Number of rows as loaded from the CSV, before any filtering.
text_df.count()

7087

In [20]:
# Keep only the rows whose Sentiment is exactly '0' or '1'; any other value
# is treated as a mislabeled record and dropped.
text_df = text_df.filter(text_df['Sentiment'].isin('1', '0'))
text_df.count()

6990

In [21]:
# Class balance: number of remaining reviews per Sentiment value.
text_df.groupBy('Sentiment').count().show()

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [22]:
from pyspark.sql.functions import rand

# Eyeball ten rows in random order (sorting by an unseeded rand() column,
# so the rows shown differ from run to run).
text_df.sort(rand()).show(n=10)

+--------------------+---------+
|              Review|Sentiment|
+--------------------+---------+
|oh and i loved th...|        1|
|Da Vinci Code suc...|        0|
|I want to be here...|        1|
|The Da Vinci Code...|        1|
|I love its Harry ...|        1|
|Love luv lubb the...|        1|
|These Harry Potte...|        0|
|Harry Potter drag...|        0|
|""" brokeback mou...|        0|
|the last stand an...|        1|
+--------------------+---------+
only showing top 10 rows



In [23]:
# Add a float 'Label' column cast from the string 'Sentiment' column, then
# drop the string column.
text_df = text_df.withColumn('Label', text_df['Sentiment'].astype('float'))
text_df = text_df.drop('Sentiment')
text_df.sort(rand()).show(n=10)

+--------------------+-----+
|              Review|Label|
+--------------------+-----+
|Da Vinci Code suc...|  0.0|
|i love kirsten / ...|  1.0|
|i heard da vinci ...|  0.0|
|I am going to sta...|  1.0|
|Combining the opi...|  0.0|
|the story of Harr...|  1.0|
|I did lapse once,...|  0.0|
|then we went to m...|  1.0|
|i liked the Da Vi...|  1.0|
|lol ya and then i...|  1.0|
+--------------------+-----+
only showing top 10 rows



In [24]:
from pyspark.sql.functions import length

In [25]:
# Add a 'length' column holding the character length of each review.
review_length = length(text_df.Review)
text_df = text_df.withColumn('length', review_length)
text_df.sort(rand()).show(n=10)

+--------------------+-----+------+
|              Review|Label|length|
+--------------------+-----+------+
|Back in Melbourne...|  1.0|    72|
|by the way, the D...|  0.0|    62|
|Oh, and Brokeback...|  0.0|    48|
|Friday I went out...|  1.0|    72|
|friday hung out w...|  0.0|    72|
|Which is why i sa...|  1.0|    72|
|The Da Vinci Code...|  1.0|    71|
|"Anyway, thats wh...|  1.0|    49|
|Oh, and Brokeback...|  0.0|    48|
|Brokeback Mountai...|  1.0|    34|
+--------------------+-----+------+
only showing top 10 rows



In [26]:
# Average review length per class. The column is referenced by the exact
# name it was created with ('length'); the previous spelling 'Length' only
# resolved because Spark SQL matches column names case-insensitively by
# default and would fail with spark.sql.caseSensitive enabled.
text_df.groupBy('Label').agg({'length': 'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



In [30]:
# Apply the same two preprocessing steps to the real reviews:
# 'Review' -> 'tokens' -> 'refined_tokens' (stop words removed).
tokenization = Tokenizer().setInputCol('Review').setOutputCol('tokens')
stopwords_removal = StopWordsRemover().setInputCol('tokens').setOutputCol('refined_tokens')

tokenized_df = tokenization.transform(text_df)
refined_text_df = stopwords_removal.transform(tokenized_df)

In [31]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rand, col, size

# Number of tokens left after stop-word removal, stored as 'token_count'.
# The built-in `size` column function replaces the former Python lambda UDF
# around len(): it yields the same integer for every array while avoiding
# a Python UDF for a computation Spark already provides natively.
refined_text_df = refined_text_df.withColumn('token_count', size(col('refined_tokens')))
refined_text_df.orderBy(rand()).show(10)

+--------------------+-----+------+--------------------+--------------------+-----------+
|              Review|Label|length|              tokens|      refined_tokens|token_count|
+--------------------+-----+------+--------------------+--------------------+-----------+
|Brokeback Mountai...|  0.0|    40|[brokeback, mount...|[brokeback, mount...|          4|
|These Harry Potte...|  0.0|    38|[these, harry, po...|[harry, potter, m...|          5|
|The Da Vinci Code...|  0.0|    34|[the, da, vinci, ...|[da, vinci, code,...|          6|
|So as felicia's m...|  1.0|    71|[so, as, felicia'...|[felicia's, mom, ...|          7|
|man i loved broke...|  1.0|    31|[man, i, loved, b...|[man, loved, brok...|          4|
|we're gonna like ...|  1.0|    51|[we're, gonna, li...|[gonna, like, wat...|          6|
|DA VINCI CODE IS ...|  1.0|    26|[da, vinci, code,...|[da, vinci, code,...|          4|
|Harry Potter is A...|  1.0|    66|[harry, potter, i...|[harry, potter, a...|          7|
|, she hel

In [32]:
# Bag-of-words over the real reviews: fit the vocabulary on the refined
# tokens and encode each review as a sparse count vector in 'features'.
count_vec = CountVectorizer(outputCol='features', inputCol='refined_tokens')
text_cv_model = count_vec.fit(refined_text_df)
cv_text_df = text_cv_model.transform(refined_text_df)
cv_text_df.select('refined_tokens', 'token_count', 'features', 'Label').show(n=10)

+--------------------+-----------+--------------------+-----+
|      refined_tokens|token_count|            features|Label|
+--------------------+-----------+--------------------+-----+
|[da, vinci, code,...|          5|(2302,[0,1,4,43,2...|  1.0|
|[first, clive, cu...|          9|(2302,[11,51,229,...|  1.0|
|[liked, da, vinci...|          5|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|          5|(2302,[0,1,4,53,3...|  1.0|
|[liked, da, vinci...|          8|(2302,[0,1,4,53,6...|  1.0|
|[even, exaggerati...|          6|(2302,[46,229,271...|  1.0|
|[loved, da, vinci...|          8|(2302,[0,1,22,30,...|  1.0|
|[thought, da, vin...|          7|(2302,[0,1,4,228,...|  1.0|
|[da, vinci, code,...|          6|(2302,[0,1,4,33,2...|  1.0|
|[thought, da, vin...|          7|(2302,[0,1,4,223,...|  1.0|
+--------------------+-----------+--------------------+-----+
only showing top 10 rows



In [36]:
# Keep only the columns the model needs: the count vector, the token count
# and the label.
model_text_df = cv_text_df.select('features', 'token_count', 'Label')

In [37]:
from pyspark.ml.feature import VectorAssembler

# Combine the count vector and the scalar token count into a single
# 'features_vec' vector column for the classifier.
df_assembler = VectorAssembler(
    outputCol='features_vec',
    inputCols=['features', 'token_count'],
)
model_text_df = df_assembler.transform(model_text_df)
model_text_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- token_count: integer (nullable = true)
 |-- Label: float (nullable = true)
 |-- features_vec: vector (nullable = true)



In [38]:
from pyspark.ml.classification import LogisticRegression

# 75/25 train/test split. A fixed seed is passed so the sampling is no
# longer re-randomized on every run; without it the split sizes and every
# metric computed below change each time the notebook is executed.
train_df, test_df = model_text_df.randomSplit([.75, .25], seed=42)

In [39]:
# Class balance of the training split.
train_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2907|
|  0.0| 2318|
+-----+-----+



In [40]:
# Class balance of the test split.
test_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 1002|
|  0.0|  763|
+-----+-----+



In [42]:
# Fit logistic regression on the training split, then score the test split.
# `log_reg` holds the fitted model; `results` is the predictions DataFrame
# taken from the evaluation summary.
lr_estimator = LogisticRegression(labelCol='Label', featuresCol='features_vec')
log_reg = lr_estimator.fit(train_df)
test_summary = log_reg.evaluate(test_df)
results = test_summary.predictions
results.show()

+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|            features|token_count|Label|        features_vec|       rawPrediction|         probability|prediction|
+--------------------+-----------+-----+--------------------+--------------------+--------------------+----------+
|(2302,[0,1,4,5,12...|          7|  1.0|(2303,[0,1,4,5,12...|[-41.867867077536...|[6.56169982694168...|       1.0|
|(2302,[0,1,4,5,64...|          6|  1.0|(2303,[0,1,4,5,64...|[-18.216681482391...|[1.22629723287176...|       1.0|
|(2302,[0,1,4,5,30...|          5|  1.0|(2303,[0,1,4,5,30...|[-19.613921240322...|[3.03237030807226...|       1.0|
|(2302,[0,1,4,5,44...|          5|  1.0|(2303,[0,1,4,5,44...|[-22.227997429774...|[2.22076892814667...|       1.0|
|(2302,[0,1,4,5,65...|          5|  1.0|(2303,[0,1,4,5,65...|[-16.828934597592...|[4.91231883054462...|       1.0|
|(2302,[0,1,4,5,82...|          6|  1.0|(2303,[0,1,4,5,82...|[-15.138940634494..

In [43]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Confusion-matrix cells, each obtained by filtering the test predictions
# on an (actual label, predicted label) pair and counting the rows.
actual, predicted = results['Label'], results['prediction']

tp = results.where((actual == 1) & (predicted == 1)).count()  # true positives
tn = results.where((actual == 0) & (predicted == 0)).count()  # true negatives
fp = results.where((actual == 0) & (predicted == 1)).count()  # false positives
fn = results.where((actual == 1) & (predicted == 0)).count()  # false negatives

In [44]:
# Recall: of the reviews whose label is 1, the fraction predicted as 1.
recall = tp / float(tp + fn)
recall

0.9930139720558883

In [45]:
# Precision: of the reviews predicted as 1, the fraction whose label is 1.
precision = tp / float(tp + fp)
precision

0.9688412852969815

In [47]:
# Accuracy: correct predictions over all scored test rows.
# Labels were restricted to 0/1 earlier and the predictions come from a
# binary classifier, so the four confusion-matrix counts add up to the
# number of test rows. Their sum is used as the denominator instead of
# calling results.count() again, which would launch another Spark job.
accuracy = float(tp + tn) / (tp + tn + fp + fn)
accuracy

0.9779036827195468