# Multi-Class Text Classification Using PySpark, MLlib & Doc2Vec

[Reference](https://medium.com/towards-artificial-intelligence/multi-class-text-classification-using-pyspark-mllib-doc2vec-dbfcee5b39f2)

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

MAX_MEMORY = "4g"
#     config("spark.driver.memory", MAX_MEMORY).

spark = SparkSession.builder. \
    appName("pyspark-nlp"). \
    config("spark.executor.memory", MAX_MEMORY). \
    getOrCreate()

# Dataset
https://www.kaggle.com/rmisra/news-category-dataset

In [3]:
%%time

# Load and repartition data
CORES = 8
df = spark.read.json('/dataset/news/huffingtonpost-news.json.gz').repartition(CORES * 8)

CPU times: user 20 ms, sys: 40 ms, total: 60 ms
Wall time: 22.7 s


In [4]:
df.rdd.getNumPartitions()

80

In [5]:
# df.count()
from pyspark.sql import functions as f

df.agg(f.approx_count_distinct(df.category).alias('distinct_categories')).show()

+-------------------+
|distinct_categories|
+-------------------+
|                 39|
+-------------------+



In [6]:
df.groupBy('category').count().orderBy('count', ascending=False).collect()

[Row(category='POLITICS', count=32739),
 Row(category='WELLNESS', count=17827),
 Row(category='ENTERTAINMENT', count=16058),
 Row(category='TRAVEL', count=9887),
 Row(category='STYLE & BEAUTY', count=9649),
 Row(category='PARENTING', count=8677),
 Row(category='HEALTHY LIVING', count=6694),
 Row(category='QUEER VOICES', count=6314),
 Row(category='FOOD & DRINK', count=6226),
 Row(category='BUSINESS', count=5937),
 Row(category='COMEDY', count=5175),
 Row(category='SPORTS', count=4884),
 Row(category='BLACK VOICES', count=4528),
 Row(category='HOME & LIVING', count=4195),
 Row(category='PARENTS', count=3955),
 Row(category='THE WORLDPOST', count=3664),
 Row(category='WEDDINGS', count=3651),
 Row(category='WOMEN', count=3490),
 Row(category='IMPACT', count=3459),
 Row(category='DIVORCE', count=3426),
 Row(category='CRIME', count=3405),
 Row(category='MEDIA', count=2815),
 Row(category='WEIRD NEWS', count=2670),
 Row(category='GREEN', count=2622),
 Row(category='WORLDPOST', count=2579),
 

In [7]:
# df[df.category == 'QUEER VOICES'].select('headline').show(truncate=False)
from pyspark.sql import functions as f

CATEGORIES = {
    'ENVIRONMENT' : ['ENVIRONMENT', 'GREEN'],
    'POLITICS' : ['POLITICS', 'BUSINESS', 'MONEY'],
    'TECH' : ['SCIENCE', 'TECH'],
    'OTHERS' : ['ARTS', 'ARTS & CULTURE', 'CULTURE & ARTS', 'STYLE & BEAUTY', 'STYLE', 'TASTE', 'FOOD & DRINK', 'TRAVEL', 'PARENTING', 'DIVORCE', 'EDUCATION', 'PARENTS', 'FIFTY', 'COLLEGE', 'WEDDINGS', 'WELLNESS', 'HEALTHY LIVING', 'HOME & LIVING', 'ENTERTAINMENT', 'COMEDY', 'THE WORLDPOST', 'WORLDPOST', 'MEDIA', 'WORLD NEWS', 'IMPACT', 'WEIRD NEWS', 'GOOD NEWS', 'WOMEN', 'QUEER VOICES', 'BLACK VOICES', 'LATINO VOICES']
}


cats_assigned = {}
for new_category, old_categories in CATEGORIES.items():
    for old_category in old_categories:
        cats_assigned[old_category] = new_category

# UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def reassign_categories(cat):
    if cat in cats_assigned.keys():
        return cats_assigned[cat]
    else:
        return cat

reassign_categories_udf = udf(lambda cat: reassign_categories(cat), StringType())

In [8]:
df = df.withColumn('new_cat', reassign_categories_udf(df.category))

In [9]:
df.groupBy('new_cat').count().orderBy('count', ascending=False).show()

+-----------+------+
|    new_cat| count|
+-----------+------+
|     OTHERS|141420|
|   POLITICS| 40383|
|     SPORTS|  4884|
|       TECH|  4260|
|ENVIRONMENT|  3945|
|      CRIME|  3405|
|   RELIGION|  2556|
+-----------+------+



In [10]:
# Reducing # of categories
# 
# KEEP_CATS = ['POLITICS', 'TECH', 'SPORTS', 'CRIME', 'RELIGION']
# df = df.filter(f.col('new_cat').isin(KEEP_CATS))

In [11]:
from pyspark.sql import functions as f

df = df. \
    drop('category'). \
    drop('authors'). \
    drop('link'). \
    drop('date'). \
    withColumn('raw_text', f.concat(df.headline, f.lit('. '), df.short_description)). \
    drop('headline'). \
    drop('short_description'). \
    withColumnRenamed('new_cat', 'category'). \
    cache()

In [12]:
df.printSchema()
df.show(truncate=False)

root
 |-- category: string (nullable = true)
 |-- raw_text: string (nullable = true)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|category   |raw_text                                                                                                                                                                                                                                                                                                                        |
+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [13]:
import gensim.parsing.preprocessing as gsp
from gensim import utils

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

In [14]:
# Preprocessing UDF
# https://changhsinlee.com/pyspark-udf/

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def clean_text(x):
    s = x.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

clean_text_udf = udf(lambda text: clean_text(text), StringType())

In [15]:
# Creating Field to Split Dataset
SEED=42
TEST_SIZE=0.3

from pyspark.sql.functions import rand, when
df = df.withColumn('train', when(rand(seed=SEED) >= TEST_SIZE, True).otherwise(False))

In [16]:
df_raw = df.withColumn('text', clean_text_udf(df.raw_text)).drop('raw_text')

In [17]:
df_raw.show(truncate=False)

+-----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|category   |train|text                                                                                                                                                |
+-----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------+
|OTHERS     |true |easili clean easter grass common household item photo left stubborn grass                                                                           |
|OTHERS     |true |save life ic ic stand case emerg hospit polic firemen paramed know ic mean happen love emerg personnel search phone contact call ic                 |
|POLITICS   |true |trump miracul come paul ryan farc session amaz happen huh                                                                               

In [18]:
# Dataset distribution
df_raw.groupBy('train').count().show()

+-----+------+
|train| count|
+-----+------+
| true|140500|
|false| 60353|
+-----+------+



# Preparing `category` field with `StringIndexer`

In [19]:
from pyspark.ml.feature import StringIndexer
str_indexer = StringIndexer(inputCol="category", outputCol="label")

# Preparing `text` field with `Word2Vec`

In [20]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="tokens", outputCol="features")

In [21]:
%%time
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[str_indexer, tokenizer, word2vec])
model = pipeline.fit(df_raw)

CPU times: user 220 ms, sys: 50 ms, total: 270 ms
Wall time: 3min 22s


In [22]:
%%time
df_w2v = model.transform(df_raw)

CPU times: user 100 ms, sys: 30 ms, total: 130 ms
Wall time: 4.97 s


In [23]:
%%time
df_w2v.show()

+-----------+-----+--------------------+-----+--------------------+--------------------+
|   category|train|                text|label|              tokens|            features|
+-----------+-----+--------------------+-----+--------------------+--------------------+
|     OTHERS| true|easili clean east...|  0.0|[easili, clean, e...|[0.06482430809939...|
|     OTHERS| true|save life ic ic s...|  0.0|[save, life, ic, ...|[0.01097476915658...|
|   POLITICS| true|trump miracul com...|  1.0|[trump, miracul, ...|[-0.0900629305513...|
|   POLITICS| true|new roundup juli ...|  1.0|[new, roundup, ju...|[0.10023661529911...|
|     OTHERS| true|seth meyer roast ...|  0.0|[seth, meyer, roa...|[-0.2318754609674...|
|     OTHERS| true|new children book...|  0.0|[new, children, b...|[0.09767392983373...|
|   POLITICS|false|educ depart near ...|  1.0|[educ, depart, ne...|[-0.0189133385817...|
|     OTHERS|false|talk child dark k...|  0.0|[talk, child, dar...|[-0.0075337088637...|
|ENVIRONMENT| true|th

In [24]:
# Split Datasets
# Thanks Pedro Ferrari for this trick ;)
import pyspark.sql.functions as f
df_train = df_w2v.filter(f.col('train') == True)
df_test = df_w2v.filter(f.col('train') == False)

# Random Forest

In [25]:
from pyspark.ml.classification import RandomForestClassifier

rf_classifier = RandomForestClassifier(labelCol="label", featuresCol="features")
rf_classifier_pipeline = Pipeline(stages=[rf_classifier])

In [26]:
%%time

rf_model = rf_classifier_pipeline.fit(df_train)

CPU times: user 110 ms, sys: 110 ms, total: 220 ms
Wall time: 2min 6s


In [27]:
# %%time
# 
# rf_model.save('/dataset/news/random_forest.model')

In [28]:
%%time

rf_predictions = rf_model.transform(df_test)

CPU times: user 180 ms, sys: 100 ms, total: 280 ms
Wall time: 1.01 s


In [29]:
%%time

rf_predictions.select('category', 'label', 'prediction').show()

+-----------+-----+----------+
|   category|label|prediction|
+-----------+-----+----------+
|   POLITICS|  1.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|ENVIRONMENT|  4.0|       1.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|       TECH|  3.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|   POLITICS|  1.0|       1.0|
|       TECH|  3.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     OTHERS|  0.0|       0.0|
|     SPORTS|  2.0|       0.0|
|     SPORTS|  2.0|       0.0|
+-----------+-----+----------+
only showing top 20 rows

CPU times: user 50 ms, sys: 0 ns, total: 50 ms
Wall time: 968 ms


In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf_model_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")

accuracy = rf_model_evaluator.evaluate(rf_predictions)
print("Accuracy = %g" % (accuracy))

Accuracy = 0.782049
