# Multi-Class Text Classification Using PySpark, MLlib & Doc2Vec

[Reference](https://medium.com/towards-artificial-intelligence/multi-class-text-classification-using-pyspark-mllib-doc2vec-dbfcee5b39f2)

In [1]:
# Run this cell before any cell that imports pyspark.
# NOTE(review): findspark.init() is expected to make the local Spark
# installation importable — confirm against the environment setup.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession

# Memory setting passed to Spark as `spark.executor.memory`.
MAX_MEMORY = "4g"

# NOTE: only executor memory is configured; a `spark.driver.memory` setting
# existed here as a disabled (commented-out) line.

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("pyspark-nlp")
    .config("spark.executor.memory", MAX_MEMORY)
    .getOrCreate()
)

# Dataset
Source: [News Category Dataset (Kaggle)](https://www.kaggle.com/rmisra/news-category-dataset)

In [None]:
%%time

# Read the news JSON file and redistribute it over CORES * 10 partitions.
CORES = 8
df = (
    spark.read
    .json('/dataset/news/huffingtonpost-news.json')
    .repartition(CORES * 10)
)

In [None]:
# Show how many partitions back the DataFrame.
df.rdd.getNumPartitions()

In [None]:
# `df.count()` is left disabled in this cell.
from pyspark.sql import functions as f

# Print the approximate number of distinct values in the `category` column.
df.agg(
    f.approx_count_distinct(f.col('category')).alias('distinct_categories')
).show()

In [None]:
# Row count per original category, largest first; collect() returns the
# result rows to the driver so the notebook displays them as a list.
df.groupBy('category').count().orderBy('count', ascending=False).collect()

In [None]:
# df[df.category == 'QUEER VOICES'].select('headline').show(truncate=False)
from pyspark.sql import functions as f

# Consolidated category -> list of original categories folded into it.
CATEGORIES = {
    'ARTS': [
        'ARTS', 'ARTS & CULTURE', 'CULTURE & ARTS', 'STYLE & BEAUTY', 'STYLE',
        'TASTE', 'FOOD & DRINK', 'TRAVEL', 'ENTERTAINMENT', 'COMEDY',
    ],
    'ENVIRONMENT': ['ENVIRONMENT', 'GREEN'],
    'LIFE': [
        'PARENTING', 'DIVORCE', 'EDUCATION', 'PARENTS', 'FIFTY', 'COLLEGE',
        'WEDDINGS', 'WELLNESS', 'HEALTHY LIVING', 'HOME & LIVING',
    ],
    'POLITICS': ['POLITICS', 'BUSINESS', 'MONEY'],
    'TECH': ['SCIENCE', 'TECH'],
    'NEWS': [
        'THE WORLDPOST', 'WORLDPOST', 'MEDIA', 'WORLD NEWS', 'IMPACT',
        'WEIRD NEWS', 'GOOD NEWS', 'WOMEN', 'QUEER VOICES', 'BLACK VOICES',
        'LATINO VOICES',
    ],
}

# Inverted lookup: original category -> consolidated category.
cats_assigned = {
    original: consolidated
    for consolidated, originals in CATEGORIES.items()
    for original in originals
}

# UDF
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def reassign_categories(cat):
    """Return the consolidated category for an original category.

    Looks ``cat`` up in the module-level ``cats_assigned`` mapping; a value
    with no entry there is returned unchanged.
    """
    return cats_assigned.get(cat, cat)

# Spark UDF wrapper around reassign_categories, declared with a string return type.
reassign_categories_udf = udf(lambda cat: reassign_categories(cat), StringType())

In [None]:
# Add `new_cat`: the consolidated category computed from `category` by the UDF.
df = df.withColumn('new_cat', reassign_categories_udf(df.category))

In [None]:
# Row count per consolidated category, largest first.
df.groupBy('new_cat').count().orderBy('count', ascending=False).show()

In [None]:
# Reduce the number of categories: keep only rows whose consolidated
# category is one of the values listed in KEEP_CATS.
KEEP_CATS = ['POLITICS', 'TECH', 'SPORTS', 'CRIME', 'RELIGION']
df = df.filter(f.col('new_cat').isin(*KEEP_CATS))

In [None]:
from pyspark.sql import functions as f

# Build `raw_text` as "<headline>. <short_description>", drop every column
# that is no longer needed, rename `new_cat` to `category`, and cache the result.
df = (
    df
    .withColumn(
        'raw_text',
        f.concat(df.headline, f.lit('. '), df.short_description),
    )
    .drop('category', 'authors', 'link', 'date', 'headline', 'short_description')
    .withColumnRenamed('new_cat', 'category')
    .cache()
)

In [None]:
# Inspect the reduced DataFrame: print its schema, then rows without truncating cell text.
df.printSchema()
df.show(truncate=False)

In [None]:
import gensim.parsing.preprocessing as gsp
from gensim import utils

# gensim preprocessing functions; clean_text applies them to each text one
# after another in exactly this list order, so the order matters.
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

In [None]:
# Preprocessing UDF
# https://changhsinlee.com/pyspark-udf/

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def clean_text(x):
    """Normalize one raw text value before tokenization.

    Lower-cases the text, converts it with ``utils.to_unicode`` and then
    passes it through every function in ``filters``, in list order.

    Args:
        x: Raw text, or ``None`` for a null column value.

    Returns:
        The cleaned text, or an empty string when ``x`` is ``None``.
    """
    # A null column value reaches a Python UDF as None; without this guard
    # `.lower()` raises AttributeError and the whole Spark job fails.
    if x is None:
        return ""
    cleaned = utils.to_unicode(x.lower())
    # Named `text_filter` rather than `f` so it does not shadow the
    # `pyspark.sql.functions as f` alias used elsewhere in the notebook.
    for text_filter in filters:
        cleaned = text_filter(cleaned)
    return cleaned

# Spark UDF wrapper around clean_text, declared with a string return type.
clean_text_udf = udf(lambda text: clean_text(text), StringType())

In [None]:
# Add a boolean `train` column that is later used to split the dataset.
from pyspark.sql.functions import rand, when

SEED = 42
TEST_SIZE = 0.3

# A row is flagged True when its seeded random draw is >= TEST_SIZE,
# and False otherwise.
df = df.withColumn(
    'train',
    when(rand(seed=SEED) >= TEST_SIZE, True).otherwise(False),
)

In [None]:
# Replace `raw_text` with `text`, the output of the clean_text UDF.
df_raw = df.withColumn('text', clean_text_udf(df.raw_text)).drop('raw_text')

In [None]:
# Preview the cleaned rows without truncating cell text.
df_raw.show(truncate=False)

In [None]:
# Dataset distribution: number of rows per value of the `train` flag.
df_raw.groupBy('train').count().show()

# Preparing `category` field with `StringIndexer`

In [None]:
from pyspark.ml.feature import StringIndexer
# Pipeline stage that reads the `category` column and writes a `label` column.
str_indexer = StringIndexer(inputCol="category", outputCol="label")

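# Preparing `text` field with `Word2Vec`
Spark's `Word2Vec` model represents each document as the average of the vectors of its words, so every row gets a single document-level feature vector.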

In [None]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import Tokenizer

# Tokenizer stage: reads the cleaned `text` column and writes `tokens`.
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
# Word2Vec stage: reads `tokens` and writes `features`, configured with
# vectorSize=100 and minCount=1.
word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="tokens", outputCol="features")

In [None]:
%%time
from pyspark.ml import Pipeline

# Feature pipeline: label indexing, then tokenization, then Word2Vec.
pipeline = Pipeline(stages=[str_indexer, tokenizer, word2vec])
# NOTE(review): the pipeline is fit on all of df_raw, i.e. including rows
# flagged train == False — confirm that fitting on the test rows is intended.
model = pipeline.fit(df_raw)

In [None]:
%%time
# Apply the fitted feature pipeline to every row of df_raw.
df_w2v = model.transform(df_raw)

In [None]:
%%time
# Preview the rows produced by the feature pipeline.
df_w2v.show()

In [None]:
# Split the featurized rows into train and test frames using the boolean
# `train` flag column (flag-column trick credited to Pedro Ferrari).
import pyspark.sql.functions as f

df_train = df_w2v.filter(f.col('train'))
df_test = df_w2v.filter(~f.col('train'))

# Random Forest

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest that reads the `features` column and predicts `label`.
# The seed is pinned to SEED (the constant also used for the train/test flag)
# so the forest's random choices do not vary from one notebook session to the next.
rf_classifier = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    seed=SEED,
)
# Wrapped in a single-stage Pipeline so fitting returns a PipelineModel.
rf_classifier_pipeline = Pipeline(stages=[rf_classifier])

In [None]:
%%time

rf_model = rf_classifier_pipeline.fit(df_train)

In [None]:
%%time

# Persist the fitted pipeline model. Writing through overwrite() lets this
# cell be re-run: a plain save() raises when the target path already exists.
rf_model.write().overwrite().save('/dataset/news/random_forest.model')

In [None]:
%%time

# Run the fitted random-forest pipeline on the test rows.
rf_predictions = rf_model.transform(df_test)

In [None]:
%%time

# Show the category, its indexed label and the predicted label side by side.
rf_predictions.select('category', 'label', 'prediction').show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator configured for the "accuracy" metric, comparing the `prediction`
# column against the `label` column.
rf_model_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Score the test-set predictions and print the result.
accuracy = rf_model_evaluator.evaluate(rf_predictions)
print("Accuracy = %g" % accuracy)