### HW5: Toxic Comment Classification with Spark

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.functions import vector_to_array
import pyspark.sql.functions as F
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Word2Vec
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.classification import FMClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MultilabelClassificationEvaluator
import tqdm
import pandas as pd

Start spark local session

In [3]:
spark = SparkSession.builder\
        .master('local[32]')\
        .appName('HW5')\
        .config('spark.ui.port', '4050')\
        .getOrCreate()

sc = spark.sparkContext

Read data

In [4]:
train = spark.read.csv('train.csv', sep=',', quote='\"', escape='\"', multiLine=True, header=True, inferSchema=True)
test = spark.read.csv('test.csv', sep=',', quote='\"', escape='\"', multiLine=True, header=True, inferSchema=True)
test_labels = spark.read.csv('test_labels.csv', sep=',', quote='\"', escape='\"', multiLine=True, header=True, inferSchema=True)
test = test.join(test_labels, 'id')

Train model

In [None]:
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',]
res = {
    'numFeatures':[],
    'pipe':[],
    #'model':[],
}

res.update({
    target:[] for target in targets
})

for numFeatures in tqdm.tqdm([10, 20, 50, 100]):
    scores = []
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tf", numFeatures=numFeatures)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    preprocessing = [tokenizer, hashingTF, idf,]

    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf,])
    res['pipe'].append('tf_idf_logreg')
    res['numFeatures'].append(numFeatures)
    for target in targets:

        model = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=target, regParam=0.3, elasticNetParam=0.8)
        metrics = BinaryClassificationEvaluator(labelCol=model.getLabelCol(), rawPredictionCol=model.getRawPredictionCol())

        pipeline = Pipeline(stages=preprocessing + [model,])
        pipeline = pipeline.fit(train)
        res[target].append(metrics.evaluate(pipeline.transform(test)))

  0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
pd.DataFrame(res)

ValueError: ignored

In [7]:
for numFeatures in tqdm.tqdm([10, 20, 50, 100]):
    scores = []
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="tf", numFeatures=numFeatures)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    preprocessing = [tokenizer, hashingTF, idf,]

    pipeline = Pipeline(stages=[tokenizer, hashingTF, idf,])
    res['pipe'].append('tf_idf_naiveBayes')
    res['numFeatures'].append(numFeatures)
    for target in targets:

        model = NaiveBayes(
                featuresCol=idf.getOutputCol(), 
                labelCol=target, 
                smoothing=1.0, 
                modelType="multinomial")
        metrics = BinaryClassificationEvaluator(labelCol=model.getLabelCol(), rawPredictionCol=model.getRawPredictionCol())

        pipeline = Pipeline(stages=preprocessing + [model,])
        pipeline = pipeline.fit(train)
        res[target].append(metrics.evaluate(pipeline.transform(test)))

pd.DataFrame(res)

100%|██████████| 4/4 [13:41<00:00, 205.48s/it]


Unnamed: 0,numFeatures,pipe,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,10,tf_idf_logreg,0.5,0.5,0.5,0.5,0.5,0.5
1,20,tf_idf_logreg,0.5,0.5,0.5,0.5,0.5,0.5
2,50,tf_idf_logreg,0.5,0.5,0.5,0.5,0.5,0.5
3,100,tf_idf_logreg,0.5,0.5,0.5,0.5,0.5,0.5
4,10,tf_idf_naiveBayes,0.640626,0.498888,0.637462,0.595308,0.63533,0.620397
5,20,tf_idf_naiveBayes,0.641366,0.499387,0.638537,0.597254,0.63641,0.622166
6,50,tf_idf_naiveBayes,0.641836,0.504088,0.639044,0.606435,0.637086,0.62423
7,100,tf_idf_naiveBayes,0.642066,0.506713,0.639695,0.608007,0.637777,0.624552


In [9]:
for numFeatures in tqdm.tqdm([10, 20, 50, 100]):
    scores = []
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    w2v = Word2Vec(inputCol=tokenizer.getOutputCol(), outputCol="features", vectorSize=numFeatures)
    preprocessing = [tokenizer, w2v]

    pipeline = Pipeline(stages=[tokenizer, w2v])
    res['pipe'].append('w2v_logreg')
    res['numFeatures'].append(numFeatures)
    for target in targets:

        model = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol=target, regParam=0.3, elasticNetParam=0.8)
        metrics = BinaryClassificationEvaluator(labelCol=model.getLabelCol(), rawPredictionCol=model.getRawPredictionCol())

        pipeline = Pipeline(stages=preprocessing + [model,])
        pipeline = pipeline.fit(train)
        res[target].append(metrics.evaluate(pipeline.transform(test)))

pd.DataFrame(res)

  0%|          | 0/4 [06:07<?, ?it/s]


Py4JJavaError: ignored

WTF