In [1]:
!unzip jigsaw-toxic-comment-classification-challenge.zip

Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv.zip            
  inflating: test_labels.csv.zip     
  inflating: train.csv.zip           


In [2]:
!unzip train.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               


In [5]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .master("local[1]") \
        .appName("HashingTF") \
        .getOrCreate()
sc = spark.sparkContext

In [6]:
df = spark.read \
    .option("header", True) \
    .option("inferSchema", True) \
    .option("delimiter", ',') \
    .option("multiLine", True) \
    .option("escape", "\"") \
    .csv("train.csv")
df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: integer (nullable = true)
 |-- severe_toxic: integer (nullable = true)
 |-- obscene: integer (nullable = true)
 |-- threat: integer (nullable = true)
 |-- insult: integer (nullable = true)
 |-- identity_hate: integer (nullable = true)

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|0000997932d777bf|Explanation\nWhy ...|    0|           0|      0|     0|     0|            0|
|000103f0d9cfb60f|D'aww! He matches...|    0|           0|      0|     0|     0|            0|
|000113f07ec002fd|Hey man, I'm real...|    0|           0|      0|     0|     0|            0|
|0001b41b1c6bb37e|"\nMore\nI can't ...|    0|           0|      0|     0|     0|            0|
|000

In [58]:
from pyspark.sql.functions import regexp_replace, udf, col
from pyspark.sql.types import StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, Word2Vec
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import re

In [8]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

clean_text_udf = udf(lambda z: clean_text(z),StringType())

In [48]:
def htf_transforms(num_features):
    tokenizer = Tokenizer(inputCol="clear_text", outputCol="words")
    stop_words_remover = StopWordsRemover(inputCol="words", outputCol="clear_words")
    tf  = HashingTF(inputCol="clear_words", outputCol="tf_features", numFeatures=num_features)
    idf = IDF(inputCol="tf_features", outputCol="idf_features")
    pipeline = Pipeline(stages=[tokenizer, stop_words_remover, tf, idf])
    model = pipeline.fit(train_df)
    train_data = model.transform(train_df)
    val_data = model.transform(val_df)
    return train_data, val_data

In [None]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label2id = {label:idx for idx, label in enumerate(labels)}
df = df.withColumn('clear_text', clean_text_udf(col("comment_text")))
train_df, val_df = df.randomSplit(weights=[0.8, 0.2], seed=0)

In [49]:
for num_features in range(50, 2001, 500):
    train_data, val_data = htf_transforms(num_features)
    print(f"Num features={num_features}")
    print("Training Dataset Count: " + str(train_data.count()))
    print("Test Dataset Count: " + str(val_data.count()))
    for label in labels:
        classifier = LogisticRegression(featuresCol='idf_features', labelCol=label).fit(train_data)
        preds = classifier.transform(val_data).select(col(label).alias("label"), "prediction")
        metrics = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
        print(f"Label -->> {label}")
        print("areaUnderROC", metrics.evaluate(preds))

Num features=50
Training Dataset Count: 127378
Test Dataset Count: 32193
Label -->> toxic
areaUnderROC 0.5093374660598615
Label -->> severe_toxic
areaUnderROC 0.5251847985885126
Label -->> obscene
areaUnderROC 0.5110121518727708
Label -->> threat
areaUnderROC 0.4999532637482474
Label -->> insult
areaUnderROC 0.5088514084948993
Label -->> identity_hate
areaUnderROC 0.5033000011015185
Num features=550
Training Dataset Count: 127378
Test Dataset Count: 32193
Label -->> toxic
areaUnderROC 0.6126581294577902
Label -->> severe_toxic
areaUnderROC 0.5959163002240352
Label -->> obscene
areaUnderROC 0.645615291981721
Label -->> threat
areaUnderROC 0.5554525627044711
Label -->> insult
areaUnderROC 0.5767570399841414
Label -->> identity_hate
areaUnderROC 0.5212697394230924
Num features=1050
Training Dataset Count: 127378
Test Dataset Count: 32193
Label -->> toxic
areaUnderROC 0.6695705913988911
Label -->> severe_toxic
areaUnderROC 0.6150367816888936
Label -->> obscene
areaUnderROC 0.71771440154450

In [54]:
def w2v_transforms(window_size):
    tokenizer = Tokenizer(inputCol="clear_text", outputCol="words")
    stop_words_remover = StopWordsRemover(inputCol="words", outputCol="clear_words")
    word2Vec = Word2Vec(vectorSize=50, seed=42, inputCol="clear_words", outputCol="features", windowSize=window_size)
    pipeline = Pipeline(stages=[tokenizer, stop_words_remover, word2Vec])
    model = pipeline.fit(train_df)
    train_data = model.transform(train_df)
    val_data = model.transform(val_df)
    return train_data, val_data

In [64]:
print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(val_data.count()), "\n")
for window_size in range(5, 21, 5):
    train_data, val_data = w2v_transforms(window_size)
    print(f"Window size={window_size}")
    for label in labels:
        classifier = LogisticRegression(featuresCol='features', labelCol=label).fit(train_data)
        preds = classifier.transform(val_data).select(col(label).alias("label"), "prediction")
        metrics = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')
        print(f"Label -->> {label}")
        print("areaUnderROC", metrics.evaluate(preds))
    print("\n")

Training Dataset Count: 127378
Test Dataset Count: 32193 

Window size=5
Label -->> toxic
areaUnderROC 0.7722270705097234
Label -->> severe_toxic
areaUnderROC 0.5985874819511582
Label -->> obscene
areaUnderROC 0.7676274816968421
Label -->> threat
areaUnderROC 0.5304097211403646
Label -->> insult
areaUnderROC 0.7021400316573909
Label -->> identity_hate
areaUnderROC 0.5516281184623819


Window size=10
Label -->> toxic
areaUnderROC 0.7819299002501171
Label -->> severe_toxic
areaUnderROC 0.5970792499927383
Label -->> obscene
areaUnderROC 0.7721374949170335
Label -->> threat
areaUnderROC 0.525307680324038
Label -->> insult
areaUnderROC 0.7037501158015008
Label -->> identity_hate
areaUnderROC 0.55161244300701


Window size=15
Label -->> toxic
areaUnderROC 0.7780158751262689
Label -->> severe_toxic
areaUnderROC 0.5999701567496811
Label -->> obscene
areaUnderROC 0.7755939401634073
Label -->> threat
areaUnderROC 0.5151503349431376
Label -->> insult
areaUnderROC 0.705412377860546
Label -->> iden