In [1]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, Word2Vec, Tokenizer
from pyspark.ml.classification import LogisticRegression, NaiveBayes, LinearSVC, OneVsRest
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [2]:
import os
import sys
# Point both the Spark workers and the driver at the interpreter that is
# running this kernel, so they share one Python installation.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [3]:
# Reuse the already-running SparkContext when this cell is executed again:
# calling SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)



In [4]:
# Explicit schema for the CSV files read below: an integer class label
# followed by the text column.
customSchema = StructType(fields=[
    StructField(name="label", dataType=IntegerType(), nullable=True),
    StructField(name="text", dataType=StringType(), nullable=True),
])

In [5]:
# Training set: the first row of the file is a header, and columns are typed
# by customSchema rather than inferred.
df_train = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(customSchema)
    .load('data/cleaned_twitter_training.csv')
)

In [6]:
# Validation set, read with the same options and schema as the training set.
df_val = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .schema(customSchema)
    .load('data/cleaned_twitter_validation.csv')
)

In [7]:
# Candidate feature stages. The cells below pick a subset of these and
# assemble them into a Pipeline.

# Two alternative tokenizers; both write to "tokens", so use one per pipeline.
# regexTokenizer splits on runs of one or more spaces.
regexTokenizer = RegexTokenizer(
    inputCol="text",
    outputCol="tokens",
    pattern=r" +",
)
# The plain Tokenizer produces empty-string tokens where the text contains
# consecutive spaces (visible in the sample output of the cells that use it).
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")

# Term counts over a vocabulary capped at 15000 terms; a term must occur in
# at least 5 documents to enter the vocabulary.
countVectors = CountVectorizer(
    inputCol="tokens",
    outputCol="features",
    vocabSize=15000,
    minDF=5,
)

# 100-dimensional Word2Vec document vectors.
# NOTE(review): no pipeline visible in this notebook uses this stage.
word2Vec = Word2Vec(
    vectorSize=100,
    minCount=0,
    maxIter=20,
    inputCol="tokens",
    outputCol="features",
)

# TF-IDF: hash tokens into 15000 buckets, then re-weight by inverse document
# frequency. Terms seen in fewer than minDocFreq documents get an IDF of 0.
hashingTF = HashingTF(inputCol="tokens", outputCol="rawFeatures", numFeatures=15000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=10)

In [13]:
# Featurise the training documents (space-split tokens -> term counts), then
# hold out 10% of the featurised rows as a test split.
# NOTE(review): the pipeline is fitted on all of df_train before the split,
# so the vocabulary has already seen the held-out rows.
pipeline = Pipeline(stages=[regexTokenizer, countVectors])
pipelineFit = pipeline.fit(df_train)
dataset_train = pipelineFit.transform(df_train)
dataset_train.show(n=5)
trainingData, testData = dataset_train.randomSplit(weights=[0.9, 0.1], seed=100)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|            features|
+-----+--------------------+--------------------+--------------------+
|    3|getting borderlan...|[getting, borderl...|(13363,[67,77,154...|
|    3|coming  borders  ...|[coming, borders,...|(13363,[266,304,6...|
|    3|getting borderlan...|[getting, borderl...|(13363,[67,77,304...|
|    3|coming borderland...|[coming, borderla...|(13363,[67,266,15...|
|    3|getting  borderla...|[getting, borderl...|(13363,[67,77,154...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [14]:
# Featurise the validation documents with the pipeline that was already
# fitted on the training data. The pipeline is deliberately NOT re-fitted
# here, so validation vectors use the training vocabulary and vector size.
dataset_val = pipelineFit.transform(df_val)
dataset_val.show(5)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|            features|
+-----+--------------------+--------------------+--------------------+
|    0|mentioned faceboo...|[mentioned, faceb...|(13363,[3,16,23,2...|
|    2|bbc news amazon b...|[bbc, news, amazo...|(13363,[2,34,138,...|
|    1|why pay  word  fu...|[why, pay, word, ...|(13363,[90,265,69...|
|    1|csgo matchmaking ...|[csgo, matchmakin...|(13363,[0,115,262...|
|    2|now  president sl...|[now, president, ...|(13363,[7,32,143,...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [11]:
# Logistic regression with an elastic-net penalty that is almost pure L2
# (elasticNetParam=0.01 puts only 1% of the mixing weight on L1).
lr = LogisticRegression(
    maxIter=10000,
    regParam=0.05,
    elasticNetParam=0.01,
    tol=1e-4,
    standardization=True,
)
lrModel = lr.fit(trainingData)

In [12]:
predictions = lrModel.transform(testData)

# Preview ten rows predicted as class 0, sorted by the probability column
# in descending order.
(
    predictions
    .filter(col("prediction") == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

# metricName defaults to "f1" (weighted F1); it is spelled out here so the
# score below is not mistaken for accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
| ban rule  battlefield play...|[0.9991763844793757,1.49322...|    0|       0.0|
|loved  liverpool lost diogo...|[0.9970277626004036,2.45538...|    0|       0.0|
|possible ban  battlefield p...|[0.9958923528770662,0.00134...|    0|       0.0|
|join live  facebook today n...|[0.9953259770287342,5.16261...|    0|       0.0|
|ban   battlefield might bas...|[0.9944248912442711,0.00144...|    0|       0.0|
|why ali  best fortnite guy ...|[0.9938053261211168,3.96270...|    0|       0.0|
|fifa liverpool worst night ...|[0.9925169070629243,0.00195...|    0|       0.0|
|source ban claim  battlefie...|[0.9902175408746028,0.00603...|    0|       0.0|
|ban  team battlefield secon...|[0.990191216260858,0.001199...|    0|       0.0|
|sounds bit harsh  see print

0.8229204881510208

In [13]:
# Persist the *fitted* model. Saving `lr` would store only the untrained
# estimator's hyper-parameters, not the learned coefficients.
lrModel.write().overwrite().save("lr_model")

In [9]:
# TF-IDF variant: hashed term frequencies re-weighted by inverse document
# frequency, fed to a ridge-penalised (elasticNetParam=0) logistic regression.
pipeline = Pipeline(stages=[regexTokenizer, hashingTF, idf])
pipelineFit = pipeline.fit(df_train)
dataset = pipelineFit.transform(df_train)

trainingData, testData = dataset.randomSplit(weights=[0.9, 0.1], seed=100)

lr = LogisticRegression(
    maxIter=10000,
    regParam=0.05,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

# Preview ten rows predicted as class 0, sorted by the probability column
# in descending order.
(
    predictions
    .filter(col("prediction") == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|know the facts selected sel...|[0.9997861594389547,3.59986...|    1|       0.0|
|why ali  best fortnite guy ...|[0.9995881400316241,2.21101...|    0|       0.0|
|best barbie stream find sla...|[0.9993734150716964,5.13528...|    0|       0.0|
| ban rule  battlefield play...|[0.9990398253296685,3.21265...|    0|       0.0|
|really pleased   moves  tou...|[0.9988102769912741,1.63220...|    0|       0.0|
|ban however  battlefield le...|[0.9985637107019326,4.39733...|    0|       0.0|
|years ago princess pop bles...|[0.9979966598621328,5.33171...|    0|       0.0|
|grifters outrage  moral dis...|[0.997681633414802,0.001794...|    0|       0.0|
|this   great movie youunk f...|[0.997518051431198,8.501468...|    0|       0.0|
|classic metal jewelry gift 

In [10]:
# Score the predictions from the previous cell. metricName defaults to "f1"
# (weighted F1); it is spelled out so the number is not read as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

0.7432224930808944

In [12]:
# Tune the regularisation strength of a ridge logistic regression on
# count-vector features with 5-fold cross-validation, then score the best
# model on the held-out test split.
pipeline = Pipeline(stages=[regexTokenizer, countVectors])

pipelineFit = pipeline.fit(df_train)
dataset = pipelineFit.transform(df_train)
(trainingData, testData) = dataset.randomSplit([0.9, 0.1], seed = 42)

lr = LogisticRegression(maxIter=1000, regParam=0.05, elasticNetParam=0)

# Parameter grid for cross-validation. ParamGridBuilder takes the Cartesian
# product of all value lists, so each value must be listed only once:
# repeating 0.0 three times would triple the number of (identical) fits.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.03, 0.05]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0]) # Elastic Net Parameter (Ridge = 0)
             .build())

# metricName defaults to "f1" (weighted F1); spelled out for clarity. The same
# evaluator is used for model selection and for the final score below.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=5)

cvModel = cv.fit(trainingData)

# Evaluate the best model found by cross-validation on the held-out split.
predictions = cvModel.transform(testData)
evaluator.evaluate(predictions)

0.8372386654017403

In [13]:
# Persist the fitted feature pipeline. write().overwrite() keeps this cell
# re-runnable: a plain save() raises once the target path already exists.
pipelineFit.write().overwrite().save("saved-models/pipelineFit")

In [11]:
# Persist the fitted cross-validation model, replacing any earlier save at
# the same path.
(
    cvModel
    .write()
    .overwrite()
    .save('saved-models/cvModel')
)

In [8]:
# Same count-vector featurisation, but with the plain Tokenizer instead of
# the regex tokenizer, followed by a 90/10 split of the featurised rows.
# NOTE(review): the pipeline is fitted on all of df_train before the split,
# so the vocabulary has already seen the held-out rows.
pipeline = Pipeline(stages=[tokenizer, countVectors])
pipelineFit = pipeline.fit(df_train)
dataset_train = pipelineFit.transform(df_train)
dataset_train.show(n=5)
trainingData, testData = dataset_train.randomSplit(weights=[0.9, 0.1], seed=100)

+-----+--------------------+--------------------+--------------------+
|label|                text|              tokens|            features|
+-----+--------------------+--------------------+--------------------+
|    3|getting borderlan...|[getting, borderl...|(13364,[0,68,78,1...|
|    3|coming  borders  ...|[coming, , border...|(13364,[0,267,305...|
|    3|getting borderlan...|[getting, borderl...|(13364,[0,68,78,3...|
|    3|coming borderland...|[coming, borderla...|(13364,[0,68,267,...|
|    3|getting  borderla...|[getting, , borde...|(13364,[0,68,78,1...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Multinomial Naive Bayes with a very small smoothing constant, trained on
# the current trainingData split.
nb = NaiveBayes(
    modelType='multinomial',
    smoothing=1e-4,
)
nbModel = nb.fit(trainingData)
predictions = nbModel.transform(testData)

# Preview ten rows predicted as class 0, sorted by the probability column
# in descending order.
(
    predictions
    .filter(col("prediction") == 0)
    .select("text", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30)
)

# Score the Naive Bayes predictions. metricName defaults to "f1" (weighted
# F1); it is spelled out so the number is not read as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|bitching  ghost tsushima  c...|[1.0,7.214055635348055E-17,...|    0|       0.0|
|lockout  battlefield player...|[1.0,3.399544854284964E-17,...|    0|       0.0|
|ban  battlefield player pin...|[1.0,1.1048417080972387E-17...|    0|       0.0|
|the bigwigs    shopping mal...|[1.0,6.228762427389828E-18,...|    0|       0.0|
|canya even find confirmatio...|[1.0,3.0500246515050898E-18...|    0|       0.0|
|leaked  fifa fifa gta gamep...|[1.0,9.41337092315493E-19,3...|    0|       0.0|
|unfortunately though cannot...|[1.0,6.33331275006352E-19,1...|    0|       0.0|
|source ban claim  battlefie...|[1.0,5.9864963635378215E-19...|    0|       0.0|
|accelerate  war drugs paral...|[1.0,5.272570825134445E-20,...|    0|       0.0|
|this   great movie youunk f

0.7617420005786116

In [10]:
# TF-IDF featurisation with the plain Tokenizer, followed by a 90/10 split of
# the featurised rows.
# NOTE(review): the pipeline (including IDF) is fitted on all of df_train
# before the split, so its statistics include the held-out rows.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
pipelineFit = pipeline.fit(df_train)
dataset_train = pipelineFit.transform(df_train)
dataset_train.show(n=5)
trainingData, testData = dataset_train.randomSplit(weights=[0.9, 0.1], seed=100)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|                text|              tokens|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|    3|getting borderlan...|[getting, borderl...|(15000,[3031,3372...|(15000,[3031,3372...|
|    3|coming  borders  ...|[coming, , border...|(15000,[201,3372,...|(15000,[201,3372,...|
|    3|getting borderlan...|[getting, borderl...|(15000,[3372,6586...|(15000,[3372,6586...|
|    3|coming borderland...|[coming, borderla...|(15000,[3031,3372...|(15000,[3031,3372...|
|    3|getting  borderla...|[getting, , borde...|(15000,[3031,3372...|(15000,[3031,3372...|
+-----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# LinearSVC is a binary classifier, so it is wrapped in OneVsRest to handle
# the multi-class labels.
svc = LinearSVC(
    maxIter=10000,
    regParam=0.05,
    tol=1e-7,
    standardization=True,
)
ovr = OneVsRest(classifier=svc)
ovrModel = ovr.fit(trainingData)

In [16]:
# Featurise the validation set with the fitted pipeline that produced the
# trainingData used for ovrModel. Relying on a `dataset_val` left over from an
# earlier cell breaks whenever that cell used a different pipeline (e.g. count
# vectors vs. TF-IDF), because the feature vectors then differ in size and
# meaning from what the model was trained on.
dataset_val = pipelineFit.transform(df_val)
predictions = ovrModel.transform(dataset_val)

# Preview ten validation rows predicted as class 0.
(
    predictions
    .filter(predictions['prediction'] == 0)
    .select("text", "label", "prediction")
    .show(n=10, truncate=30)
)

# Score the one-vs-rest SVM on the validation set. metricName defaults to
# "f1" (weighted F1); it is spelled out so the number is not read as accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
evaluator.evaluate(predictions)

+------------------------------+-----+----------+
|                          text|label|prediction|
+------------------------------+-----+----------+
|mentioned facebook   strugg...|    0|       0.0|
|call duty warzone livestrea...|    0|       0.0|
|                best squad yet|    0|       0.0|
|this really disappointing m...|    0|       0.0|
|              melusi   shocked|    2|       0.0|
|      blocking  mans new level|    0|       0.0|
|everyone know  story true l...|    0|       0.0|
|sound enjoy  groove   littl...|    0|       0.0|
|aoc  make   ignorant commen...|    0|       0.0|
|nyummm delicious finally so...|    0|       0.0|
+------------------------------+-----+----------+
only showing top 10 rows



0.8925784039743436