In [0]:
!pip install pyspellchecker
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

Collecting pyspellchecker
  Downloading pyspellchecker-0.6.2-py3-none-any.whl (2.7 MB)
[?25l[K     |▏                               | 10 kB 17.7 MB/s eta 0:00:01[K     |▎                               | 20 kB 6.2 MB/s eta 0:00:01[K     |▍                               | 30 kB 3.5 MB/s eta 0:00:01[K     |▌                               | 40 kB 4.0 MB/s eta 0:00:01[K     |▋                               | 51 kB 4.1 MB/s eta 0:00:01[K     |▊                               | 61 kB 4.5 MB/s eta 0:00:01[K     |▉                               | 71 kB 4.4 MB/s eta 0:00:01[K     |█                               | 81 kB 5.0 MB/s eta 0:00:01[K     |█                               | 92 kB 4.7 MB/s eta 0:00:01[K     |█▏                              | 102 kB 4.6 MB/s eta 0:00:01[K     |█▎                              | 112 kB 4.6 MB/s eta 0:00:01[K     |█▍                              | 122 kB 4.6 MB/s eta 0:00:01[K     |█▌                              | 133 kB 4.6 MB/s e

In [0]:
from pyspark.sql.functions import col, explode, array, lit
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyspark.sql.functions import col, lit
from spellchecker import SpellChecker
from pyspark.sql.functions import udf
import re
import string
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import Tokenizer
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [0]:
df = spark.read.csv("/FileStore/tables/final.csv", sep=',', inferSchema=True, header=True, multiLine="true", escape="\"")
df = df.drop(df.Score)
df.show()

+---+--------------------+-----------+--------------------+
|_c0|             Summary|Final_Label|                Text|
+---+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|        1.0|I have bought sev...|
|  1|   Not as Advertised|        0.0|Product arrived l...|
|  2|"Delight" says it...|        1.0|This is a confect...|
|  3|      Cough Medicine|        0.0|If you are lookin...|
|  4|         Great taffy|        1.0|Great taffy at a ...|
|  5|          Nice Taffy|        1.0|I got a wild hair...|
|  6|Great!  Just as g...|        1.0|This saltwater ta...|
|  7|Wonderful, tasty ...|        1.0|This taffy is so ...|
|  8|          Yay Barley|        1.0|Right now I'm mos...|
|  9|    Healthy Dog Food|        1.0|This is a very he...|
| 10|The Best Hot Sauc...|        1.0|I don't know if i...|
| 11|My cats LOVE this...|        1.0|One of my boys ne...|
| 12|My Cats Are Not F...|        0.0|My cats have been...|
| 13|   fresh and greasy!|        1.0|go

In [0]:
# find the majority and minority
df_major= df.filter(col("Final_Label") == 1)
df_minor= df.filter(col("Final_Label") == 0)
ratio = int(df_major.count()/df_minor.count())
print("ratio: {}".format(ratio))

ratio: 5


In [0]:
a = range(ratio)
# duplicate the minority rows
df_oversampled= df_minor.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
# combine both oversampled minority rows and previous majority rows 
df = df_major.unionAll(df_oversampled)
df.show()

+---+--------------------+-----------+--------------------+
|_c0|             Summary|Final_Label|                Text|
+---+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|        1.0|I have bought sev...|
|  2|"Delight" says it...|        1.0|This is a confect...|
|  4|         Great taffy|        1.0|Great taffy at a ...|
|  5|          Nice Taffy|        1.0|I got a wild hair...|
|  6|Great!  Just as g...|        1.0|This saltwater ta...|
|  7|Wonderful, tasty ...|        1.0|This taffy is so ...|
|  8|          Yay Barley|        1.0|Right now I'm mos...|
|  9|    Healthy Dog Food|        1.0|This is a very he...|
| 10|The Best Hot Sauc...|        1.0|I don't know if i...|
| 11|My cats LOVE this...|        1.0|One of my boys ne...|
| 13|   fresh and greasy!|        1.0|good flavor! thes...|
| 14|Strawberry Twizzl...|        1.0|The Strawberry Tw...|
| 15|Lots of twizzlers...|        1.0|My daughter loves...|
| 17|            Love it!|        1.0|I 

In [0]:
major_df = df.filter(col("Final_Label") == 1)
minor_df = df.filter(col("Final_Label") == 0)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 1


In [0]:
@udf
def text_preproccessing(text):
    text = re.sub("(<.*?>)","",text)
    text = re.sub("(\\W|\\d)"," ",text)
    words = word_tokenize(text)

    stopw = stopwords.words("english")
    stop_removed = [word for word in words if word.lower() not in stopw]
    punk_removed = [w.lower() for w in stop_removed if w.lower() not in string.punctuation]
    
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in punk_removed]

    stemmer = SnowballStemmer(language="english")
    stemmed_final = [stemmer.stem(token) for token in tokens]

    return " ".join(stemmed_final)

In [0]:
x = df.select("Text", text_preproccessing("Text").alias("Semi_Preproccesed"))

tokenizer = Tokenizer(inputCol="Semi_Preproccesed", outputCol="Final_Preproccesed")
x = tokenizer.transform(x)
x = x.drop(x.Text)
x.show()

+--------------------+--------------------+
|   Semi_Preproccesed|  Final_Preproccesed|
+--------------------+--------------------+
|bought sever vita...|[bought, sever, v...|
|product arriv lab...|[product, arriv, ...|
|confect around ce...|[confect, around,...|
|look secret ingre...|[look, secret, in...|
|great taffi great...|[great, taffi, gr...|
|got wild hair taf...|[got, wild, hair,...|
|saltwat taffi gre...|[saltwat, taffi, ...|
|taffi good soft c...|[taffi, good, sof...|
|right most sprout...|[right, most, spr...|
|healthi dog food ...|[healthi, dog, fo...|
|know cactus tequi...|[know, cactus, te...|
|one boy need lose...|[one, boy, need, ...|
|cat happili eat f...|[cat, happili, ea...|
|good flavor came ...|[good, flavor, ca...|
|strawberri twizzl...|[strawberri, twiz...|
|daughter love twi...|[daughter, love, ...|
|love eat good wat...|[love, eat, good,...|
|satisfi twizzler ...|[satisfi, twizzle...|
|twizzler strawber...|[twizzler, strawb...|
|candi deliv fast ...|[candi, de

In [0]:
w = Window.orderBy(monotonically_increasing_id())
df = df.withColumn("columnindex", row_number().over(w))
x = x.withColumn("columnindex", row_number().over(w))
df = df.join(x, df.columnindex == x.columnindex, 'inner').drop(df.columnindex).drop(x.Semi_Preproccesed)
df.show()

+---+--------------------+-----------+--------------------+--------------------+-----------+
|_c0|             Summary|Final_Label|                Text|  Final_Preproccesed|columnindex|
+---+--------------------+-----------+--------------------+--------------------+-----------+
|  0|Good Quality Dog ...|        1.0|I have bought sev...|[bought, sever, v...|          1|
|  1|   Not as Advertised|        0.0|Product arrived l...|[product, arriv, ...|          2|
|  2|"Delight" says it...|        1.0|This is a confect...|[confect, around,...|          3|
|  3|      Cough Medicine|        0.0|If you are lookin...|[look, secret, in...|          4|
|  4|         Great taffy|        1.0|Great taffy at a ...|[great, taffi, gr...|          5|
|  5|          Nice Taffy|        1.0|I got a wild hair...|[got, wild, hair,...|          6|
|  6|Great!  Just as g...|        1.0|This saltwater ta...|[saltwat, taffi, ...|          7|
|  7|Wonderful, tasty ...|        1.0|This taffy is so ...|[taffi, goo

In [0]:
df = df.withColumnRenamed("Final_Label","label")

## TFIDF Feature Extraction

In [0]:
hashingTF = HashingTF(inputCol="Final_Preproccesed", outputCol="rawFeatures", numFeatures=1000)
featurizedData = hashingTF.transform(df)
idf = IDF(inputCol="rawFeatures", outputCol="TFIDF")
idfModel = idf.fit(featurizedData)
TFIDF_Dataframe = idfModel.transform(featurizedData)
TFIDF_Dataframe.show()

+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+--------------------+
|_c0|             Summary|label|                Text|  Final_Preproccesed|columnindex|         rawFeatures|               TFIDF|
+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+--------------------+
|  0|Good Quality Dog ...|  1.0|I have bought sev...|[bought, sever, v...|          1|(1000,[40,61,133,...|(1000,[40,61,133,...|
|  1|   Not as Advertised|  0.0|Product arrived l...|[product, arriv, ...|          2|(1000,[1,104,133,...|(1000,[1,104,133,...|
|  2|"Delight" says it...|  1.0|This is a confect...|[confect, around,...|          3|(1000,[116,152,15...|(1000,[116,152,15...|
|  3|      Cough Medicine|  0.0|If you are lookin...|[look, secret, in...|          4|(1000,[10,61,73,1...|(1000,[10,61,73,1...|
|  4|         Great taffy|  1.0|Great taffy at a ...|[great, taffi, gr...|          5|(1000,[148,

## CountVectorizer Feature Extraction

In [0]:
cv = CountVectorizer(inputCol="Final_Preproccesed", outputCol="CountVectorized", vocabSize=1000, minDF=2.0)

model = cv.fit(df)

countVectorizer_Dataframe = model.transform(df)
countVectorizer_Dataframe.show()

+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+
|_c0|             Summary|label|                Text|  Final_Preproccesed|columnindex|     CountVectorized|
+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|  1.0|I have bought sev...|[bought, sever, v...|          1|(1000,[0,3,7,12,1...|
|  1|   Not as Advertised|  0.0|Product arrived l...|[product, arriv, ...|          2|(1000,[7,91,97,11...|
|  2|"Delight" says it...|  1.0|This is a confect...|[confect, around,...|          3|(1000,[2,28,41,43...|
|  3|      Cough Medicine|  0.0|If you are lookin...|[look, secret, in...|          4|(1000,[2,3,24,42,...|
|  4|         Great taffy|  1.0|Great taffy at a ...|[great, taffi, gr...|          5|(1000,[9,25,168,2...|
|  5|          Nice Taffy|  1.0|I got a wild hair...|[got, wild, hair,...|          6|(1000,[2,16,22,23...|
|  6|Great!  Just as g...|  

##TFIDF Training

In [0]:
train, test = TFIDF_Dataframe.randomSplit([0.75, 0.25], seed = 70)

## Logistic Regression

In [0]:
lr = LogisticRegression(featuresCol = 'TFIDF', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train)
predictions_lr = lrModel.transform(test)
accuracy_lr_tfidf = predictions_lr.filter(predictions_lr.label == predictions_lr.prediction).count() / float(predictions_lr.count())
print("Test Accuracy TFIDF Logistic: ", accuracy_lr_tfidf)

predictions_lr_train = lrModel.transform(train)
accuracy_lr_train_tfidf = predictions_lr_train.filter(predictions_lr_train.label == predictions_lr_train.prediction).count() / float(predictions_lr_train.count())
print("Train Accuracy TFIDF Logistic: ", accuracy_lr_train_tfidf)

# Precision Recall F1
predictionAndLabels = predictions_lr.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision TFIDF Logistic Regression = %s" % precision_score)
print("Recall TFIDF Logistic Regression = %s" % recall_score)
print("f1_score TFIDF Logistic Regression = %s" % f1_score)

Test Accuracy TFIDF Logistic:  0.8801304396716518
Train Accuracy TFIDF Logistic:  0.8961964224154442
Precision TFIDF Logistic Regression = 0.8661526130511522
Recall TFIDF Logistic Regression = 0.8801304396716518
f1_score TFIDF Logistic Regression = 0.8730855847897514


In [0]:
log_reg = LogisticRegression(featuresCol='TFIDF', labelCol='label')

paramGrid = ParamGridBuilder().addGrid(log_reg.maxIter, [10,20]) \
                                .addGrid(log_reg.regParam, [0,0.02,0.08]) \
                                .addGrid(log_reg.elasticNetParam, [0.2,0.6,0.8]) \
                                .build()

evaluator_lr = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')

crossval = CrossValidator(estimator=log_reg,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_lr,
                          numFolds=2)


# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train)

# Make predictions on testing data and calculating ROC metrics and model accuracy. 
prediction = cvModel.transform(test)
output= prediction.select("label",  "probability", "prediction")

prediction_train = cvModel.transform(train)
output_train = prediction_train.select("label",  "probability", "prediction")

acc = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "accuracy"})
acc_train = evaluator_lr.evaluate(output_train, {evaluator_lr.metricName: "accuracy"})
f1 = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "f1"})
weightedPrecision = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedPrecision"})
weightedRecall = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedRecall"})

print("Test Accuracy for TFIDF Logistic Regression optimized model", acc)
print("Train Accuracy for TFIDF Logistic Regression optimized model", acc_train)
print("F1 Score for TFIDF Logistic Regression optimized model",f1)
print("Precision for TFIDF Logistic Regression optimized model", weightedPrecision)
print("Recall for TFIDF Logistic Regression optimized model",weightedRecall)

print("best parameter of model(MaxIter)",cvModel.bestModel.getMaxIter())   
print("best parameter of model(RegParam)",cvModel.bestModel.getRegParam()) 
print("best parameter of model(ElasticNetParam)",cvModel.bestModel.getElasticNetParam())

Test Accuracy for TFIDF Logistic Regression optimized model 0.8801304396716518
Train Accuracy for TFIDF Logistic Regression optimized model 0.8961964224154442
F1 Score for TFIDF Logistic Regression optimized model 0.8683418086756394
Precision for TFIDF Logistic Regression optimized model 0.8661526130511522
Recall for TFIDF Logistic Regression optimized model 0.8801304396716518
best parameter of model(MaxIter) 10
best parameter of model(RegParam) 0.0
best parameter of model(ElasticNetParam) 0.2


## Decision Tree

In [0]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="TFIDF")
decision_tree = dt.fit(train)
predictions_dt = decision_tree.transform(test)
accuracy = predictions_dt.filter(predictions_dt.label == predictions_dt.prediction).count() / float(predictions_dt.count())
print("Test Accuracy TFIDF Decision Tree: ",accuracy)

predictionsss_dt = decision_tree.transform(train)
accuraccy = predictionsss_dt.filter(predictionsss_dt.label == predictionsss_dt.prediction).count() / float(predictionsss_dt.count())
print("Train Accuracy TFIDF Decision Tree: ",accuraccy)

# Precision Recall F1
predictionAndLabels = predictions_dt.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision TFIDF Decision Tree = %s" % precision_score)
print("Recall TFIDF Decision Tree = %s" % recall_score)
print("f1_score TFIDF Decision Tree = %s" % f1_score)

Test Accuracy TFIDF Decision Tree:  0.8570786011469695
Train Accuracy TFIDF Decision Tree:  0.8603056651472785
Precision TFIDF Decision Tree = 0.8227769719859641
Recall TFIDF Decision Tree = 0.8570786011469695
f1_score TFIDF Decision Tree = 0.8395775773633909


In [0]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="TFIDF")

# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 10, 20, 30])
             .addGrid(dt.maxBins, [10, 20, 40, 80])
             .build())

# Evaluate model
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Create 5-fold CrossValidator
dtcv = CrossValidator(estimator = dt,
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 3)

# Run cross validations
dtcvModel = dtcv.fit(train)


# Use test set here so we can measure the accuracy of our model on new data
dtpredictions = dtcvModel.transform(test)

# cvModel uses the best model found from the Cross Validation
# Evaluate best model
accuracy_dt_cv = dtpredictions.filter(dtpredictions.label == dtpredictions.prediction).count() / float(dtpredictions.count())
print("Test Accuracy for TFIDF Decision Tree optimized model: ",accuracy_dt_cv)

dt__predictions = dtcvModel.transform(train)
accuracy_dt___cv = dt__predictions.filter(dt__predictions.label == dt__predictions.prediction).count() / float(dt__predictions.count())
print("Train Accuracy for TFIDF Decision Tree optimized model: ",accuracy_dt___cv)



# Precision Recall F1
predictionAndLabels = dtpredictions.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision TFIDF Decision Tree optimized model = %s" % precision_score)
print("Recall TFIDF Decision Tree optimized model = %s" % recall_score)
print("f1_score TFIDF Decision Tree optimized model = %s" % f1_score)
print("best parameter of model(maxDepth)",dtcvModel.bestModel.getMaxDepth())   
print("best parameter of model(maxBins)",dtcvModel.bestModel.getMaxBins())

Test Accuracy for TFIDF Decision Tree optimized model:  0.8429101540537501
Train Accuracy for TFIDF Decision Tree optimized model:  0.9486344658520703
Precision TFIDF Decision Tree optimized model = 0.8200235817914168
Recall TFIDF Decision Tree optimized model = 0.8429101540537502
f1_score TFIDF Decision Tree optimized model = 0.8313093766231323
best parameter of model(maxDepth) 30
best parameter of model(maxBins) 10


## Random Forest

In [0]:
rf = RandomForestClassifier(labelCol="label", featuresCol="TFIDF", numTrees=10)
random_forest = rf.fit(train)
predictions_rf = random_forest.transform(test)
accuracy = predictions_rf.filter(predictions_rf.label == predictions_rf.prediction).count() / float(predictions_rf.count())
print("Test Accuracy TFIDF Random Forest : ",accuracy)

predictions_rf____ = random_forest.transform(train)
accuracy_ = predictions_rf____.filter(predictions_rf____.label == predictions_rf____.prediction).count() / float(predictions_rf____.count())
print("Train Accuracy TFIDF Random Forest : ", accuracy_)

# Precision Recall F1
predictionAndLabels = predictions_rf.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision TFIDF Random Forest = %s" % precision_score)
print("Recall TFIDF Random Forest = %s" % recall_score)
print("f1_score TFIDF Random Forest = %s" % f1_score)

Test Accuracy TFIDF Random Forest :  0.8526931294276398
Train Accuracy TFIDF Random Forest :  0.8512659439996936
Precision TFIDF Random Forest = 0.7270855729731016
Recall TFIDF Random Forest = 0.8526931294276398
f1_score TFIDF Random Forest = 0.784895848561519


In [0]:
rf = RandomForestClassifier(labelCol="label", featuresCol="TFIDF", numTrees=10)

paramGrid = (ParamGridBuilder().addGrid(rf.numTrees, [5, 15, 20])
             .build())

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3) 

cvModel = crossval.fit(train)
predictions_rf_cv = cvModel.transform(test)

accuracy_rf_cv = predictions_rf_cv.filter(predictions_rf_cv.label == predictions_rf_cv.prediction).count() / float(predictions_rf_cv.count())
print("Test Accuracy for TFIDF Random Forest optimized model: ", accuracy_rf_cv)

predictions_rf___cv = cvModel.transform(train)

accuracy_rf___cv = predictions_rf___cv.filter(predictions_rf___cv.label == predictions_rf___cv.prediction).count() / float(predictions_rf___cv.count())
print("Train Accuracy for TFIDF Random Forest optimized model: ", accuracy_rf___cv)



# Precision Recall F1
predictionAndLabels = predictions_rf_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision TFIDF Random Forest optimized model = %s" % precision_score)
print("Recall TFIDF Random Forest optimized model = %s" % recall_score)
print("f1_score TFIDF Random Forest optimized model = %s" % f1_score)
bestModel = cvModel.bestModel
print ('Best Param (numTrees): ', bestModel._java_obj.getNumTrees())

Test Accuracy for TFIDF Random Forest optimized model:  0.8526931294276398
Train Accuracy for TFIDF Random Forest optimized model:  0.8512659439996936
Precision TFIDF Random Forest optimized model = 0.7270855729731016
Recall TFIDF Random Forest optimized model = 0.8526931294276398
f1_score TFIDF Random Forest optimized model = 0.784895848561519
Best Param (numTrees):  20


## CountVectorizer Training

In [0]:
train_cv, test_cv = countVectorizer_Dataframe.randomSplit([0.75, 0.25], seed = 70)

## Logistic Regression

In [0]:
lr_cv = LogisticRegression(featuresCol = 'CountVectorized', labelCol = 'label', maxIter=10)
lrModel_cv = lr_cv.fit(train_cv)
predictions_cv = lrModel_cv.transform(test_cv)
accuracy = predictions_cv.filter(predictions_cv.label == predictions_cv.prediction).count() / float(predictions_cv.count())
print("Test Accuracy CountVector Logistic Regression: ",accuracy)

predictions___cv = lrModel_cv.transform(train_cv)
accuracy___ = predictions___cv.filter(predictions___cv.label == predictions___cv.prediction).count() / float(predictions___cv.count())
print("Train Accuracy CountVector Logistic Regression: ",accuracy___)

# Precision Recall F1
predictionAndLabels = predictions_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision CountVector Logistic Regression = %s" % precision_score)
print("Recall CountVector Logistic Regression = %s" % recall_score)
print("f1_score CountVector Logistic Regression = %s" % f1_score)

Test Accuracy CountVector Logistic Regression:  0.8942988867648712
Train Accuracy CountVector Logistic Regression:  0.9099475236526602
Precision CountVector Logistic Regression = 0.8850287514851555
Recall CountVector Logistic Regression = 0.8942988867648712
f1_score CountVector Logistic Regression = 0.8896396708438714


In [0]:
log_reg = LogisticRegression(featuresCol='CountVectorized', labelCol='label')

paramGrid = ParamGridBuilder().addGrid(log_reg.maxIter, [10,20]) \
                                .addGrid(log_reg.regParam, [0,0.02,0.08]) \
                                .addGrid(log_reg.elasticNetParam, [0.2,0.6,0.8]) \
                                .build()

evaluator_lr = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')

crossval = CrossValidator(estimator=log_reg,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_lr,
                          numFolds=2)


# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train_cv)

# Make predictions on testing data and calculating ROC metrics and model accuracy. 
prediction = cvModel.transform(test_cv)
prediction_ = cvModel.transform(train_cv)

output= prediction.select("label",  "probability", "prediction")
output_= prediction_.select("label",  "probability", "prediction")

acc = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "accuracy"})
acc_ = evaluator_lr.evaluate(output_, {evaluator_lr.metricName: "accuracy"})

f1 = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "f1"})
weightedPrecision = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedPrecision"})
weightedRecall = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedRecall"})

print("Test Accuracy for CountVectorized Logistic Regression optimized model", acc)
print("Train Accuracy for CountVectorized Logistic Regression optimized model", acc_)
print("F1 Score for CountVectorized Logistic Regression optimized model",f1)
print("Precision for CountVectorized Logistic Regression optimized model", weightedPrecision)
print("Recall for CountVectorized Logistic Regression optimized model",weightedRecall)

print("best parameter of model(MaxIter)",cvModel.bestModel.getMaxIter())   
print("best parameter of model(RegParam)",cvModel.bestModel.getRegParam()) 
print("best parameter of model(ElasticNetParam)",cvModel.bestModel.getElasticNetParam())

Test Accuracy for CountVectorized Logistic Regression optimized model 0.8942988867648712
Train Accuracy for CountVectorized Logistic Regression optimized model 0.9099475236526602
F1 Score for CountVectorized Logistic Regression optimized model 0.8867219444310933
Precision for CountVectorized Logistic Regression optimized model 0.8850287514851555
Recall for CountVectorized Logistic Regression optimized model 0.8942988867648712
best parameter of model(MaxIter) 10
best parameter of model(RegParam) 0.0
best parameter of model(ElasticNetParam) 0.2


## Decision Tree

In [0]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="CountVectorized")
decision_tree_cv = dt.fit(train_cv)
predictions_dt_cv = decision_tree_cv.transform(test_cv)
accuracy_dt_cv = predictions_dt_cv.filter(predictions_dt_cv.label == predictions_dt_cv.prediction).count() / float(predictions_dt_cv.count())
print("Test Accuracy CountVector Decision Tree: ",accuracy_dt_cv)

predictions_dt_cv__ = decision_tree_cv.transform(train_cv)
accuracy_dt_cv__ = predictions_dt_cv__.filter(predictions_dt_cv__.label == predictions_dt_cv__.prediction).count() / float(predictions_dt_cv__.count())
print("Train Accuracy CountVector Decision Tree: ",accuracy_dt_cv__)

# Precision Recall F1
predictionAndLabels = predictions_dt_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision CountVector Decision Tree = %s" % precision_score)
print("Recall CountVector Decision Tree = %s" % recall_score)
print("f1_score CountVector Decision Tree = %s" % f1_score)

Test Accuracy CountVector Decision Tree:  0.8605644889238727
Train Accuracy CountVector Decision Tree:  0.865246868655916
Precision CountVector Decision Tree = 0.8360006740577058
Recall CountVector Decision Tree = 0.8605644889238726
f1_score CountVector Decision Tree = 0.8481047571979341


In [0]:
dt = DecisionTreeClassifier(labelCol="label", featuresCol="CountVectorized")

# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 10, 20, 30])
             .addGrid(dt.maxBins, [10, 20, 40, 80])
             .build())

# Evaluate model
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Create 5-fold CrossValidator
dtcv = CrossValidator(estimator = dt,
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 3)

# Run cross validations
dtcvModel = dtcv.fit(train_cv)
print(dtcvModel)



# Use test set here so we can measure the accuracy of our model on new data
dtpredictions = dtcvModel.transform(test_cv)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
accuracy_dt_cv = dtpredictions.filter(dtpredictions.label == dtpredictions.prediction).count() / float(dtpredictions.count())
print("Test Accucary for CountVectorized Decision Tree optimized model: ",accuracy_dt_cv)

dtpredictions_ = dtcvModel.transform(train_cv)
accuracy_dt_cv_ = dtpredictions_.filter(dtpredictions_.label == dtpredictions_.prediction).count() / float(dtpredictions_.count())
print("Train Accuracy for CountVectorized Decision Tree optimized model: ",accuracy_dt_cv_)



# Precision Recall F1
predictionAndLabels = dtpredictions.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision CountVector Decision Tree optimized model = %s" % precision_score)
print("Recall CountVector Decision Tree optimized model = %s" % recall_score)
print("f1_score CountVector Decision Tree optimized model = %s" % f1_score)
print("best parameter of model(maxDepth)",dtcvModel.bestModel.getMaxDepth())   
print("best parameter of model(maxBins)",dtcvModel.bestModel.getMaxBins())

CrossValidatorModel_ddb0fb2308a6
Test Accucary for CountVectorized Decision Tree optimized model:  0.8551669852693129
Train Accuracy for CountVectorized Decision Tree optimized model:  0.939364921285479
Precision CountVector Decision Tree optimized model = 0.8348903313280811
Recall CountVector Decision Tree optimized model = 0.8551669852693129
f1_score CountVector Decision Tree optimized model = 0.8449070225733831
best parameter of model(maxDepth) 30
best parameter of model(maxBins) 10


## Random Forest

In [0]:
rf_cv = RandomForestClassifier(labelCol="label", featuresCol="CountVectorized", numTrees=10)
random_forest_cv = rf_cv.fit(train_cv)
predictions_rf_cv = random_forest_cv.transform(test_cv)
accuracy_rf_cv = predictions_rf_cv.filter(predictions_rf_cv.label == predictions_rf_cv.prediction).count() / float(predictions_rf_cv.count())
print("Test Accuracy CountVector Random Forest : ", accuracy_rf_cv)

predictions_rf_cv_train = random_forest_cv.transform(train_cv)
accuracy_rf_cv_train = predictions_rf_cv_train.filter(predictions_rf_cv_train.label == predictions_rf_cv_train.prediction).count() / float(predictions_rf_cv_train.count())
print("Train Accuracy CountVector Random Forest : ", accuracy_rf_cv_train)

# Precision Recall F1
predictionAndLabels = predictions_rf_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision CountVector Random Forest = %s" % precision_score)
print("Recall CountVector Random Forest = %s" % recall_score)
print("f1_score CountVector Random Forest = %s" % f1_score)

Test Accuracy CountVector Random Forest :  0.852805577420443
Train Accuracy CountVector Random Forest :  0.8514574635155322
Precision CountVector Random Forest = 0.8744742120534434
Recall CountVector Random Forest = 0.852805577420443
f1_score CountVector Random Forest = 0.8635039787927749


In [0]:
rf = RandomForestClassifier(labelCol="label", featuresCol="CountVectorized", numTrees=10)

paramGrid = (ParamGridBuilder().addGrid(rf.numTrees, [5, 15, 20])
             .build())

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3) 

cvModel = crossval.fit(train_cv)
predictions_rf_cv = cvModel.transform(test_cv)

accuracy_rf_cv = predictions_rf_cv.filter(predictions_rf_cv.label == predictions_rf_cv.prediction).count() / float(predictions_rf_cv.count())
print("Accuracy for CountVectorized Random Forest optimized model: ", accuracy_rf_cv)

predictions_rf_cv_train = random_forest_cv.transform(train_cv)
accuracy_rf_cv_train = predictions_rf_cv_train.filter(predictions_rf_cv_train.label == predictions_rf_cv_train.prediction).count() / float(predictions_rf_cv_train.count())
print("Train Accuracy CountVector Random Forest optimized model: ", accuracy_rf_cv_train)


# Precision Recall F1
predictionAndLabels = predictions_rf_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("Precision CountVector Random Forest optimized model = %s" % precision_score)
print("Recall CountVector Random Forest optimized model = %s" % recall_score)
print("f1_score CountVector Random Forest optimized model = %s" % f1_score)
bestModel = cvModel.bestModel
print ('Best Param (numTrees): ', bestModel._java_obj.getNumTrees())

Accuracy for CountVectorized Random Forest optimized model:  0.8531429213988531
Train Accuracy CountVector Random Forest optimized model:  0.8514574635155322
Precision CountVector Random Forest optimized model = 0.8747196279634945
Recall CountVector Random Forest optimized model = 0.8531429213988531
f1_score CountVector Random Forest optimized model = 0.8637965549760824
Best Param (numTrees):  20
