In [0]:
from pyspark.sql.functions import col, explode, array, lit
!pip install pyspellchecker
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pyspark.sql.functions import col, lit
from spellchecker import SpellChecker
from pyspark.sql.functions import udf
import re
import string
from nltk.stem import SnowballStemmer, WordNetLemmatizer

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
df = spark.read.csv("dbfs:/FileStore/shared_uploads/behnaz.sheikhi1@ucalgary.ca/final-1.csv", sep=',', inferSchema=True, header=True, multiLine="true", escape="\"")
df= df.drop(df.Score)
df.show()

+---+--------------------+-----------+--------------------+
|_c0|             Summary|Final_Label|                Text|
+---+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|        1.0|I have bought sev...|
|  1|   Not as Advertised|        0.0|Product arrived l...|
|  2|"Delight" says it...|        1.0|This is a confect...|
|  3|      Cough Medicine|        0.0|If you are lookin...|
|  4|         Great taffy|        1.0|Great taffy at a ...|
|  5|          Nice Taffy|        1.0|I got a wild hair...|
|  6|Great!  Just as g...|        1.0|This saltwater ta...|
|  7|Wonderful, tasty ...|        1.0|This taffy is so ...|
|  8|          Yay Barley|        1.0|Right now I'm mos...|
|  9|    Healthy Dog Food|        1.0|This is a very he...|
| 10|The Best Hot Sauc...|        1.0|I don't know if i...|
| 11|My cats LOVE this...|        1.0|One of my boys ne...|
| 12|My Cats Are Not F...|        0.0|My cats have been...|
| 13|   fresh and greasy!|        1.0|go

In [0]:
# find the majority and minority
df_major= df.filter(col("Final_Label") == 1)
df_minor= df.filter(col("Final_Label") == 0)
ratio = int(df_major.count()/df_minor.count())
print("ratio: {}".format(ratio))

ratio: 5


## Oversampling

In [0]:
a = range(ratio)
# duplicate the minority rows
df_oversampled= df_minor.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
# combine both oversampled minority rows and previous majority rows 
df_w2v= df_major.unionAll(df_oversampled)
df_w2v.show()

+---+--------------------+-----------+--------------------+
|_c0|             Summary|Final_Label|                Text|
+---+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|        1.0|I have bought sev...|
|  2|"Delight" says it...|        1.0|This is a confect...|
|  4|         Great taffy|        1.0|Great taffy at a ...|
|  5|          Nice Taffy|        1.0|I got a wild hair...|
|  6|Great!  Just as g...|        1.0|This saltwater ta...|
|  7|Wonderful, tasty ...|        1.0|This taffy is so ...|
|  8|          Yay Barley|        1.0|Right now I'm mos...|
|  9|    Healthy Dog Food|        1.0|This is a very he...|
| 10|The Best Hot Sauc...|        1.0|I don't know if i...|
| 11|My cats LOVE this...|        1.0|One of my boys ne...|
| 13|   fresh and greasy!|        1.0|good flavor! thes...|
| 14|Strawberry Twizzl...|        1.0|The Strawberry Tw...|
| 15|Lots of twizzlers...|        1.0|My daughter loves...|
| 17|            Love it!|        1.0|I 

In [0]:
df_w2v.count()

Out[27]: 887596

In [0]:
major_df = df_w2v.filter(col("Final_Label") == 1)
minor_df = df_w2v.filter(col("Final_Label") == 0)
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 1


In [0]:
df_w2v= df_w2v.withColumnRenamed("Final_Label","label")
df_w2v.count()

Out[29]: 887596

## Preprocessing

In [0]:
@udf
def text_preproccessing(text):
    text = re.sub("(<.*?>)","",text)
    text = re.sub("(\\W|\\d)"," ",text)
    words = word_tokenize(text)
 
    stopw = stopwords.words("english")
    stop_removed = [word for word in words if word.lower() not in stopw]
    punk_removed = [w.lower() for w in stop_removed if w.lower() not in string.punctuation]
    
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word) for word in punk_removed]
 
    stemmer = SnowballStemmer(language="english")
    stemmed_final = [stemmer.stem(token) for token in tokens]
 
    return " ".join(stemmed_final)

In [0]:
x = df_w2v.select("Text", text_preproccessing("Text").alias("Semi_Preproccesed"))
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="Semi_Preproccesed", outputCol="Final_Preproccesed")
x = tokenizer.transform(x)
x = x.drop(x.Text)
x.show()

+--------------------+--------------------+
|   Semi_Preproccesed|  Final_Preproccesed|
+--------------------+--------------------+
|bought sever vita...|[bought, sever, v...|
|product arriv lab...|[product, arriv, ...|
|confect around ce...|[confect, around,...|
|look secret ingre...|[look, secret, in...|
|great taffi great...|[great, taffi, gr...|
|got wild hair taf...|[got, wild, hair,...|
|saltwat taffi gre...|[saltwat, taffi, ...|
|taffi good soft c...|[taffi, good, sof...|
|right most sprout...|[right, most, spr...|
|healthi dog food ...|[healthi, dog, fo...|
|know cactus tequi...|[know, cactus, te...|
|one boy need lose...|[one, boy, need, ...|
|cat happili eat f...|[cat, happili, ea...|
|good flavor came ...|[good, flavor, ca...|
|strawberri twizzl...|[strawberri, twiz...|
|daughter love twi...|[daughter, love, ...|
|love eat good wat...|[love, eat, good,...|
|satisfi twizzler ...|[satisfi, twizzle...|
|twizzler strawber...|[twizzler, strawb...|
|candi deliv fast ...|[candi, de

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import monotonically_increasing_id, row_number
w = Window.orderBy(monotonically_increasing_id())
df_w2v = df_w2v.withColumn("columnindex", row_number().over(w))
x = x.withColumn("columnindex", row_number().over(w))
df_w2v = df_w2v.join(x, df_w2v.columnindex == x.columnindex, 'inner').drop(df_w2v.columnindex).drop(x.Semi_Preproccesed)
df_w2v.show()

+---+--------------------+-----+--------------------+--------------------+-----------+
|_c0|             Summary|label|                Text|  Final_Preproccesed|columnindex|
+---+--------------------+-----+--------------------+--------------------+-----------+
|  0|Good Quality Dog ...|  1.0|I have bought sev...|[bought, sever, v...|          1|
|  1|   Not as Advertised|  0.0|Product arrived l...|[product, arriv, ...|          2|
|  2|"Delight" says it...|  1.0|This is a confect...|[confect, around,...|          3|
|  3|      Cough Medicine|  0.0|If you are lookin...|[look, secret, in...|          4|
|  4|         Great taffy|  1.0|Great taffy at a ...|[great, taffi, gr...|          5|
|  5|          Nice Taffy|  1.0|I got a wild hair...|[got, wild, hair,...|          6|
|  6|Great!  Just as g...|  1.0|This saltwater ta...|[saltwat, taffi, ...|          7|
|  7|Wonderful, tasty ...|  1.0|This taffy is so ...|[taffi, good, sof...|          8|
|  8|          Yay Barley|  1.0|Right now I

## Feature Extraction (W2V)

In [0]:
from pyspark.ml.feature import Word2Vec
word2Vec = Word2Vec(vectorSize=1000, minCount=0, inputCol="Final_Preproccesed", outputCol="word2Vec")
model = word2Vec.fit(df_w2v)
Word2Vec_Dataframe = model.transform(df_w2v)
Word2Vec_Dataframe.show()

+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+
|_c0|             Summary|label|                Text|  Final_Preproccesed|columnindex|            word2Vec|
+---+--------------------+-----+--------------------+--------------------+-----------+--------------------+
|  0|Good Quality Dog ...|  1.0|I have bought sev...|[bought, sever, v...|          1|[-0.0138965371044...|
|  1|   Not as Advertised|  0.0|Product arrived l...|[product, arriv, ...|          2|[-0.0027879949710...|
|  2|"Delight" says it...|  1.0|This is a confect...|[confect, around,...|          3|[-0.0129402810431...|
|  3|      Cough Medicine|  0.0|If you are lookin...|[look, secret, in...|          4|[-0.0252936953911...|
|  4|         Great taffy|  1.0|Great taffy at a ...|[great, taffi, gr...|          5|[-0.0013632394659...|
|  5|          Nice Taffy|  1.0|I got a wild hair...|[got, wild, hair,...|          6|[-0.0236987926531...|
|  6|Great!  Just as g...|  

In [0]:
train_w2v, test_w2v = Word2Vec_Dataframe.randomSplit([0.75, 0.25], seed =70)

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics

## LogisticRegression

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol = 'word2Vec', labelCol = 'label', maxIter=10)
lrModel = lr.fit(train_w2v)

predictions_lr = lrModel.transform(test_w2v)
accuracy_lr = predictions_lr.filter(predictions_lr.label == predictions_lr.prediction).count() / float(predictions_lr.count())
print("LogisticRegression : Test Accuracy (word2Vec) : ",accuracy_lr)

predictions_lr_train= lrModel.transform(train_w2v)
accuracy_lr_train = predictions_lr_train.filter(predictions_lr_train.label == predictions_lr_train.prediction).count() / float(predictions_lr_train.count())
print("LogisticRegression : Train Accuracy (word2Vec): ",accuracy_lr_train)


LogisticRegression : Test Accuracy (word2Vec) :  0.8813673675924885
LogisticRegression : Train Accuracy (word2Vec):  0.8806450377293447


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
predictionAndLabels = predictions_lr.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("LogisticRegression (word2Vec):  Precision = %s" % precision_score)
print("LogisticRegression (word2Vec) Recall = %s" % recall_score)
print("LogisticRegression (word2Vec) f1_score = %s" % f1_score)

LogisticRegression (word2Vec):  Precision = 0.8668576778203716
LogisticRegression (word2Vec) Recall = 0.8813673675924885
LogisticRegression (word2Vec) f1_score = 0.8740523098929152


## LogisticRegression Tuning

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

log_reg = LogisticRegression(featuresCol='word2Vec', labelCol='label')

paramGrid = ParamGridBuilder().addGrid(log_reg.maxIter, [10,20]) \
                                .addGrid(log_reg.regParam, [0,0.02,0.08]) \
                                .addGrid(log_reg.elasticNetParam, [0.2,0.6,0.8]) \
                                .build()

evaluator_lr = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy')

crossval = CrossValidator(estimator=log_reg,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator_lr,
                          numFolds=2)


# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(train_w2v)

# Make predictions on testing data and calculating ROC metrics and model accuracy. 
prediction = cvModel.transform(test_w2v)
prediction_train = cvModel.transform(train_w2v)

output= prediction.select("label",  "probability", "prediction")
output_train= prediction_train.select("label",  "probability", "prediction")

acc = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "accuracy"})
acc_train= evaluator_lr.evaluate(output_train, {evaluator_lr.metricName: "accuracy"})


f1 = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "f1"})
weightedPrecision = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedPrecision"})
weightedRecall = evaluator_lr.evaluate(output, {evaluator_lr.metricName: "weightedRecall"})

print("LogisticRegression (word2Vec): Test Accuracy of optimized model", acc)
print("LogisticRegression (word2Vec): Train Accuracy of optimized model", acc_train)
print("LogisticRegression (word2Vec): F1 Score of optimized model",f1)
print("LogisticRegression (word2Vec): Precision of optimized model", weightedPrecision)
print("LogisticRegression (word2Vec): Recall of optimized model",weightedRecall)

print("LogisticRegression (word2Vec): best parameter of model(MaxIter)",cvModel.bestModel.getMaxIter())   
print("LogisticRegression (word2Vec): best parameter of model(RegParam)",cvModel.bestModel.getRegParam()) 
print("LogisticRegression (word2Vec): best parameter of model(ElasticNetParam)",cvModel.bestModel.getElasticNetParam()) 

LogisticRegression (word2Vec): Test Accuracy of optimized model 0.8840661194197683
LogisticRegression (word2Vec): Train Accuracy of optimized model 0.8830964875320795
LogisticRegression (word2Vec): F1 Score of optimized model 0.8705330395534613
LogisticRegression (word2Vec): Precision of optimized model 0.8704742862098793
LogisticRegression (word2Vec): Recall of optimized model 0.8840661194197684
LogisticRegression (word2Vec): best parameter of model(MaxIter) 20
LogisticRegression (word2Vec): best parameter of model(RegParam) 0.0
LogisticRegression (word2Vec): best parameter of model(ElasticNetParam) 0.2


# DecisionTreeClassifier

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="label", featuresCol="word2Vec")
decision_tree = dt.fit(train_w2v)
predictions_dt = decision_tree.transform(test_w2v)
accuracy_dt = predictions_dt.filter(predictions_dt.label == predictions_dt.prediction).count() / float(predictions_dt.count())
print("DecisionTreeClassifier (word2Vec) : Test Accuracy :   ",accuracy_dt)

predictions_dt_train= decision_tree.transform(train_w2v)
accuracy_dt_train = predictions_dt_train.filter(predictions_dt_train.label == predictions_dt_train.prediction).count() / float(predictions_dt_train.count())
print("DecisionTreeClassifier  (word2Vec): Train Accuracy: ",accuracy_dt_train)


DecisionTreeClassifier (word2Vec) : Test Accuracy :    0.8641628246935792
DecisionTreeClassifier  (word2Vec): Train Accuracy:  0.8683494848125024


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
predictionAndLabels = predictions_dt.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("DecisionTreeClassifier (word2Vec) : Precision = %s" % precision_score)
print("DecisionTreeClassifier (word2Vec) : Recall = %s" % recall_score)
print("DecisionTreeClassifier (word2Vec) : f1_score = %s" % f1_score)

DecisionTreeClassifier (word2Vec) : Precision = 0.8387242625443364
DecisionTreeClassifier (word2Vec) : Recall = 0.8641628246935793
DecisionTreeClassifier (word2Vec) : f1_score = 0.8512535367626399


# DecisionTreeClassifier Tuning

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="word2Vec")

# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [2, 10, 20, 30])
             .addGrid(dt.maxBins, [10, 20, 40, 80])
             .build())

# Evaluate model
dtevaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# Create 5-fold CrossValidator
dtcv = CrossValidator(estimator = dt,
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 3)

# Run cross validations
dtcvModel = dtcv.fit(train_w2v)
print(dtcvModel)



# Use test set here so we can measure the accuracy of our model on new data
dtpredictions = dtcvModel.transform(test_w2v)
accuracy_dt_cv = dtpredictions.filter(dtpredictions.label == dtpredictions.prediction).count() / float(dtpredictions.count())
print(" DecisionTreeClassifier (word2Vec): Test Accuracy of optimized model: ",accuracy_dt_cv)

dtpredictions_train = dtcvModel.transform(train_w2v)
accuracy_dt_cv_train = dtpredictions_train.filter(dtpredictions_train.label == dtpredictions_train.prediction).count() / float(dtpredictions_train.count())
print(" DecisionTreeClassifier (word2Vec): Train Accuracy of optimized model: ",accuracy_dt_cv_train)
 
  

# Precision Recall F1
predictionAndLabels = dtpredictions.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("DecisionTreeClassifier (word2Vec) : Precision of optimized model = %s" % precision_score)
print("DecisionTreeClassifier (word2Vec) : Recall of optimized model = %s" % recall_score)
print("DecisionTreeClassifier (word2Vec) : f1_score of optimized model = %s" % f1_score)
print("DecisionTreeClassifier (word2Vec) : best parameter of model(maxDepth)",dtcvModel.bestModel.getMaxDepth())   
print("DecisionTreeClassifier (word2Vec) : best parameter of model(maxBins)",dtcvModel.bestModel.getMaxBins()) 


CrossValidatorModel_6b873d35d563
 DecisionTreeClassifier (word2Vec): Test Accuracy of optimized model:  0.8440346339817835
 DecisionTreeClassifier (word2Vec): Train Accuracy of optimized model:  0.9322786991994484
DecisionTreeClassifier (word2Vec) : Precision of optimized model = 0.8300199415803474
DecisionTreeClassifier (word2Vec) : Recall of optimized model = 0.8440346339817835
DecisionTreeClassifier (word2Vec) : f1_score of optimized model = 0.8369686243402272
DecisionTreeClassifier (word2Vec) : best parameter of model(maxDepth) 10
DecisionTreeClassifier (word2Vec) : best parameter of model(maxBins) 10


# RandomForestClassifier

In [0]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="word2Vec", numTrees=10)
random_forest = rf.fit(train_w2v)

predictions_rf = random_forest.transform(test_w2v)
accuracy_rf = predictions_rf.filter(predictions_rf.label == predictions_rf.prediction).count() / float(predictions_rf.count())
print("RandomForestClassifier (word2Vec) : Test Accuracy : ",accuracy_rf)

predictions_rf_train= random_forest.transform(train_w2v)
accuracy_rf_train = predictions_rf_train.filter(predictions_rf_train.label == predictions_rf_train.prediction).count() / float(predictions_rf_train.count())
print("RandomForestClassifier (word2Vec) : Train Accuracy : ",accuracy_rf_train)

RandomForestClassifier (word2Vec) : Test Accuracy :  0.8561790172045429
RandomForestClassifier (word2Vec) : Train Accuracy :  0.8555176772513119


In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType
predictionAndLabels = predictions_rf.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("RandomForestClassifier (word2Vec) : Precision = %s" % precision_score)
print("RandomForestClassifier (word2Vec) : Recall = %s" % recall_score)
print("RandomForestClassifier (word2Vec) : f1_score = %s" % f1_score)

RandomForestClassifier (word2Vec) : Precision = 0.8504967483677172
RandomForestClassifier (word2Vec) : Recall = 0.8561790172045428
RandomForestClassifier (word2Vec) : f1_score = 0.8533284234090809


# RandomForestClassifier Tuning

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="word2Vec", numTrees=10)

paramGrid = (ParamGridBuilder().addGrid(rf.numTrees, [5, 15, 20])
             .build())

crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3) 

cvModel = crossval.fit(train_w2v)

predictions_rf_cv = cvModel.transform(test_w2v)
accuracy_rf_cv = predictions_rf_cv.filter(predictions_rf_cv.label == predictions_rf_cv.prediction).count() / float(predictions_rf_cv.count())
print("RandomForestClassifier (word2Vec) : Test Accuracy of optimized model: ", accuracy_rf_cv)

predictions_rf_cv_train = cvModel.transform(train_w2v)
accuracy_rf_cv_train = predictions_rf_cv_train.filter(predictions_rf_cv_train.label == predictions_rf_cv_train.prediction).count() / float(predictions_rf_cv_train.count())
print("RandomForestClassifier (word2Vec) : Train Accuracy of optimized model: ", accuracy_rf_cv_train)


# Precision Recall F1
predictionAndLabels = predictions_rf_cv.select("prediction","label").rdd
# Instantiate metrics objects
multi_metrics = MulticlassMetrics(predictionAndLabels)
precision_score = multi_metrics.weightedPrecision
recall_score = multi_metrics.weightedRecall
f1_score= (2 * precision_score * recall_score) / (precision_score + recall_score)
print("RandomForestClassifier (word2Vec) : Precision of optimized model = %s" % precision_score)
print("RandomForestClassifier (word2Vec) : Recall of optimized model = %s" % recall_score)
print("RandomForestClassifier (word2Vec) : f1_score of optimized model = %s" % f1_score)
bestModel = cvModel.bestModel
print ('RandomForestClassifier (word2Vec) : Best Param (numTrees): ', bestModel._java_obj.getNumTrees())

RandomForestClassifier (word2Vec) : Test Accuracy of optimized model:  0.8576408411109862
RandomForestClassifier (word2Vec) : Train Accuracy of optimized model:  0.8567817060558471
RandomForestClassifier (word2Vec) : Precision of optimized model = 0.8481003770511962
RandomForestClassifier (word2Vec) : Recall of optimized model = 0.8576408411109863
RandomForestClassifier (word2Vec) : f1_score of optimized model = 0.8528439284646218
RandomForestClassifier (word2Vec) : Best Param (numTrees):  20
