In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark
# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"
# Start a SparkSession
import findspark
findspark.init()

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("MBPT").getOrCreate()

In [3]:
# Read in data from local machine
from pyspark import SparkFiles
from google.colab import files
uploaded = files.upload()


Saving mbti_1.csv to mbti_1.csv


In [21]:
#saving data as spark df
df = spark.read.csv('mbti_1.csv', inferSchema=True, header=True)
df.show()

+----+--------------------+
|type|               posts|
+----+--------------------+
|INFJ|'http://www.youtu...|
|ENTP|'I'm finding the ...|
|INTP|'Good one  _____ ...|
|INTJ|'Dear INTP,   I e...|
|ENTJ|'You're fired.|||...|
|INTJ|'18/37 @.@|||Scie...|
|INFJ|'No, I can't draw...|
|INTJ|'I tend to build ...|
|INFJ|I'm not sure, tha...|
|INTP|'https://www.yout...|
|INFJ|'One time my pare...|
|ENFJ|'https://www.yout...|
|INFJ|'Joe santagato - ...|
|INTJ|'Fair enough, if ...|
|INTP|'Basically this.....|
|INTP|'Your comment scr...|
|INFJ|'some of these bo...|
|INFP|'I think we do ag...|
|INFJ|'I fully believe ...|
|INFP|'That's normal, i...|
+----+--------------------+
only showing top 20 rows



In [22]:
from pyspark.sql.functions import length

data_df = df.withColumn('length', length(df['posts']))
data_df.show()

+----+--------------------+------+
|type|               posts|length|
+----+--------------------+------+
|INFJ|'http://www.youtu...|  4652|
|ENTP|'I'm finding the ...|  7053|
|INTP|'Good one  _____ ...|  5265|
|INTJ|'Dear INTP,   I e...|  6271|
|ENTJ|'You're fired.|||...|  6111|
|INTJ|'18/37 @.@|||Scie...|  8589|
|INFJ|'No, I can't draw...|  7916|
|INTJ|'I tend to build ...|  6900|
|INFJ|I'm not sure, tha...|  5325|
|INTP|'https://www.yout...|  7573|
|INFJ|'One time my pare...|  8381|
|ENFJ|'https://www.yout...|  4724|
|INFJ|'Joe santagato - ...|  7007|
|INTJ|'Fair enough, if ...|  9035|
|INTP|'Basically this.....|  5291|
|INTP|'Your comment scr...|  7244|
|INFJ|'some of these bo...|  8253|
|INFP|'I think we do ag...|  9489|
|INFJ|'I fully believe ...|  8850|
|INFP|'That's normal, i...|  7060|
+----+--------------------+------+
only showing top 20 rows



In [32]:
# creating features for the data set
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

types_to_num = StringIndexer(inputCol="type", outputCol="label")
tokenizer = Tokenizer(inputCol="posts", outputCol="token_text")
stopremove = StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
HashingTF = HashingTF(inputCol="stop_tokens", outputCol="hash_token")
idf = IDF(inputCol="hash_token", outputCol="idf_token")



In [33]:
# creating feature vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=["idf_token", "length"], outputCol="features")

In [34]:
from pyspark.ml import Pipeline
#data processing pipeline
prep_pipeline = Pipeline(stages=[types_to_num, tokenizer,stopremove, HashingTF, idf, clean_up])

In [35]:
# fit and transform pipeline

cleaner = prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [36]:
# fit and transform, show label and features

cleaned.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[14,619,1...|
|  4.0|(262145,[14,367,5...|
|  2.0|(262145,[14,142,7...|
|  3.0|(262145,[329,1466...|
|  8.0|(262145,[14,304,1...|
|  3.0|(262145,[14,191,6...|
|  1.0|(262145,[14,456,2...|
|  3.0|(262145,[926,1405...|
|  1.0|(262145,[619,1836...|
|  2.0|(262145,[14,378,5...|
|  1.0|(262145,[14,198,9...|
| 10.0|(262145,[14,2325,...|
|  1.0|(262145,[319,991,...|
|  3.0|(262145,[14,671,1...|
|  2.0|(262145,[14,1232,...|
|  2.0|(262145,[14,15,99...|
|  1.0|(262145,[14,329,4...|
|  0.0|(262145,[14,961,9...|
|  1.0|(262145,[14,1382,...|
|  0.0|(262145,[329,353,...|
+-----+--------------------+
only showing top 20 rows



In [44]:
from pyspark.ml.classification import NaiveBayes
# Create a Naive Bayes model and fit training data
training, testing = cleaned.randomSplit([0.9, 0.1])

nb = NaiveBayes()
predictor = nb.fit(training)

In [42]:
# transform model with the testing data

test_results = predictor.transform(testing)
test_results.show(5)

+----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|type|               posts|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|ENFJ|'A cup of coffee ...|  6059| 10.0|['a, cup, of, cof...|['a, cup, coffee,...|(262144,[1846,199...|(262144,[1846,199...|(262145,[1846,199...|[-25717.246355470...|[0.97925670967478...|       0.0|
|ENFJ|'And then there a...|  7260| 10.0|['and, then, ther...|['and, nice, men,...|(262144,[14,1904,...|(262144,[14,1904,...|(262145,[14,1904,...|[-24702.817938722...|[8.03291015124004...|       5.0|
|ENFJ

In [43]:
#using class evaluator

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval= MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print(f"Accuracy of this model at predicting reviews: {acc}")

Accuracy of this model at predicting reviews: 0.2670851461062063
