<a href="https://colab.research.google.com/github/biggity2bit/colab/blob/master/politifact_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [3]:
# Upload my files
from google.colab import files
training_data = files.upload()

Saving politifact_titles_no_commas.csv to politifact_titles_no_commas (1).csv
Saving Total_clean.csv to Total_clean (2).csv


In [4]:
from pyspark import SparkFiles
new_train_df = spark.read.csv("Total_clean.csv", sep=",", header=True)
spark.sparkContext.addFile("Total_clean.csv")

# Show dataframe
new_train_df.show()

+--------------------+-----+------+
|               title|label|status|
+--------------------+-----+------+
|As U.S. budget fi...|    0|  TRUE|
|U.S. military to ...|    0|  TRUE|
|Senior U.S. Repub...|    0|  TRUE|
|FBI Russia probe ...|    0|  TRUE|
|Trump wants Posta...|    0|  TRUE|
|White House, Cong...|    0|  TRUE|
|Trump says Russia...|    0|  TRUE|
|Factbox: Trump on...|    0|  TRUE|
|Trump on Twitter ...|    0|  TRUE|
|Alabama official ...|    0|  TRUE|
|Jones certified U...|    0|  TRUE|
|New York governor...|    0|  TRUE|
|Factbox: Trump on...|    0|  TRUE|
|Trump on Twitter ...|    0|  TRUE|
|Man says he deliv...|    0|  TRUE|
|Virginia official...|    0|  TRUE|
|U.S. lawmakers qu...|    0|  TRUE|
|Trump on Twitter ...|    0|  TRUE|
|U.S. appeals cour...|    0|  TRUE|
|Treasury Secretar...|    0|  TRUE|
+--------------------+-----+------+
only showing top 20 rows



In [6]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
length_train_df = new_train_df.withColumn('length', length(new_train_df['title']))
length_train_df.show()

+--------------------+-----+------+------+
|               title|label|status|length|
+--------------------+-----+------+------+
|As U.S. budget fi...|    0|  TRUE|    64|
|U.S. military to ...|    0|  TRUE|    64|
|Senior U.S. Repub...|    0|  TRUE|    60|
|FBI Russia probe ...|    0|  TRUE|    59|
|Trump wants Posta...|    0|  TRUE|    69|
|White House, Cong...|    0|  TRUE|    64|
|Trump says Russia...|    0|  TRUE|    63|
|Factbox: Trump on...|    0|  TRUE|    60|
|Trump on Twitter ...|    0|  TRUE|    42|
|Alabama official ...|    0|  TRUE|    76|
|Jones certified U...|    0|  TRUE|    58|
|New York governor...|    0|  TRUE|    73|
|Factbox: Trump on...|    0|  TRUE|    65|
|Trump on Twitter ...|    0|  TRUE|    46|
|Man says he deliv...|    0|  TRUE|    67|
|Virginia official...|    0|  TRUE|    78|
|U.S. lawmakers qu...|    0|  TRUE|    72|
|Trump on Twitter ...|    0|  TRUE|    57|
|U.S. appeals cour...|    0|  TRUE|    63|
|Treasury Secretar...|    0|  TRUE|    77|
+----------

In [7]:
# Create the Politifact testing df
newest_test_df = spark.read.csv("politifact_titles_no_commas.csv", sep=",", header=True)
spark.sparkContext.addFile("politifact_titles_no_commas.csv")
# Show dataframe
newest_test_df.show()

+--------------------+-----+------+
|        articletitle|label|status|
+--------------------+-----+------+
|"Says gloves shou...|    0|  TRUE|
|New York is one o...|    0|  TRUE|
|Says it's illegal...|    0|  TRUE|
|"""Small trials""...|    0|  TRUE|
|"Says Texas ""eit...|    0|  TRUE|
|"North Carolina i...|    0|  TRUE|
|"Virginia has ""l...|    0|  TRUE|
|"""Until this wee...|    0|  TRUE|
|"""The poor are i...|    0|  TRUE|
|"""Go look at oth...|    0|  TRUE|
|"""Africans livin...|    0|  TRUE|
|"Republicans ""ha...|    0|  TRUE|
|"Before COVID-19 ...|    0|  TRUE|
|"Says of the coro...|    0|  TRUE|
|"""On February 7 ...|    0|  TRUE|
|"""My mask will k...|    0|  TRUE|
|"Says longstandin...|    0|  TRUE|
|"""No city in the...|    0|  TRUE|
|"Says ""most"" NC...|    0|  TRUE|
|Says Spectrum wil...|    0|  TRUE|
+--------------------+-----+------+
only showing top 20 rows



In [9]:
# Create a length column to be used as a future feature 
length_test_df = newest_test_df.withColumn('length', length(newest_test_df['articletitle']))
length_test_df.show()

+--------------------+-----+------+------+
|        articletitle|label|status|length|
+--------------------+-----+------+------+
|"Says gloves shou...|    0|  TRUE|   158|
|New York is one o...|    0|  TRUE|    88|
|Says it's illegal...|    0|  TRUE|    95|
|"""Small trials""...|    0|  TRUE|   125|
|"Says Texas ""eit...|    0|  TRUE|   101|
|"North Carolina i...|    0|  TRUE|   139|
|"Virginia has ""l...|    0|  TRUE|    83|
|"""Until this wee...|    0|  TRUE|   208|
|"""The poor are i...|    0|  TRUE|   197|
|"""Go look at oth...|    0|  TRUE|   136|
|"""Africans livin...|    0|  TRUE|   176|
|"Republicans ""ha...|    0|  TRUE|   206|
|"Before COVID-19 ...|    0|  TRUE|   174|
|"Says of the coro...|    0|  TRUE|   158|
|"""On February 7 ...|    0|  TRUE|   180|
|"""My mask will k...|    0|  TRUE|    75|
|"Says longstandin...|    0|  TRUE|   150|
|"""No city in the...|    0|  TRUE|    73|
|"Says ""most"" NC...|    0|  TRUE|    82|
|Says Spectrum wil...|    0|  TRUE|    88|
+----------

In [0]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://s3.amazonaws.com/dataviz-curriculum/day_2/yelp_reviews.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("yelp_reviews.csv"), sep=",", header=True)

# Show DataFrame
df.show()

In [0]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['text']))
data_df.show()

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='label',outputCol='labeled')
tokenizer = Tokenizer(inputCol="title", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [0]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [0]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(length_train_df)
cleaned = cleaner.transform(length_train_df)

In [26]:
# Show label and resulting features
cleaned.select(['labeled', 'features']).show(truncate=False)

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|labeled|features                                                                                                                                                                                                                                                                         |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0    |(262145,[26603,60268,61058,77806,90843,172123,220246,243967,262144],[4.916902816433549,4.857310719231303,3.881660141698028,8.73681053295389

In [31]:
# cleaned.dtypes
# length_test_df.dtypes
newest_test_df.dtypes

[('articletitle', 'string'), ('label', 'string'), ('status', 'string')]

In [27]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])
# training = cleaned 
# testing = length_test_df

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

IllegalArgumentException: ignored

In [12]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(5)

+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   class|                text|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|negative|"I don't know wha...|    85|  1.0|["i, don't, know,...|["i, know, big, d...|(262144,[8478,300...|(262144,[8478,300...|(262145,[8478,300...|[-633.68462561935...|[5.70171928547901...|       1.0|
|negative|"It was extremely...|    51|  1.0|["it, was, extrem...|["it, extremely, ...|(262144,[7388,163...|(262144,[7388,163...|(262145,[7388,163...|[-463.68982706177...|[4.37393635174908.

In [13]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.674060
