## Assignment 3 - Vectorization

### Import packages

In [1]:
!pip install pyspark
!pip install jip



In [2]:
import os
import pandas as pd
import numpy as np
import logging
import glob
import pyspark
import re

In [3]:
from pyspark import SparkContext #To connect to spark
from pyspark.sql import SparkSession #To connect to sql
#from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import ArrayType, StructField, StructType, StringType, IntegerType, FloatType
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover, CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import CountVectorizer

from pyspark.ml import Pipeline


from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### Set up Spark context

In [4]:
# `sc` is only predefined when the kernel is launched through the pyspark
# shell. getOrCreate() returns the already-running SparkContext if there is
# one and starts a new one otherwise, so this cell works on a fresh kernel.
sc = SparkContext.getOrCreate()
sc

In [5]:
# Reuse the active SparkSession if one exists; otherwise start one
# (no extra configuration is passed to the builder).
spark = SparkSession.builder.getOrCreate()

### Loading data

We set the path where all of our historical data is stored, and then list every file in it that has a .csv extension.

In [6]:
# Folder that holds the collected tweet CSV files (also used further down as
# the location where fitted models are saved).
parent_path = "C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics"

# glob returns matches in arbitrary filesystem order. Sort them so the files
# are concatenated in the same order on every run, which keeps the later
# de-duplication and the seeded train/test split reproducible.
all_tweets = sorted(glob.glob(os.path.join(parent_path, "*.csv")))
all_tweets

['C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_00.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_01.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_02.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_03.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_04.csv',
 'C:/Users/Marce/OneDrive - KU Leuven/Advanced Analytics\\tweet_05.csv']

Load all CSV files and concatenate them into a single pandas DataFrame

In [7]:
# Read each CSV (header=None: the first line is data, not column names)
# and stack the pieces into one frame with a fresh 0..n-1 index.
tweet_frames = [pd.read_csv(csv_path, header=None) for csv_path in all_tweets]
df = pd.concat(tweet_frames, ignore_index=True)
df.columns = ['category', 'tweet_id', 'tweet_text']

# Keep only the first row seen for every tweet_id.
df = df.drop_duplicates(subset=['tweet_id'])

# Peek at five random rows.
df.sample(5)

Unnamed: 0,category,tweet_id,tweet_text
10590,#china,1385644409971224579,What does #███████ want to do with #███████ ? ...
11917,#biden,1381634468788666368,"#███████ said, ""I'm not content that a Chinese..."
288,#china,1381659043341668359,The military threat from #███████ V USA accor...
545,#vaccine,1381585331795808256,#███████ appts available at Rite Aid in: Kings...
10204,#biden,1385613615655538688,Over $3.7 billion worth of crypto positions we...


Checking the dimension of our dataset




In [10]:
# (number of rows, number of columns) of the pandas frame `df`
df.shape

(11577, 3)

#### Initializing the functionalities of Spark SQL

In [8]:
# Convert the pandas frame into a Spark DataFrame through the SparkSession
# created earlier in this notebook. The previous `sqlContext` name is never
# defined here and only exists when the kernel is started by the pyspark shell.
file = spark.createDataFrame(df)

# Show the column names and the types Spark inferred for them.
file.printSchema()

root
 |-- category: string (nullable = true)
 |-- tweet_id: long (nullable = true)
 |-- tweet_text: string (nullable = true)



Checking how the collected tweets are distributed over the hashtags

In [12]:
# Count the rows for each hashtag category and print the resulting table.
category_counts = file.groupBy('category').count()
category_counts.show()

+--------------+-----+
|      category|count|
+--------------+-----+
|        #biden| 1804|
|      #vaccine| 3118|
|        #china| 2411|
|        #covid| 2887|
|#stopasianhate|  967|
|    #inflation|  390|
+--------------+-----+



We are going to check several models: 

1. Logistic regression using TF-IDF features
2. Naive bayes

### 1. Logistic Regression using TF-IDF Features

#### Pipeline:

1. Tokenizer 
2. Stopwords
3. HashingTF
4. IDF
5. Convert hashtag into numeric label

In [9]:
# Stage 1: split the raw tweet text into words.
Tokenizer1 = Tokenizer(inputCol="tweet_text", outputCol="words")

# Stage 2: drop stop words from the token list.
sw = StopWordsRemover(inputCol="words", outputCol="filtered")

# Stage 3: hash the remaining words into a term-frequency vector with 10,000 slots.
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)

# Stage 4: reweight the term frequencies by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Stage 5: turn the hashtag category string into a numeric label.
label_stringIdx = StringIndexer(inputCol="category", outputCol="label")

# Assemble the stages in execution order; each stage reads the column the
# previous one produced.
pipeline_stages = [Tokenizer1, sw, hashingTF, idf, label_stringIdx]
pipeline = Pipeline(stages=pipeline_stages)

Next step is to split our dataset. We are utilizing 80% for training and 20% for testing

In [11]:
# Random 80% / 20% split into training and test sets; the seed is fixed for reproducibility.
split_weights = [0.8, 0.2]
trainingData, testData = file.randomSplit(split_weights, seed=1005)

#### Preprocessing on training data

Passing our training data through the pipeline that we defined above:

In [13]:
# Fit all pipeline stages on the training split only ...
pipelineFit = pipeline.fit(trainingData)
# ... then run the fitted stages over that same split to produce the
# `features` and `label` columns the classifier is trained on.
dataset = pipelineFit.transform(trainingData)

Training model:

In [15]:
# Baseline classifier settings: at most 20 iterations, regularization
# strength 0.3, and elasticNetParam=0 (ridge penalty only).
lr_settings = dict(maxIter=20, regParam=0.3, elasticNetParam=0)
lr = LogisticRegression(**lr_settings)

# Fit the classifier on the preprocessed training data.
lrModel = lr.fit(dataset)

Next, we save the fitted preprocessing pipeline so that it can be reused later in the streaming step (should we decide to go with this approach).

In [None]:
# Persist the fitted preprocessing pipeline under the data folder,
# replacing any copy saved by an earlier run.
idf_pipeline_dir = os.path.join(parent_path, 'IDF_train')
pipelineFit.write().overwrite().save(idf_pipeline_dir)

In [None]:
#trainingData.groupBy('category').count().show()

In [None]:
#testData.groupBy('category').count().show()

#### Prediction on test data

Let's test our previous trained pipeline on the test data:

In [None]:
# Push the held-out tweets through the already-fitted preprocessing pipeline,
# then score them with the trained logistic regression model.
dataset1 = pipelineFit.transform(testData)
predictions = lrModel.transform(dataset1)

# Spot check: print ten rows that were predicted as label 2.
# NOTE(review): `probability` is a vector column, so this ordering compares the
# whole vector rather than the class-2 probability - confirm that is intended.
inspect_cols = ["category", "filtered", "probability", "label", "prediction"]
predicted_as_two = predictions.filter(predictions['prediction'] == 2)
ranked = predicted_as_two.select(*inspect_cols).orderBy("probability", ascending=False)
ranked.show(n=10, truncate=30)

#### Evaluate model performance

In [None]:
# MulticlassClassificationEvaluator defaults to the F1 metric. This notebook
# reports the score as accuracy, so request that metric explicitly.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(predictions)

Conclusion: this is not a very good model (the score is only about 0.55), so we are going to run a 5-fold cross-validation over a small grid of hyperparameters to see whether the score improves.

### Cross Validation

In [None]:
# Build the model. regParam and elasticNetParam given here are only starting
# values; the grid below overrides them for every candidate.
lr = LogisticRegression(maxIter=20,
                        regParam=0.3,
                        elasticNetParam=0)

# Create ParamGrid for Cross Validation: 3 x 3 = 9 candidate settings.
# Further option: .addGrid(lr.maxIter, [10, 20, 50])  # number of iterations
# (numFeatures belongs to HashingTF, which is outside the estimator tuned
#  here, so it cannot be added to this grid.)
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5])  # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])  # Elastic Net Parameter (Ridge = 0)
             .build())

# Create 5-fold CrossValidator. The seed fixes the fold assignment so that a
# re-run selects among the candidates on the same folds. The evaluator is left
# on its default metric (F1), which is what candidates are ranked by.
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=5,
                    seed=1005)

# Fit the preprocessing pipeline on the training split and featurize it.
pipelineFit = pipeline.fit(trainingData)
dataset1 = pipelineFit.transform(trainingData)

# Run cross validation. This takes a fair amount of time because one model is
# trained per candidate setting and per fold.
cvModel = cv.fit(dataset1)

# Use the test set here so we can measure the model on data it has not seen.
dataset_pred = pipelineFit.transform(testData)
predictions = cvModel.transform(dataset_pred)

# cvModel uses the best model found from the Cross Validation

#### Evaluate model performance

In [None]:
# MulticlassClassificationEvaluator defaults to the F1 metric. This notebook
# reports the score as accuracy, so request that metric explicitly.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy")
evaluator.evaluate(predictions)

In [None]:
# Validate the cross-validated model on the test set.
# Accuracy is requested explicitly for this call, so `acc_lr` really holds the
# accuracy whatever metric `evaluator` was constructed with. The earlier
# duplicate evaluate() call, whose result was discarded, has been removed.
acc_lr = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})

# Confusion counts: number of test rows per (predicted label, true label) pair.
predictions.groupBy('prediction','label').count().show()

Saving the logistic regression model for further implementation:

In [22]:
# Persist the cross-validated model under the data folder, replacing any
# copy saved by an earlier run.
lr_model_dir = os.path.join(parent_path, 'twt_pyspark_LRModel1')
cvModel.write().overwrite().save(lr_model_dir)

0.555961109001589