# Training 1: N-Gram + TfIdf using Count Vectorizer + SVM

- Code version: 1.0
- Python version: 3.11.6
- Owner: Aditya Patkar
- File created: 2023-11-16

## Configurations

In [None]:
#Set the JAVA_HOME environment variable to the path of the Java installation (NOTE: only the `os` import is present in this cell; the assignment itself is not shown).
import os

In [None]:
#Necessary imports
import warnings
warnings.filterwarnings('ignore')

import wandb

import findspark
findspark.init()
findspark.find()

import boto3
import matplotlib.pyplot as plt

import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.feature import IDF, Tokenizer, CountVectorizer, StringIndexer, NGram,  VectorAssembler
from pyspark.ml import Pipeline

from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Start a Weights & Biases run for this experiment; the config and metrics recorded
# further down in the notebook are attached to this run.
# Uncomment the next line the first time you run this notebook to (re)authenticate with wandb:
#wandb.login(relogin=True )
wandb.init(project="msml651-sentiment-analysis", entity="apatkar", name="tfidf+ngram+svm")

In [None]:
# Initialize the Spark context and the SQLContext used to read the data.
try:
    # Create a local SparkContext that uses all available CPU cores.
    sc = ps.SparkContext('local[*]')
    print("Just created a SparkContext")
except ValueError:
    # The constructor is expected to raise ValueError when a SparkContext is already
    # active; reuse that one so `sc` is always bound after this cell runs.
    sc = ps.SparkContext.getOrCreate()
    # Printed rather than warned: warnings are globally silenced by
    # warnings.filterwarnings('ignore') in the imports cell, so a warning would never show.
    print("SparkContext already exists in this scope")

# Build the SQLContext in both cases so later cells can rely on `sqlContext`.
sqlContext = SQLContext(sc)

## Dataset

In [None]:
# Get the data from S3.
# Credentials are deliberately NOT hard-coded: boto3 resolves them from its default
# credential chain (e.g. the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables or ~/.aws/credentials), which keeps secrets out of the notebook.
s3 = boto3.resource('s3', region_name='us-east-1')
bucket = s3.Bucket('msml651')

# The download target lives in ./data; make sure the directory exists first,
# otherwise the download fails when it tries to write the local file.
os.makedirs('./data', exist_ok=True)
bucket.download_file('sentiment140_clean_no_stopwords.parquet', './data/sentiment140_clean_no_stopwords.parquet')

In [None]:
# Read the downloaded parquet file into a Spark DataFrame
df = sqlContext.read.parquet('./data/sentiment140_clean_no_stopwords.parquet')
# Print the first 5 rows as a quick sanity check of the loaded data
df.show(5)

In [None]:
# Experiment configuration: data-split fractions, feature-extraction settings
# (CountVectorizer vocabulary size, IDF minimum document frequency) and the
# LinearSVC hyperparameters. The same values are recorded on the wandb run.
config = dict(
    train_size=0.95,
    test_size=0.025,
    val_size=0.025,
    vocab_size=5000,
    idf_min_doc_freq=5,
    type='tfidf + ngram + svm',
    max_iter=100,
    reg_param=0.3,
)
wandb.config.update(config)

In [None]:
# Randomly partition the data into train / validation / test sets.
# The weights are listed in the same order as the names they are unpacked into,
# and the seed is fixed so the split is repeatable.
split_weights = [config[key] for key in ('train_size', 'val_size', 'test_size')]
train_set, val_set, test_set = df.randomSplit(split_weights, seed=2000)

## Preprocessing

In [None]:
def create_ngrams(input_column = 'tweet_without_stopwords', target_column = 'target', n=3):
    """
    Build an unfitted Pipeline that feeds 1..n-gram TF-IDF features to a linear SVM.

    Stage order: Tokenizer on `input_column` -> one NGram stage per order 1..n ->
    one CountVectorizer per order -> one IDF per order -> VectorAssembler joining
    the per-order TF-IDF vectors into "features" -> StringIndexer turning
    `target_column` into "label" -> LinearSVC.

    The vocabulary size, IDF minimum document frequency and the SVM's maxIter /
    regParam are read from the module-level `config` dict.

    Returns the Pipeline; nothing is fitted here.
    """
    stages = [Tokenizer(inputCol=input_column, outputCol="words")]

    orders = range(1, n + 1)
    ngram_stages, tf_stages, idf_stages = [], [], []
    for order in orders:
        # Column names chain the three per-order stages together.
        grams_col, tf_col, tfidf_col = f"{order}_grams", f"{order}_tf", f"{order}_tfidf"
        ngram_stages.append(NGram(n=order, inputCol="words", outputCol=grams_col))
        tf_stages.append(CountVectorizer(vocabSize=config['vocab_size'], inputCol=grams_col, outputCol=tf_col))
        idf_stages.append(IDF(minDocFreq=config['idf_min_doc_freq'], inputCol=tf_col, outputCol=tfidf_col))

    # Keep the grouped ordering: all n-gram stages, then all counts, then all IDFs.
    stages += ngram_stages + tf_stages + idf_stages
    stages.append(VectorAssembler(inputCols=[f"{order}_tfidf" for order in orders], outputCol="features"))
    stages.append(StringIndexer(inputCol=target_column, outputCol='label'))
    stages.append(LinearSVC(maxIter=config['max_iter'], regParam=config['reg_param']))

    return Pipeline(stages=stages)
    

## Training

In [None]:
# Build the unfitted pipeline with create_ngrams' defaults
# (input column 'tweet_without_stopwords', target column 'target', n=3).
pipeline = create_ngrams()

In [None]:
# Fit all pipeline stages on the training split, then apply the fitted pipeline
# to the validation split to obtain predictions.
# NOTE(review): test_set is not used anywhere in the code shown here.
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)

## Evaluation

In [None]:
# Score the validation predictions with Spark's binary-classification evaluator,
# reading the scores from the "rawPrediction" column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# No metricName is passed, so the evaluator's default metric is used; the
# notebook treats the result as the AUC.
auc = evaluator.evaluate(predictions)
print(f"AUC on validation data = {auc:g}")

In [None]:
# MulticlassMetrics needs an RDD, so pass the (prediction, label) columns as an RDD
# rather than a DataFrame.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)

# Confusion matrix
print(metrics.confusionMatrix().toArray())

# Overall accuracy
print(f"Accuracy: {metrics.accuracy}")

# Per-class precision / recall / F1, each computed once and reused below.
# NOTE(review): "label" is produced by StringIndexer, so which sentiment is encoded
# as 1.0 vs 0.0 depends on how it orders the target values -- confirm that 1.0
# really is the negative class before trusting the names printed here.
neg_precision = metrics.precision(label=1.0)
neg_recall = metrics.recall(label=1.0)
neg_f1 = metrics.fMeasure(label=1.0, beta=1.0)

pos_precision = metrics.precision(label=0.0)
pos_recall = metrics.recall(label=0.0)
pos_f1 = metrics.fMeasure(label=0.0, beta=1.0)

print(f"Precision for negative: {neg_precision}")
print(f"Recall for negative: {neg_recall}")
print(f"F1-Score for negative: {neg_f1}")

print(f"Precision for positive: {pos_precision}")
print(f"Recall for positive: {pos_recall}")
print(f"F1-Score for positive: {pos_f1}")

# Macro average = unweighted mean of the two per-class values
precision = (neg_precision + pos_precision) / 2
recall = (neg_recall + pos_recall) / 2
f1 = (neg_f1 + pos_f1) / 2

print(f"Macro Precision: {precision}")
print(f"Macro Recall: {recall}")
print(f"Macro F1-Score: {f1}")




## Post-training

In [None]:
# Log every validation metric to the active wandb run in a single call.
# In this notebook label 1.0 is reported as "negative" and 0.0 as "positive".
run_metrics = {
    "auc": auc,
    "accuracy": metrics.accuracy,
    "precision_negative": metrics.precision(label=1.0),
    "recall_negative": metrics.recall(label=1.0),
    "f1_negative": metrics.fMeasure(label=1.0, beta=1.0),
    "precision_positive": metrics.precision(label=0.0),
    "recall_positive": metrics.recall(label=0.0),
    "f1_positive": metrics.fMeasure(label=0.0, beta=1.0),
    "macro_precision": precision,
    "macro_recall": recall,
    "macro_f1": f1,
}
wandb.log(run_metrics)

# Saving the model is currently disabled.
# NOTE(review): `pipeline` is the unfitted Pipeline and `pipelineFit` is the trained
# model -- confirm which one should be saved before re-enabling this line.
#pipeline.save("svmModel-ngram-tfidf")

# Uploading the saved model to wandb is currently disabled as well.
#wandb.save('svmModel-ngram-tfidf')

# Mark the wandb run as finished.
wandb.finish()
