# Training 1: TF-IDF + Logistic Regression

- Code version: 1.0
- Python version: 3.11.6
- Owner: Aditya Patkar
- File created: 2023-11-16

## Configurations

In [None]:
# Export JAVA_HOME for this process so it points at the Java installation.
# The path is hard-coded to an OpenJDK 8 location; adjust it for other machines.
import os

os.environ.update({"JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64"})

In [None]:
# Necessary imports and one-time library setup for the whole notebook.
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# messages raised through warnings.warn() by the notebook's own cells.
warnings.filterwarnings('ignore')

# Experiment tracking.
import wandb

# findspark is initialised before the pyspark imports below
# (presumably so that pyspark can be located and imported -- confirm for your environment).
import findspark
findspark.init()
findspark.find()  # return value is discarded here

# AWS access (dataset download) and plotting.
import boto3
import matplotlib.pyplot as plt

# Spark core plus the feature-engineering stages used by the preprocessing pipeline.
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Model and evaluation utilities.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
# Log in to Weights & Biases and start the run for this experiment.
# wandb.login() reuses stored credentials and only prompts for an API key when none
# are available (i.e. the first time the notebook runs on a machine). Forcing
# relogin=True would prompt on every execution and block an unattended
# "Restart Kernel -> Run All".
wandb.login()
wandb.init(project="msml651-sentiment-analysis", entity="apatkar", name="tfidf+lr")

In [None]:
# Initialize the Spark context, or reuse the one that is already running.
# Both `sc` and `sqlContext` are always bound after this cell, including when a
# context already exists (e.g. the cell is re-executed or Spark was started elsewhere).
try:
    # create a SparkContext on all CPUs available
    sc = ps.SparkContext('local[*]')
    print("Just created a SparkContext")
except ValueError:
    # Spark raises ValueError when a context is already active; pick that one up.
    # print() is used instead of warnings.warn() because the notebook's imports cell
    # installs a blanket warnings filter that would swallow the message.
    sc = ps.SparkContext.getOrCreate()
    print("SparkContext already exists in this scope")

sqlContext = SQLContext(sc)

## Dataset

In [None]:
# Download the cleaned Sentiment140 parquet file from S3 into ./data.
# Credentials are intentionally NOT passed in code: boto3 resolves them from its
# default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# the shared credentials file, or an attached role), which keeps secrets out of the notebook.
os.makedirs('./data', exist_ok=True)  # download_file does not create the target directory

s3 = boto3.resource('s3', region_name='us-east-1')
bucket = s3.Bucket('msml651')
bucket.download_file('sentiment140_clean_no_stopwords.parquet', './data/sentiment140_clean_no_stopwords.parquet')

In [None]:
# Load the downloaded parquet file as a Spark DataFrame and preview the first rows.
parquet_path = './data/sentiment140_clean_no_stopwords.parquet'
df = sqlContext.read.parquet(parquet_path)
df.show(n=5)

In [None]:
# Hyper-parameters and run settings, kept in one place and mirrored to wandb.
config = dict(
    # logistic regression
    reg_param=0.001,
    max_iter=200,
    elastic_net_param=0.001,
    # train / test / validation split weights
    train_size=0.95,
    test_size=0.025,
    val_size=0.025,
    # TF-IDF featurisation
    tf_hash_size=2 ** 16,
    idf_min_doc_freq=5,
    # free-form tag describing this experiment
    type='tfidf + lr',
)
wandb.config.update(config)

In [None]:
# Randomly partition the data into train / validation / test sets.
# The weights are listed in the same order as the unpacked variables, and the fixed
# seed is passed so the split uses the same random seed on every run.
split_weights = [
    config['train_size'],
    config['val_size'],
    config['test_size'],
]
train_set, val_set, test_set = df.randomSplit(split_weights, seed=2000)

## Preprocessing

In [None]:
# Preprocessing pipeline: tokenize -> hashed term frequencies -> IDF weighting -> label index.

# Split each tweet into words.
tokenizer = Tokenizer(
    inputCol="tweet_without_stopwords",
    outputCol="words",
)

# Hash the words into a fixed-size term-frequency vector; numFeatures is the number of hash buckets.
hashtf = HashingTF(
    numFeatures=config['tf_hash_size'],
    inputCol="words",
    outputCol='tf',
)

# Re-weight term frequencies with IDF so that very common words count for less.
idf = IDF(
    inputCol='tf',
    outputCol="features",
    minDocFreq=config['idf_min_doc_freq'],
)

# Turn the raw `target` column into the numeric `label` column.
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")

# Chain the stages in execution order.
pipeline = Pipeline(stages=[
    tokenizer,
    hashtf,
    idf,
    label_stringIdx,
])


In [None]:
# Fit the preprocessing pipeline on the training split only, then apply the fitted
# pipeline to both the training and the validation split.
pipelineFit = pipeline.fit(train_set)
train_df, val_df = (pipelineFit.transform(split) for split in (train_set, val_set))
train_df.show(n=5)

In [None]:
# Show which indexed `label` corresponds to which original `target` value.
label_target_pairs = train_df.select("label", "target").distinct()
label_target_pairs.show()

## Training

In [None]:
# Build the logistic-regression estimator from the shared config.
lr = LogisticRegression(
    maxIter=config['max_iter'],
    regParam=config['reg_param'],
    elasticNetParam=config['elastic_net_param'],
)

# Train on the transformed training data, then score the validation data.
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)

# Evaluate the validation predictions; the evaluator's result is reported as the AUC.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
auc = evaluator.evaluate(predictions)
print("AUC on validation data = %g" % (auc,))

## Evaluation

In [None]:
# Plot the ROC curve and log it to wandb.
# The ROC data comes from the fitted logistic-regression model's summary (`lrModel`).
# `pipelineFit` only contains the preprocessing stages -- its last stage is the fitted
# StringIndexer, which has no `.summary` -- so it cannot be used here.
# NOTE: the model summary describes the data the model was fitted on, so this is the
# training-set ROC curve, not the validation one.
results = lrModel.summary.roc.select('FPR', 'TPR').toPandas()

fig, ax = plt.subplots()
ax.plot(results['FPR'], results['TPR'])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')

# save to wandb -- done before plt.show() and with the explicit figure handle, because
# showing the figure may close it and leave only an empty current figure to log
wandb.log({"roc": wandb.Image(fig)})

plt.show()

In [None]:
# MulticlassMetrics works on an RDD, so pull the prediction and label columns out as one.
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)

# Confusion matrix
print(metrics.confusionMatrix().toArray())

# Overall accuracy
print("Accuracy: %s" % (metrics.accuracy,))

# Per-class precision, recall and F1.
# NOTE(review): the "negative" / "positive" names assume the StringIndexer mapped
# label 1.0 to the negative class and 0.0 to the positive class -- confirm against the
# label/target table printed after the pipeline fit.
per_class_report = [
    ("Precision for negative: %s", metrics.precision(label=1.0)),
    ("Recall for negative: %s", metrics.recall(label=1.0)),
    ("F1-Score for negative: %s", metrics.fMeasure(label=1.0, beta=1.0)),
    ("Precision for positive: %s", metrics.precision(label=0.0)),
    ("Recall for positive: %s", metrics.recall(label=0.0)),
    ("F1-Score for positive: %s", metrics.fMeasure(label=0.0, beta=1.0)),
]
for template, value in per_class_report:
    print(template % (value,))


## Post-training

In [None]:
# Collect the validation metrics in one dict and send them to wandb in a single call.
final_metrics = {
    "auc": auc,
    "accuracy": metrics.accuracy,
    "precision_negative": metrics.precision(label=1.0),
    "recall_negative": metrics.recall(label=1.0),
    "f1_negative": metrics.fMeasure(label=1.0, beta=1.0),
    "precision_positive": metrics.precision(label=0.0),
    "recall_positive": metrics.recall(label=0.0),
    "f1_positive": metrics.fMeasure(label=0.0, beta=1.0),
}
wandb.log(final_metrics)

# Saving the model and pushing it to wandb is currently disabled:
#lrModel.save("lrModel")
#wandb.save('lrModel')

# Close the wandb run.
wandb.finish()
