# Training 1: TF-IDF + Logistic Regression

- Code version: 1.0
- Python version: 3.11.6
- Owner: Aditya Patkar
- File created: 2023-11-16

## Configurations

In [1]:
#Set the JAVA_HOME environment variable to the path of Java installation.
import os

In [None]:
config = {'train_size':0.8, 'test_size':0.2, 'embedding_dim':200, 'dropout':0.2,'batch_size':1024, 'epochs':15, 'factor_lr': 0.1, 'min_lr':0.01}

In [2]:
#Necessary imports
import warnings
warnings.filterwarnings('ignore')

import wandb

import findspark
findspark.init()
findspark.find()

import boto3
import matplotlib.pyplot as plt

import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import MulticlassMetrics

In [21]:
#login to wandb and initialize the project
wandb.login(relogin=True ) #uncomment this line if you are running this code for the first time
wandb.init(project="msml651-sentiment-analysis", entity="apatkar", name="tfidf+lr")

In [22]:
#initialize spark context
try:
    # create SparkContext on all CPUs available)
    sc = ps.SparkContext( 'local[*]' )
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")



## Dataset

In [10]:
#get the data from s3
s3 = boto3.resource('s3', region_name='us-east-1', aws_access_key_id="AKIAVMCC766MHUJBYMEJ", aws_secret_access_key="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")
bucket = s3.Bucket('msml651')
bucket.download_file('sentiment140_clean_no_stopwords.parquet', './data/sentiment140_clean_no_stopwords.parquet')

In [23]:
#read the data into a spark dataframe
df = sqlContext.read.parquet('./data/sentiment140_clean_no_stopwords.parquet')
df.show(5)

+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+
|target|  tweet_id|                date|query_flag|      user_name|               tweet|post_clean_length|pre_clean_length|pre_clean_words|post_clean_words|tweet_without_stopwords|
+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+
|     0|1467810369|Mon Apr 06 22:19:...|  NO_QUERY|_TheSpecialOne_|awww that s a bum...|               44|             115|             19|               8|   awww bummer shoul...|
|     0|1467810672|Mon Apr 06 22:19:...|  NO_QUERY|  scotthamilton|is upset that he ...|               69|             111|             21|              11|   upset update face...|
|     0|1467810917|Mon Apr 06 22:19:...|  NO_QUERY|       mattycus|i dived many time...|       

In [24]:
#Set the config parameters
config = {
    'reg_param': 0.001, 
    'max_iter': 200,   
    'elastic_net_param': 0.001,
    'train_size': 0.95,
    'test_size': 0.025,
    'val_size' : 0.025,
    'tf_hash_size': 2**16,
    'idf_min_doc_freq': 5,
    'type': 'tfidf + lr',
}
wandb.config.update(config)

In [25]:
#split the data into train, test and validation sets
(train_set, val_set, test_set) = df.randomSplit([config['train_size'], config['val_size'], config['test_size']], seed = 2000)

## Preprocessing

In [26]:
#Create a pipeline to transform the data 
tokenizer = Tokenizer(inputCol="tweet_without_stopwords", outputCol="words") #split tweet into words
hashtf = HashingTF(numFeatures=config['tf_hash_size'], inputCol="words", outputCol='tf') #hash the words into vectors, num features is the number of buckets to hash into
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=config['idf_min_doc_freq']) #use idf to reduce the importance of words that appear frequently
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label") #convert the target into a label
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx]) #create a pipeline


In [27]:
#fit the pipeline to the training data and transform the data
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

23/12/01 13:04:55 WARN DAGScheduler: Broadcasting large task binary with size 1105.5 KiB
23/12/01 13:04:55 WARN DAGScheduler: Broadcasting large task binary with size 1105.5 KiB

+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+--------------------+--------------------+--------------------+-----+
|target|  tweet_id|                date|query_flag|      user_name|               tweet|post_clean_length|pre_clean_length|pre_clean_words|post_clean_words|tweet_without_stopwords|               words|                  tf|            features|label|
+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+--------------------+--------------------+--------------------+-----+
|     0|1467810369|Mon Apr 06 22:19:...|  NO_QUERY|_TheSpecialOne_|awww that s a bum...|               44|             115|             19|               8|   awww bummer shoul...|[awww, bummer, sh...|(65536,[21640,272...|(65536,[21640,272...|  0.0|


                                                                                

In [16]:
#unique labels and the target for context
train_df.select("label", "target").distinct().show()

23/12/01 12:54:00 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.


+-----+------+
|label|target|
+-----+------+
|  0.0|     4|
|  1.0|     0|
+-----+------+



                                                                                

## Training

In [28]:
lr = LogisticRegression(maxIter=config['max_iter'], regParam=config['reg_param'], elasticNetParam=config['elastic_net_param']) #create a logistic regression model
lrModel = lr.fit(train_df) #fit the model to the training data
predictions = lrModel.transform(val_df) #use the model to make predictions on the validation data

#evaluate the predictions
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction") #create an evaluator
auc = evaluator.evaluate(predictions) #evaluate the predictions, this is the AUC
print("AUC on validation data = %g" % auc)

23/12/01 13:05:03 WARN DAGScheduler: Broadcasting large task binary with size 1121.8 KiB
23/12/01 13:05:13 WARN DAGScheduler: Broadcasting large task binary with size 1123.0 KiB
23/12/01 13:05:13 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
23/12/01 13:05:24 WARN DAGScheduler: Broadcasting large task binary with size 1123.6 KiB
23/12/01 13:05:24 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1123.6 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1123.6 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1123.6 KiB
23/12/01 13:05:25 WARN DAGScheduler: Broadcasting large task binary with size 1122.4 KiB
23/12/01 13:05:25 WAR

AUC on validation data = 0.841009


## Evaluation

In [None]:
#plot the ROC curve
results = pipelineFit.stages[-1].summary.roc.select('FPR', 'TPR').toPandas()
plt.plot(results['FPR'], results['TPR'])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()

# save to wandb
wandb.log({"roc": wandb.Image(plt)})

In [29]:
predictionAndLabels = predictions.select("prediction", "label").rdd #get the predictions and labels as an rdd because the MulticlassMetrics class needs an rdd
metrics = MulticlassMetrics(predictionAndLabels)


# Get confusion matrix
print(metrics.confusionMatrix().toArray()) 

# Get accuracy
print("Accuracy: %s" % (metrics.accuracy))

# Get precision, recall, f1

print("Precision for negative: %s" % (metrics.precision(label=1.0)))
print("Recall for negative: %s" % (metrics.recall(label=1.0)))
print("F1-Score for negative: %s" % (metrics.fMeasure(label=1.0, beta=1.0)))

print("Precision for positive: %s" % (metrics.precision(label=0.0)))
print("Recall for positive: %s" % (metrics.recall(label=0.0)))
print("F1-Score for positive: %s" % (metrics.fMeasure(label=0.0, beta=1.0)))


23/12/01 13:06:08 WARN DAGScheduler: Broadcasting large task binary with size 1625.2 KiB
23/12/01 13:06:08 WARN DAGScheduler: Broadcasting large task binary with size 1625.2 KiB
23/12/01 13:06:11 WARN DAGScheduler: Broadcasting large task binary with size 1638.5 KiB

[[15023.  5061.]
 [ 4295. 15821.]]
Accuracy: 0.7672636815920398
Precision for negative: 0.7576381572646298
Recall for negative: 0.7864883674686817
F1-Score for negative: 0.7717937460363921
Precision for positive: 0.777668495703489
Recall for positive: 0.7480083648675563
F1-Score for positive: 0.7625501243591695


                                                                                

## Post-training

In [30]:
# log the results
wandb.log({"auc": auc, "accuracy": metrics.accuracy, "precision_negative": metrics.precision(label=1.0), "recall_negative": metrics.recall(label=1.0), "f1_negative": metrics.fMeasure(label=1.0, beta=1.0), "precision_positive": metrics.precision(label=0.0), "recall_positive": metrics.recall(label=0.0), "f1_positive": metrics.fMeasure(label=0.0, beta=1.0)})

# save the model

#lrModel.save("lrModel")

# push the model to wandb
#wandb.save('lrModel')

# finish the run
wandb.finish()


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
f1_negative,▁
f1_positive,▁
precision_negative,▁
precision_positive,▁
recall_negative,▁
recall_positive,▁

0,1
accuracy,0.76726
auc,0.84101
f1_negative,0.77179
f1_positive,0.76255
precision_negative,0.75764
precision_positive,0.77767
recall_negative,0.78649
recall_positive,0.74801
