# Training 1: Word2Vec + Logistic Regression

- Code version: 1.0
- Python version: 3.11.6
- Owner: Aditya Patkar
- File created: 2023-11-16

## Configurations

In [1]:
#Import os. NOTE: this cell only performs the import; JAVA_HOME itself is NOT set here.
#If Spark cannot locate the Java installation, assign os.environ["JAVA_HOME"] before the SparkContext is created.
import os

In [1]:
#Necessary imports
import warnings
warnings.filterwarnings("ignore")

import wandb

import findspark
findspark.init()
findspark.find()

import boto3
import matplotlib.pyplot as plt

import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.feature import Tokenizer, NGram, Word2Vec, StringIndexer
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
#start a wandb run for this experiment (project / account / run name are fixed for this notebook)
#wandb.login(relogin=True ) #uncomment this line if you are running this code for the first time
wandb.init(
    project="msml651-sentiment-analysis",
    entity="apatkar",
    name="w2v_lr",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mapatkar[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
#initialize spark context
# `sc` and `sqlContext` are bound on BOTH paths below, so the following cells work even when
# a SparkContext already existed (previously the except branch left them undefined).
try:
    # create SparkContext on all CPUs available
    sc = ps.SparkContext( 'local[*]' )
    print("Just created a SparkContext")
except ValueError:
    # Only one SparkContext may be active per JVM; reuse the running one instead of failing later
    # with a NameError. Note: warnings are globally silenced in the imports cell, so this
    # message is only visible if that filter is removed.
    sc = ps.SparkContext.getOrCreate()
    warnings.warn("SparkContext already exists in this scope")

# SQL entry point used by the data-loading cell
sqlContext = SQLContext(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/01 15:01:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/01 15:01:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/01 15:01:28 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/12/01 15:01:28 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


Just created a SparkContext


In [4]:
# raise Spark's log threshold from the default "WARN" (see the startup output above) to "ERROR"
sc.setLogLevel("ERROR")

## Dataset

In [5]:
#get the data from s3
# SECURITY: AWS credentials must never be embedded in the notebook. boto3 resolves them from its
# default provider chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables,
# ~/.aws/credentials, or an attached IAM role). The key pair that used to be hardcoded in this
# cell has been exposed and must be rotated/revoked in the AWS console.
os.makedirs('./data', exist_ok=True)  # download_file does not create the target directory
s3 = boto3.resource('s3', region_name='us-east-1')
bucket = s3.Bucket('msml651')
bucket.download_file('sentiment140_clean_no_stopwords.parquet', './data/sentiment140_clean_no_stopwords.parquet')

In [8]:
#load the downloaded parquet file as a Spark DataFrame and preview the first five rows
parquet_path = './data/sentiment140_clean_no_stopwords.parquet'
df = sqlContext.read.parquet(parquet_path)
df.show(n=5)

+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+
|target|  tweet_id|                date|query_flag|      user_name|               tweet|post_clean_length|pre_clean_length|pre_clean_words|post_clean_words|tweet_without_stopwords|
+------+----------+--------------------+----------+---------------+--------------------+-----------------+----------------+---------------+----------------+-----------------------+
|     0|1467810369|Mon Apr 06 22:19:...|  NO_QUERY|_TheSpecialOne_|awww that s a bum...|               44|             115|             19|               8|   awww bummer shoul...|
|     0|1467810672|Mon Apr 06 22:19:...|  NO_QUERY|  scotthamilton|is upset that he ...|               69|             111|             21|              11|   upset update face...|
|     0|1467810917|Mon Apr 06 22:19:...|  NO_QUERY|       mattycus|i dived many time...|       

In [6]:
#Experiment hyperparameters; the same dict is mirrored to wandb so the run is reproducible
config = dict(
    type='w2v + lr',
    # Word2Vec settings
    vector_size=65,
    min_count=5,
    window_size=5,
    # Split weights handed to DataFrame.randomSplit. NOTE(review): they sum to 1.45, not 1.0;
    # Spark normalizes split weights, so these act as relative weights (roughly 65.5% / 17.2% / 17.2%),
    # not literal fractions - confirm this is the intended split.
    train_size=0.95,
    test_size=0.25,
    val_size=0.25,
    # LogisticRegression settings
    max_iter=100,
    reg_param=0.1,
    elastic_net_param=0.1,
)
wandb.config.update(config)

In [9]:
#randomly partition the data into train / validation / test sets (fixed seed for repeatability)
split_weights = [config[key] for key in ('train_size', 'val_size', 'test_size')]
train_set, val_set, test_set = df.randomSplit(split_weights, seed=2000)

## Preprocessing

In [10]:
def create_w2v_pipeline(input_column = 'tweet_without_stopwords', target_column = 'target', n=2):
    """
    Create the Word2Vec + Logistic Regression pipeline.

    Stages, in order: Tokenizer -> NGram -> Word2Vec -> StringIndexer -> LogisticRegression.
    All hyperparameters are read from the notebook-level ``config`` dict, so ``config``
    must be defined before this function is called.

    Args:
        input_column: name of the text column that gets tokenized.
        target_column: name of the column that StringIndexer converts into ``label``.
        n: n-gram size passed to NGram; Word2Vec is trained on the n-grams, not on the raw tokens.

    Returns:
        An unfitted ``pyspark.ml.Pipeline``.
    """

    #tokenize the tweets
    tokenizer = Tokenizer(inputCol=input_column, outputCol="tokens")

    #create ngrams
    ngram = NGram(n=n, inputCol="tokens", outputCol="ngrams")

    #create word2vec
    # windowSize is now taken from config: 'window_size' was logged to wandb but never applied,
    # so changing it in the config cell silently had no effect on the model.
    word2Vec = Word2Vec(
        vectorSize=config['vector_size'],
        minCount=config['min_count'],
        windowSize=config['window_size'],
        inputCol="ngrams",
        outputCol="features",
    )

    #label
    # StringIndexer assigns indices by descending label frequency by default, so which original
    # target value becomes 0.0 / 1.0 depends on the training data.
    label_stringIdx = StringIndexer(inputCol = target_column, outputCol = 'label')

    #lr
    lr = LogisticRegression(maxIter=config['max_iter'], regParam=config['reg_param'], elasticNetParam=config['elastic_net_param'], featuresCol='features', labelCol='label')

    #create the pipeline
    pipeline = Pipeline(stages=[tokenizer, ngram, word2Vec, label_stringIdx, lr])

    return pipeline



    
    

## Training

In [11]:
# build the (still unfitted) pipeline with the default arguments: 'tweet_without_stopwords' text, 'target' labels, n=2
pipeline = create_w2v_pipeline()

In [12]:
#fit every pipeline stage on the training split, then apply the fitted pipeline to the
#validation split; the resulting `predictions` DataFrame is what the evaluation cells read
pipelineFit = pipeline.fit(train_set)
predictions = pipelineFit.transform(val_set)

                                                                                

## Evaluation

In [13]:
# peek at the first ten validation predictions together with the model's scores
preview_columns = ['tweet_without_stopwords', 'rawPrediction', 'prediction', 'probability']
predictions.select(*preview_columns).show(10)



+-----------------------+--------------------+----------+--------------------+
|tweet_without_stopwords|       rawPrediction|prediction|         probability|
+-----------------------+--------------------+----------+--------------------+
|   upset update face...|[0.22778971622185...|       0.0|[0.55670245866520...|
|   hey long time see...|[-0.1348445418892...|       1.0|[0.46633985275736...|
|                   nope|[-0.1017402144226...|       1.0|[0.47458686371214...|
|      day get much done|[1.06885400881405...|       0.0|[0.74437891891001...|
|      im sad miss lilly|[1.21706153282713...|       0.0|[0.77154602042009...|
|   hacked account ai...|[-0.1252114518831...|       1.0|[0.46873796996256...|
|   want go promote g...|[0.27330963887894...|       0.0|[0.56790523536298...|
|   wow tons replies ...|[-0.0185768140466...|       1.0|[0.49535593004251...|
|   leaving parking l...|[0.10916100303523...|       0.0|[0.52726318350969...|
|   sure right need s...|[0.41175833383571...|      

                                                                                

In [14]:
#score the validation predictions with area under the ROC curve
#(labelCol and metricName are spelled out here; both are the evaluator's default values)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
auc = evaluator.evaluate(predictions)
print("AUC on validation data = %g" % auc)

                                                                                

AUC on validation data = 0.65797


In [15]:
# MulticlassMetrics consumes an RDD of (prediction, label) pairs, hence the .rdd conversion
predictionAndLabels = predictions.select("prediction", "label").rdd
metrics = MulticlassMetrics(predictionAndLabels)

# Per-class scores, computed once and reused for both the printout and the macro averages.
# NOTE(review): this cell treats label 1.0 as "negative" and 0.0 as "positive"; the label indices
# come from the pipeline's StringIndexer - confirm the mapping against the fitted indexer's labels.
neg_precision = metrics.precision(label=1.0)
neg_recall = metrics.recall(label=1.0)
neg_f1 = metrics.fMeasure(label=1.0, beta=1.0)

pos_precision = metrics.precision(label=0.0)
pos_recall = metrics.recall(label=0.0)
pos_f1 = metrics.fMeasure(label=0.0, beta=1.0)

# Macro averages: unweighted mean of the two per-class scores
precision = (neg_precision + pos_precision)/2
recall = (neg_recall + pos_recall)/2
f1 = (neg_f1 + pos_f1)/2

# Report: confusion matrix, accuracy, per-class scores, macro averages
print(metrics.confusionMatrix().toArray())
print("Accuracy: %s" % (metrics.accuracy))

print("Precision for negative: %s" % (neg_precision))
print("Recall for negative: %s" % (neg_recall))
print("F1-Score for negative: %s" % (neg_f1))

print("Precision for positive: %s" % (pos_precision))
print("Recall for positive: %s" % (pos_recall))
print("F1-Score for positive: %s" % (pos_f1))

print("Macro Precision: %s" % (precision))
print("Macro Recall: %s" % (recall))
print("Macro F1-Score: %s" % (f1))






[[ 62832.  74862.]
 [ 33786. 104416.]]
Accuracy: 0.6061994374691912
Precision for negative: 0.5824250605205323
Recall for negative: 0.7555317578616807
F1-Score for negative: 0.6577800176389064
Precision for positive: 0.6503136061603428
Recall for positive: 0.45631617935422025
F1-Score for positive: 0.5363105602786029
Macro Precision: 0.6163693333404376
Macro Recall: 0.6059239686079505
Macro F1-Score: 0.5970452889587546


                                                                                

## Post-training

In [16]:
# gather every validation metric into one payload and send it to wandb in a single call
run_metrics = {
    "auc": auc,
    "accuracy": metrics.accuracy,
    "precision_negative": metrics.precision(label=1.0),
    "recall_negative": metrics.recall(label=1.0),
    "f1_negative": metrics.fMeasure(label=1.0, beta=1.0),
    "precision_positive": metrics.precision(label=0.0),
    "recall_positive": metrics.recall(label=0.0),
    "f1_positive": metrics.fMeasure(label=0.0, beta=1.0),
    "macro_precision": precision,
    "macro_recall": recall,
    "macro_f1": f1,
}
wandb.log(run_metrics)

# model persistence and upload are currently disabled (kept commented out)

#pipeline.save("svmModel-ngram-tfidf")

#wandb.save('svmModel-ngram-tfidf')

# close the wandb run
wandb.finish()


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
f1_negative,▁
f1_positive,▁
macro_f1,▁
macro_precision,▁
macro_recall,▁
precision_negative,▁
precision_positive,▁
recall_negative,▁

0,1
accuracy,0.6062
auc,0.65797
f1_negative,0.65778
f1_positive,0.53631
macro_f1,0.59705
macro_precision,0.61637
macro_recall,0.60592
precision_negative,0.58243
precision_positive,0.65031
recall_negative,0.75553
