# 4.0 Model Training
## 4.1 Model Training

###### Author: Yeap Jie Shen, Gan Yee Jing
###### Last Edited: 01/09/2024

### 4.1.1 Importing Libraries 

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sys
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.mongodbClient import MongoDBClient
from data_stores.redisClient import RedisClient
from data_stores.vectorArrayConverter import VectorArrayConverter

import pickle

### 4.1.2 Initialising Spark Session, MongoDB Client and Redis Client

In [2]:
# Spark entry point for this notebook (getOrCreate reuses a session if one is already active)
spark = (
    SparkSession.builder
    .appName('model training')
    .getOrCreate()
)

# project-local MongoDB wrapper, constructed with its default settings
mongodb_client = MongoDBClient()

# project-local Redis wrapper pointed at the local instance
# NOTE(review): start_now=True presumably launches the Redis service right away
# (the cell prompts for a sudo password) - confirm in data_stores.redisClient
redis_client = RedisClient(host='localhost', port=6379, db=0, start_now=True)

24/09/01 19:31:26 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/01 19:31:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 19:31:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Pinged your deployment. You successfully connected to MongoDB!


[sudo] password for student: 

### 4.1.3 Fetching Data from MongoDB or Redis

In [3]:
# Columns kept from the feature-engineered collections. Train and test use the
# same list so the two DataFrames always share a schema.
SELECTED_COLUMNS = [
    'category_index',
    'author', 'url', 'datetime', 'publisher',
    '1tf_idf_content', '2tf_idf_content', '3tf_idf_content', '4tf_idf_content', '5tf_idf_content', 'tf_idf_headline',
    '1gram_word2vec_content', '2gram_word2vec_content', '3gram_word2vec_content', '4gram_word2vec_content', '5gram_word2vec_content',
    'content_token_count',
]

# Value passed as `seconds` when writing to Redis (5 minutes).
# NOTE(review): presumably the key's expiry time - confirm in RedisClient.set_key_value
CACHE_SECONDS = 5 * 60


def load_feature_engineered_records(cache_key, collection_name):
    """Return the records of one feature-engineered collection as a list.

    Reads the pickled list from Redis when `cache_key` exists; otherwise reads
    `collection_name` from the 'Feature_Engineered_Dataset' database in MongoDB
    and stores the pickled result in Redis under `cache_key`.

    Parameters:
        cache_key: Redis key holding the pickled records.
        collection_name: MongoDB collection to read on a cache miss.

    Returns:
        list of records as produced by `mongodb_client.read_many`.
    """
    if redis_client.exists_key(cache_key):
        # SECURITY: pickle.loads can execute arbitrary code. This is only safe
        # while the Redis instance is trusted and not writable by other parties.
        return pickle.loads(redis_client.get_value(cache_key))

    # NOTE(review): {'_id': 0} is presumably a projection that excludes the
    # MongoDB id field - confirm against MongoDBClient.read_many
    records = list(mongodb_client.read_many('Feature_Engineered_Dataset', collection_name, {'_id': 0}))
    redis_client.set_key_value(cache_key, pickle.dumps(records), seconds = CACHE_SECONDS)
    return records


# Reading data from Redis if cached, otherwise from MongoDB
train_list = load_feature_engineered_records('feature_engineered_train_dataset', 'final_train_set')
test_list = load_feature_engineered_records('feature_engineered_test_dataset', 'final_test_set')

df_train = spark.createDataFrame(train_list).select(*SELECTED_COLUMNS)
df_test = spark.createDataFrame(test_list).select(*SELECTED_COLUMNS)

df_train.show()

                                                                                

+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------------+
|category_index|author|                 url|            datetime|       publisher|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|content_token_count|
+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+------------

### 4.1.4 Transforming Data into a Suitable Format

In [4]:
# Names of the columns passed through VectorArrayConverter.array_to_vector:
# the 1- to 5-gram TF-IDF and word2vec content columns, followed by the headline TF-IDF.
# NOTE(review): array_to_vector presumably turns array columns into Spark ML
# vectors - confirm in data_stores.vectorArrayConverter
vector_columns = []
for n in range(1, 6):
    vector_columns.append(f'{n}tf_idf_content')
    vector_columns.append(f'{n}gram_word2vec_content')
vector_columns.append('tf_idf_headline')

# Replace each column in place (same name, same position) on both datasets
for column_name in vector_columns:
    df_train = df_train.withColumn(
        column_name, VectorArrayConverter.array_to_vector(df_train[column_name])
    )
    df_test = df_test.withColumn(
        column_name, VectorArrayConverter.array_to_vector(df_test[column_name])
    )

df_train.show()

[Stage 1:>                                                          (0 + 1) / 1]

+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------------+
|category_index|author|                 url|            datetime|       publisher|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|content_token_count|
+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+------------

                                                                                

### 4.1.5 Model Training Using Random Forest 

In [5]:
# Don't simply run, it takes around 5 minutes to run
# (5-fold cross-validation over a 5 x 5 hyperparameter grid)

# Columns combined into the single 'features' vector used by the classifier
feature_columns = ['1tf_idf_content',
                   '2tf_idf_content',
                   '3tf_idf_content',
                   '4tf_idf_content',
                   '5tf_idf_content',
                   'tf_idf_headline',
                   '1gram_word2vec_content',
                   '2gram_word2vec_content',
                   '3gram_word2vec_content',
                   '4gram_word2vec_content',
                   '5gram_word2vec_content',
                   'content_token_count']

# assemble features into a vector
assembler = VectorAssembler(inputCols = feature_columns, outputCol='features')

# Only assemble once: df_train is rebound here, and transforming a DataFrame that
# already has the output column raises an error, which would break a cell re-run.
if assembler.getOutputCol() not in df_train.columns:
    df_train = assembler.transform(df_train)

# Define the Random Forest model (fixed seed for reproducible trees)
random_forest = RandomForestClassifier(labelCol='category_index', featuresCol='features', seed=20)

# Define the hyperparameter grid: every numTrees x maxDepth combination is tried
parameter_grid = ParamGridBuilder() \
    .addGrid(random_forest.numTrees, [10, 15, 20, 25, 30]) \
    .addGrid(random_forest.maxDepth, [3, 5, 7, 10, 15]) \
    .build()

# Create the cross-validator; candidates are ranked by F1 on the held-out folds
cross_validator = CrossValidator(estimator = random_forest,
                          estimatorParamMaps = parameter_grid,
                          evaluator = MulticlassClassificationEvaluator(labelCol='category_index', metricName='f1'),
                          numFolds = 5, seed = 20)

# Run the search; cv_model.bestModel holds the model fitted with the best hyperparameters
cv_model = cross_validator.fit(df_train)

24/09/01 19:31:47 WARN DAGScheduler: Broadcasting large task binary with size 1146.6 KiB
24/09/01 19:31:47 WARN DAGScheduler: Broadcasting large task binary with size 1339.7 KiB
24/09/01 19:31:48 WARN DAGScheduler: Broadcasting large task binary with size 1518.2 KiB
24/09/01 19:31:48 WARN DAGScheduler: Broadcasting large task binary with size 1670.5 KiB
24/09/01 19:31:48 WARN DAGScheduler: Broadcasting large task binary with size 1788.9 KiB
24/09/01 19:31:48 WARN DAGScheduler: Broadcasting large task binary with size 1269.0 KiB
24/09/01 19:31:52 WARN DAGScheduler: Broadcasting large task binary with size 1039.7 KiB
24/09/01 19:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1319.8 KiB
24/09/01 19:31:53 WARN DAGScheduler: Broadcasting large task binary with size 1075.1 KiB
24/09/01 19:31:54 WARN DAGScheduler: Broadcasting large task binary with size 1039.7 KiB
24/09/01 19:31:55 WARN DAGScheduler: Broadcasting large task binary with size 1319.8 KiB
24/09/01 19:31:55 WAR

In [8]:
# get the best model found by cross-validation
best_rf_model = cv_model.bestModel

# Persist the model. overwrite() replaces any model saved by a previous run;
# a plain save() raises when the target path already exists.
best_rf_model.write().overwrite().save(r'../model/full_features_model')

# print every parameter of the best model (defaults and tuned values)
for param, value in best_rf_model.extractParamMap().items():
    print(f"{param.name}: {value}")

bootstrap: True
cacheNodeIds: False
checkpointInterval: 10
featureSubsetStrategy: auto
featuresCol: features
impurity: gini
labelCol: category_index
leafCol: 
maxBins: 32
maxDepth: 15
maxMemoryInMB: 256
minInfoGain: 0.0
minInstancesPerNode: 1
minWeightFractionPerNode: 0.0
numTrees: 30
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
seed: 20
subsamplingRate: 1.0


### 4.1.6 Model Evaluation

In [9]:
# Make predictions on the test data.
# Assemble the feature vector only once: df_test is rebound here, and transforming
# a DataFrame that already has the output column raises an error on a cell re-run.
if assembler.getOutputCol() not in df_test.columns:
    df_test = assembler.transform(df_test)
predictions = best_rf_model.transform(df_test)

# Evaluate using Precision, Recall and F1 Score.
# One evaluator is reused; the metric is overridden per call through the param map.
evaluator = MulticlassClassificationEvaluator(labelCol='category_index', predictionCol='prediction')
weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: 'weightedPrecision'})
weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: 'weightedRecall'})
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: 'f1'})

print(f'Weighted Precision: {weighted_precision:.4f}')
print(f'Weighted Recall: {weighted_recall:.4f}')
print(f'F1-Score: {f1_score:.4f}')

24/09/01 19:36:52 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/09/01 19:36:53 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
24/09/01 19:36:54 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB


Weighted Precision: 0.3161
Weigthed Recall: 0.3940
F1-Score: 0.3259


                                                                                

In [10]:
# stop the Redis service and the Spark session
try:
    redis_client.stop_service()
finally:
    # always release the Spark session, even if stopping Redis fails
    spark.stop()

[sudo] password for student: 