# 3.0 Data Annotation 
## 3.3 Feature Engineering

###### Author: Yeap Jie Shen, Gan Yee Jing, Thong Pei Ting
###### Last Edited: 01/09/2024

### 3.3.1 Importing Libraries 

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Word2Vec, NGram, CountVectorizer, StringIndexer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import udf, size
from pyspark.sql.types import ArrayType, FloatType

import sys
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.mongodbClient import MongoDBClient
from data_stores.redisClient import RedisClient
from data_stores.vectorArrayConverter import VectorArrayConverter

import pickle

### 3.3.2 Initialising Spark Session, MongoDB Client and Redis Client

In [2]:
# Spark entry point for every DataFrame / ML operation in this notebook
spark = (
    SparkSession.builder
    .appName('feature engineering')
    .getOrCreate()
)

# Project wrapper around MongoDB; it is constructed without arguments here,
# so any connection settings are resolved inside the class itself
mongodb_client = MongoDBClient()

# Project wrapper around a local Redis instance used as a short-lived cache.
# NOTE(review): start_now=True presumably launches the Redis service (the recorded
# output shows a sudo prompt right after this cell) - confirm in RedisClient.
redis_settings = {'host': 'localhost', 'port': 6379, 'db': 0, 'start_now': True}
redis_client = RedisClient(**redis_settings)

24/09/01 19:27:58 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/01 19:27:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 19:27:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Pinged your deployment. You successfully connected to MongoDB!


[sudo] password for student: 

### 3.3.3 Fetching Split Dataset from MongoDB or Redis

In [3]:
def _fetch_split(cache_key, collection_name):
    """Return one dataset split as a list of records, preferring the Redis cache.

    On a cache hit the pickled list stored under ``cache_key`` is returned.
    On a miss the records are read from the ``collection_name`` collection of the
    'Split_Dataset' database and written back to Redis with ``seconds = 5 * 60``.
    """
    if redis_client.exists_key(cache_key):
        # pickle is acceptable here only because this pipeline writes the cache itself
        return pickle.loads(redis_client.get_value(cache_key))

    # {'_id': 0} is passed so that the MongoDB id column is excluded from the records
    records = list(mongodb_client.read_many('Split_Dataset', collection_name, {'_id': 0}))
    redis_client.set_key_value(cache_key, pickle.dumps(records), seconds = 5 * 60)
    return records


train_list = _fetch_split('train_dataset', 'train_set')
test_list = _fetch_split('test_dataset', 'test_set')

# Let Spark infer the schema from the record dictionaries
df_train = spark.createDataFrame(train_list)
df_test = spark.createDataFrame(test_list)

df_train.show()

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/home/student/de-prj/de-venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 193, in manager
  File "/home/student/de-prj/de-venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 62, in worker
  File "/home/student/de-prj/de-venv/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 562, in loads
    s = stream.read(length)
ValueError: read length must be non-negative or -1
                                                                                

+------+--------------------+--------------------+----+----------------+--------------------+--------------------+--------------------+
|author|            category|            datetime| key|       publisher|   tokenised_content|  tokenised_headline|                 url|
+------+--------------------+--------------------+----+----------------+--------------------+--------------------+--------------------+
|      |       drug offences|2024-01-17T17:35:...|k317|Selangor Journal|[mother, among, t...|[mother, among, t...|https://selangorj...|
|      |       drug offences|2024-01-17T22:10:...|k312|Selangor Journal|[royal, custom, d...|[custom, foil, at...|https://selangorj...|
|      |       drug offences|2024-01-18T22:33:...|k309|Selangor Journal|[police, drug, ap...|[suspected, drug,...|https://selangorj...|
|      |       drug offences|2024-01-21T21:18:...|k300|Selangor Journal|[foreign, suspici...|[foreign, armed, ...|https://selangorj...|
|      |       drug offences|2024-01-23T18:18:..

### 3.3.4 Feature Engineering
#### 3.3.4.1 Token Count (UniGram)

In [4]:
# Unigram length feature: number of tokens in each article's tokenised content.
# The same column expression is applied to both splits.
token_count = size('tokenised_content')

df_train = df_train.withColumn('content_token_count', token_count)
df_test = df_test.withColumn('content_token_count', token_count)

df_train.select('tokenised_content', 'content_token_count').show()

+--------------------+-------------------+
|   tokenised_content|content_token_count|
+--------------------+-------------------+
|[mother, among, t...|                 87|
|[royal, custom, d...|                 83|
|[police, drug, ap...|                 75|
|[foreign, suspici...|                 91|
|[former, policema...|                126|
|[total, various, ...|                 80|
|[total, three, va...|                 70|
|[high, court, tod...|                 95|
|[shah, police, co...|                 75|
|[medium, police, ...|                 45|
|[march, police, f...|                 60|
|[march, police, w...|                 28|
|[town, march, pen...|                 94|
|[march, staff, pr...|                 60|
|[june, police, co...|                 83|
|[june, police, ni...|                 73|
|[june, court, tod...|                 61|
|[june, court, app...|                 54|
|[police, uncovere...|                 99|
|[shah, eleven, ba...|                 80|
+----------

#### 3.3.4.2 NGram

In [5]:
# Derive 2-, 3-, 4- and 5-gram columns from the tokenised content.
# NGram is a stateless transformer, so one instance serves both splits.
for order in range(2, 6):
    ngram_builder = NGram(n=order, inputCol='tokenised_content', outputCol=f'{order}gram_content')
    df_train, df_test = ngram_builder.transform(df_train), ngram_builder.transform(df_test)

ngram_columns = [f'{order}gram_content' for order in range(2, 6)]
df_train.select(*ngram_columns, 'category').show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|       2gram_content|       3gram_content|       4gram_content|       5gram_content|            category|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|[mother among, am...|[mother among thr...|[mother among thr...|[mother among thr...|       drug offences|
|[royal custom, cu...|[royal custom dep...|[royal custom dep...|[royal custom dep...|       drug offences|
|[police drug, dru...|[police drug appr...|[police drug appr...|[police drug appr...|       drug offences|
|[foreign suspicio...|[foreign suspicio...|[foreign suspicio...|[foreign suspicio...|       drug offences|
|[former policeman...|[former policeman...|[former policeman...|[former policeman...|       drug offences|
|[total various, v...|[total various th...|[total various th...|[total various th...|       drug offences|
|[total three, thr...|[total three va

#### 3.3.4.3 TF-IDF (apply to ngram_content, tokenised_headline)

In [6]:
# Rename the raw token column so unigrams follow the same '{n}gram_content'
# naming scheme as the 2- to 5-gram columns built earlier.
df_train = df_train.withColumnRenamed('tokenised_content', '1gram_content')
df_test = df_test.withColumnRenamed('tokenised_content', '1gram_content')

# --- Term frequency (hashing trick) ---
# NOTE(review): numFeatures=20 hashes every term into only 20 buckets, so many
# distinct terms collide; kept unchanged to preserve the stored feature shape.

# n-gram content (n = 1..5)
for i in range(1, 6):
    hashing_tf_content = HashingTF(inputCol=f'{i}gram_content', outputCol=f'{i}tf_content', numFeatures=20)
    df_train = hashing_tf_content.transform(df_train)
    df_test = hashing_tf_content.transform(df_test)

# headline
hashing_tf_headline = HashingTF(inputCol='tokenised_headline', outputCol='tf_headline', numFeatures=20)
df_train = hashing_tf_headline.transform(df_train)
df_test = hashing_tf_headline.transform(df_test)

# --- Inverse document frequency ---
# Each IDF model is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the features.

# n-gram content (n = 1..5)
for i in range(1, 6):
    idf_content = IDF(inputCol=f'{i}tf_content', outputCol=f'{i}tf_idf_content')
    idf_content_model = idf_content.fit(df_train)
    # write().overwrite() instead of save(): plain save() raises when the target
    # path already exists, which made this cell fail on every re-run.
    idf_content_model.write().overwrite().save(rf'../model/{i}_idf_content')
    df_train = idf_content_model.transform(df_train)
    df_test = idf_content_model.transform(df_test)

# headline
idf_headline = IDF(inputCol='tf_headline', outputCol='tf_idf_headline')
idf_headline_model = idf_headline.fit(df_train)
idf_headline_model.write().overwrite().save(r'../model/idf_headline')
df_train = idf_headline_model.transform(df_train)
df_test = idf_headline_model.transform(df_test)

# The raw term-frequency vectors were only an intermediate step; drop them.
tf_columns = [f'{i}tf_content' for i in range(1, 6)] + ['tf_headline']
df_train = df_train.drop(*tf_columns)
df_test = df_test.drop(*tf_columns)

tf_idf_columns = [f'{i}tf_idf_content' for i in range(1, 6)] + ['tf_idf_headline']
df_train.select(*tf_idf_columns, 'category').show()

                                                                                

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|            category|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,2,3,4,5,6,...|(20,[0,2,5,14],[1...|       drug offences|
|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,4,5,6,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[3,9,11,16,18...|       drug offences|
|(20,[0,1,2,3,5,6,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,10,11],[1....|       drug offences|
|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|(20,[0,1,2,3,4,5,...|

#### 3.3.4.4 Word2Vec

In [7]:
# Fit one Word2Vec model per n-gram order (n = 1..5) on the training split and
# apply it to both splits. Each output column holds one vector of size 3 per row.
for i in range(1, 6):
    word2vec = Word2Vec(vectorSize = 3, minCount = 0, inputCol = f'{i}gram_content', outputCol = f'{i}gram_word2vec_content')
    word2vec_model = word2vec.fit(df_train)
    # write().overwrite() instead of save(): plain save() raises when the target
    # path already exists, which made this cell fail on every re-run.
    word2vec_model.write().overwrite().save(rf'../model/{i}_gram_word2vec_content')
    df_train = word2vec_model.transform(df_train)
    df_test = word2vec_model.transform(df_test)

word2vec_columns = [f'{i}gram_word2vec_content' for i in range(1, 6)]
df_train.select(*word2vec_columns, 'category').show()

24/09/01 19:28:31 WARN TaskSetManager: Stage 73 contains a task of very large size (1039 KiB). The maximum recommended task size is 1000 KiB.
24/09/01 19:28:37 WARN TaskSetManager: Stage 82 contains a task of very large size (1156 KiB). The maximum recommended task size is 1000 KiB.


+----------------------+----------------------+----------------------+----------------------+----------------------+--------------------+
|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|            category|
+----------------------+----------------------+----------------------+----------------------+----------------------+--------------------+
|  [0.11861965946596...|  [-0.1109606240237...|  [0.02741811161806...|  [0.03796430583488...|  [-0.0104894171179...|       drug offences|
|  [0.05214155997228...|  [-0.0183636057247...|  [0.04561822884500...|  [0.03104129807761...|  [0.01331566021589...|       drug offences|
|  [0.09970149492224...|  [-0.1521963680861...|  [0.02622188978560...|  [0.01141541506998...|  [-0.0096559518026...|       drug offences|
|  [0.21609623593042...|  [-0.1518462432444...|  [0.03763698163879...|  [0.01178657563146...|  [-0.0077459959623...|       drug offences|
|  [0.18276758440991...|  [-0.0309

### 3.3.5 Convert Category to Index Representation

In [8]:
# Map each category label to a numeric index. The mapping is learned from the
# training split only and reused for the test split so both share one encoding.
indexer = StringIndexer(inputCol = 'category', outputCol = 'category_index')
index_model = indexer.fit(df_train)

# write().overwrite() instead of save(): plain save() raises when the target
# path already exists, which made this cell fail on every re-run.
index_model.write().overwrite().save(r'../model/category_indexer')

df_train = index_model.transform(df_train)
df_test = index_model.transform(df_test)

df_train.select('key', 'category', 'category_index').show()

+----+--------------------+--------------+
| key|            category|category_index|
+----+--------------------+--------------+
|k317|       drug offences|           0.0|
|k312|       drug offences|           0.0|
|k309|       drug offences|           0.0|
|k300|       drug offences|           0.0|
|k291|       drug offences|           0.0|
|k280|       drug offences|           0.0|
|k265|       drug offences|           0.0|
|k262|       drug offences|           0.0|
|k259|       drug offences|           0.0|
|k213|       drug offences|           0.0|
|k199|       drug offences|           0.0|
|k189|       drug offences|           0.0|
|k171|       drug offences|           0.0|
|k164|       drug offences|           0.0|
| k60|       drug offences|           0.0|
| k58|       drug offences|           0.0|
| k50|       drug offences|           0.0|
| k32|       drug offences|           0.0|
|  k5|       drug offences|           0.0|
|k249|firearm or terrorism|           7.0|
+----+-----

### 3.3.6 Store Feature Engineered Dataset to Redis and MongoDB

In [9]:
# Every ML vector column (TF-IDF and Word2Vec for n = 1..5, plus the headline
# TF-IDF) is passed through the project's VectorArrayConverter.vector_to_array
# and written back under the same column name.
# NOTE(review): the converter presumably yields plain float arrays (the cached
# documents shown later contain lists of floats) - confirm in VectorArrayConverter.
vector_columns = []
for order in range(1, 6):
    vector_columns += [f'{order}tf_idf_content', f'{order}gram_word2vec_content']
vector_columns.append('tf_idf_headline')

for column_name in vector_columns:
    df_train = df_train.withColumn(column_name, VectorArrayConverter.vector_to_array(df_train[column_name]))
    df_test = df_test.withColumn(column_name, VectorArrayConverter.vector_to_array(df_test[column_name]))

df_train.show()

                                                                                

+------+--------------------+--------------------+----+----------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+--------------+
|author|            category|            datetime| key|       publisher|       1gram_content|  tokenised_headline|                 url|content_token_count|       2gram_content|       3gram_content|       4gram_content|       5gram_content|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|category_index|
+------+

In [10]:
# Convert the training dataframe into a list of dictionaries for insertion into MongoDB.
# Columns are selected by NAME rather than by positional row[...] index, so the
# export stays correct even if an upstream cell adds, drops or reorders columns.
# The list order below is the key order of every resulting document.
train_document_fields = [
    'author', 'datetime', 'key', 'publisher', 'url', 'content_token_count',
    *[f'{i}tf_idf_content' for i in range(1, 6)],
    'tf_idf_headline',
    *[f'{i}gram_word2vec_content' for i in range(1, 6)],
    'category_index',
]

# Selecting first means only the exported columns are collected to the driver
# (the bulky n-gram token columns are left behind).
train_documents = [
    row.asDict()
    for row in df_train.select(*train_document_fields).collect()
]

                                                                                

In [11]:
# Convert the test dataframe into a list of dictionaries for insertion into MongoDB.
# Columns are selected by NAME rather than by positional row[...] index, so the
# export stays correct even if an upstream cell adds, drops or reorders columns.
# The list order below is the key order of every resulting document.
test_document_fields = [
    'author', 'datetime', 'key', 'publisher', 'url', 'content_token_count',
    *[f'{i}tf_idf_content' for i in range(1, 6)],
    'tf_idf_headline',
    *[f'{i}gram_word2vec_content' for i in range(1, 6)],
    'category_index',
]

# Selecting first means only the exported columns are collected to the driver
# (the bulky n-gram token columns are left behind).
test_documents = [
    row.asDict()
    for row in df_test.select(*test_document_fields).collect()
]

                                                                                

In [12]:
# Cache the engineered training documents in Redis with seconds = 30 * 60
# (presumably the key's time-to-live, i.e. 30 minutes - confirm in RedisClient).
train_payload = pickle.dumps(train_documents)
redis_client.set_key_value('feature_engineered_train_dataset', train_payload, seconds = 30 * 60)

True

In [13]:
# Cache the engineered test documents in Redis with seconds = 30 * 60
# (presumably the key's time-to-live, i.e. 30 minutes - confirm in RedisClient).
test_payload = pickle.dumps(test_documents)
redis_client.set_key_value('feature_engineered_test_dataset', test_payload, seconds = 30 * 60)

True

In [14]:
# Persist the engineered training documents to MongoDB.
# NOTE(review): if MongoDBClient.insert_many delegates to PyMongo, every dict in
# train_documents gains an '_id' field after this call - confirm before reusing the list.
train_target_db, train_target_collection = 'Feature_Engineered_Dataset', 'final_train_set'
mongodb_client.insert_many(train_target_db, train_target_collection, train_documents)

Documents successfully inserted: [ObjectId('66d4501c88268fd87a5d1939'), ObjectId('66d4501c88268fd87a5d193a'), ObjectId('66d4501c88268fd87a5d193b'), ObjectId('66d4501c88268fd87a5d193c'), ObjectId('66d4501c88268fd87a5d193d'), ObjectId('66d4501c88268fd87a5d193e'), ObjectId('66d4501c88268fd87a5d193f'), ObjectId('66d4501c88268fd87a5d1940'), ObjectId('66d4501c88268fd87a5d1941'), ObjectId('66d4501c88268fd87a5d1942'), ObjectId('66d4501c88268fd87a5d1943'), ObjectId('66d4501c88268fd87a5d1944'), ObjectId('66d4501c88268fd87a5d1945'), ObjectId('66d4501c88268fd87a5d1946'), ObjectId('66d4501c88268fd87a5d1947'), ObjectId('66d4501c88268fd87a5d1948'), ObjectId('66d4501c88268fd87a5d1949'), ObjectId('66d4501c88268fd87a5d194a'), ObjectId('66d4501c88268fd87a5d194b'), ObjectId('66d4501c88268fd87a5d194c'), ObjectId('66d4501c88268fd87a5d194d'), ObjectId('66d4501c88268fd87a5d194e'), ObjectId('66d4501c88268fd87a5d194f'), ObjectId('66d4501c88268fd87a5d1950'), ObjectId('66d4501c88268fd87a5d1951'), ObjectId('66d450

In [15]:
# Persist the engineered test documents to MongoDB.
# NOTE(review): if MongoDBClient.insert_many delegates to PyMongo, every dict in
# test_documents gains an '_id' field after this call - confirm before reusing the list.
test_target_db, test_target_collection = 'Feature_Engineered_Dataset', 'final_test_set'
mongodb_client.insert_many(test_target_db, test_target_collection, test_documents)

Documents successfully inserted: [ObjectId('66d4502788268fd87a5d2036'), ObjectId('66d4502788268fd87a5d2037'), ObjectId('66d4502788268fd87a5d2038'), ObjectId('66d4502788268fd87a5d2039'), ObjectId('66d4502788268fd87a5d203a'), ObjectId('66d4502788268fd87a5d203b'), ObjectId('66d4502788268fd87a5d203c'), ObjectId('66d4502788268fd87a5d203d'), ObjectId('66d4502788268fd87a5d203e'), ObjectId('66d4502788268fd87a5d203f'), ObjectId('66d4502788268fd87a5d2040'), ObjectId('66d4502788268fd87a5d2041'), ObjectId('66d4502788268fd87a5d2042'), ObjectId('66d4502788268fd87a5d2043'), ObjectId('66d4502788268fd87a5d2044'), ObjectId('66d4502788268fd87a5d2045'), ObjectId('66d4502788268fd87a5d2046'), ObjectId('66d4502788268fd87a5d2047'), ObjectId('66d4502788268fd87a5d2048'), ObjectId('66d4502788268fd87a5d2049'), ObjectId('66d4502788268fd87a5d204a'), ObjectId('66d4502788268fd87a5d204b'), ObjectId('66d4502788268fd87a5d204c'), ObjectId('66d4502788268fd87a5d204d'), ObjectId('66d4502788268fd87a5d204e'), ObjectId('66d450

In [16]:
# Sanity check: read the cached training documents back from Redis.
# Only the record count and the first record are shown; echoing the entire
# dataset floods the notebook output and bloats the saved file.
# pickle is acceptable here because the payload was written by this notebook.
cached_train_documents = pickle.loads(redis_client.get_value('feature_engineered_train_dataset'))
print(f'{len(cached_train_documents)} training documents cached')
cached_train_documents[:1]

[{'author': '',
  'datetime': '2024-01-17T17:35:08+08:00',
  'key': 'k317',
  'publisher': 'Selangor Journal',
  'url': 'https://selangorjournal.my/2024/01/mother-son-among-three-held-for-drug-trafficking/',
  'content_token_count': 87,
  '1tf_idf_content': [0.09475792199373245,
   0.30641424655914307,
   0.14563611149787903,
   0.06533516198396683,
   0.15498901903629303,
   0.4488479495048523,
   0.22276671230793,
   0.4865248501300812,
   0.17688356339931488,
   0.22442397475242615,
   0.0753980204463005,
   0.21984700858592987,
   0.41895991563796997,
   0.7418197989463806,
   0.9046169519424438,
   0.2763420641422272,
   0.09016839414834976,
   0.5922505855560303,
   0.16482193768024445,
   0.5042296051979065],
  '2tf_idf_content': [0.17688356339931488,
   0.07638319581747055,
   0.18288899958133698,
   0.17246127128601074,
   0.06580562144517899,
   0.22049209475517273,
   0.16006089746952057,
   0.10435155779123306,
   0.36577799916267395,
   0.5545274615287781,
   0.28236126899

In [17]:
# Sanity check: read the cached test documents back from Redis.
# Only the record count and the first record are shown; echoing the entire
# dataset floods the notebook output and bloats the saved file.
# pickle is acceptable here because the payload was written by this notebook.
cached_test_documents = pickle.loads(redis_client.get_value('feature_engineered_test_dataset'))
print(f'{len(cached_test_documents)} test documents cached')
cached_test_documents[:1]

[{'author': '',
  'datetime': '2024-01-19T17:40:34+08:00',
  'key': 'k306',
  'publisher': 'Selangor Journal',
  'url': 'https://selangorjournal.my/2024/01/police-cripple-syndicate-in-latest-trend-of-drug-laced-juice-trafficking/',
  'content_token_count': 81,
  '1tf_idf_content': [0.04373442381620407,
   0.4596213698387146,
   0.07281805574893951,
   0.06533516198396683,
   0.15498901903629303,
   0.11221198737621307,
   0.13366003334522247,
   1.094680905342102,
   0.029480593279004097,
   0.4488479495048523,
   0.301592081785202,
   0.08793880045413971,
   0.41895991563796997,
   0.3709098994731903,
   1.1630789041519165,
   0.2763420641422272,
   0.18033678829669952,
   0.5922505855560303,
   0.49446582794189453,
   0.3781721889972687],
  '2tf_idf_content': [0.058961186558008194,
   0.17822745442390442,
   0.18288899958133698,
   0.17246127128601074,
   0.19741685688495636,
   0.2756151258945465,
   0.16006089746952057,
   0.26087889075279236,
   0.36577799916267395,
   0.302469551

In [18]:
# Stop the Redis service and the Spark session.
# spark.stop() sits in a finally block so the Spark session is released even if
# stopping Redis raises (the recorded output shows stop_service prompting for a
# sudo password, which can fail).
try:
    redis_client.stop_service()
finally:
    spark.stop()

[sudo] password for student: 