# 3.0 Data Annotation
## 3.2 Train-Test Split

###### Author: Gan Yee Jing, Yeap Jie Shen
###### Last Edited: 01/09/2024

### 3.2.1 Importing Libraries 

In [1]:
from pyspark.sql import SparkSession

import sys
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.mongodbClient import MongoDBClient
from data_stores.redisClient import RedisClient

import pickle

### 3.2.2 Initialising Spark Session, MongoDB Client and Redis Client

In [2]:
# Build (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName('train test split')
    .getOrCreate()
)

# Client used below to read the annotated data from, and write the splits to, MongoDB
mongodb_client = MongoDBClient()

# Client for the local Redis instance used below as a cache
redis_client = RedisClient(
    host = 'localhost',
    port = 6379,
    db = 0,
    start_now = True,
)

24/09/02 19:30:42 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/02 19:30:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 19:30:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Pinged your deployment. You successfully connected to MongoDB!


[sudo] password for student: 

### 3.2.3 Fetching Data from Redis Cache or MongoDB

In [3]:
# Load the annotated dataset: read it from Redis if it is cached, otherwise read it
# from MongoDB (with the _id column excluded) and cache the result for five minutes.
ANNOTATED_CACHE_KEY = 'annotated_dataset'
ANNOTATED_CACHE_TTL_SECONDS = 5 * 60

# Fetch the cached payload once and test the fetched value itself, so a key that
# expires between exists_key() and get_value() falls through to MongoDB instead of
# reaching pickle.loads.
# NOTE(review): presumably get_value returns None for a missing key — confirm
# against RedisClient.
cached_payload = (
    redis_client.get_value(ANNOTATED_CACHE_KEY)
    if redis_client.exists_key(ANNOTATED_CACHE_KEY)
    else None
)

if cached_payload is not None:
    # NOTE(security): pickle.loads can execute arbitrary code embedded in the payload;
    # only use it against a Redis instance whose contents are trusted.
    data_list = pickle.loads(cached_payload)
else:
    data_list = list(mongodb_client.read_many('Annotated_Dataset', 'annotated_data', {'_id': 0}))
    # Never cache an empty result, otherwise later runs keep reading the empty cache
    if data_list:
        redis_client.set_key_value(
            ANNOTATED_CACHE_KEY,
            pickle.dumps(data_list),
            seconds = ANNOTATED_CACHE_TTL_SECONDS,
        )

# Fail early with a clear message: Spark cannot infer a schema from an empty list
if not data_list:
    raise ValueError('No annotated documents were found in Redis or MongoDB; nothing to split.')

df = spark.createDataFrame(data_list)
df.show()

                                                                                

+------+--------------------+--------------------+---+----------------+--------------------+--------------------+--------------------+
|author|            category|            datetime|key|       publisher|   tokenised_content|  tokenised_headline|                 url|
+------+--------------------+--------------------+---+----------------+--------------------+--------------------+--------------------+
|      | murder and homicide|2024-07-19T20:15:...| k0|Selangor Journal|[port, five, peop...|[five, causing, d...|https://selangorj...|
|      |    money laundering|2024-07-19T17:30:...| k1|Selangor Journal|[session, court, ...|[businessman, che...|https://selangorj...|
|      |    money laundering|2024-07-18T21:12:...| k3|Selangor Journal|[former, chief, e...|[year, jail, fine...|https://selangorj...|
|      | murder and homicide|2024-07-18T19:58:...| k4|Selangor Journal|[pekan, body, lat...|[remains, laid, r...|https://selangorj...|
|      |       drug offences|2024-07-18T19:45:...| k5|S

### 3.2.4 Splitting Dataset into Training Set and Testing Set

In [4]:
# Randomly partition the rows with 70/30 weights; a fixed seed is passed to randomSplit
split_weights = [0.7, 0.3]
df_train, df_test = df.randomSplit(weights = split_weights, seed = 100)

# Report the size of each partition
train_row_count = df_train.count()
print(f'Number of training rows: {train_row_count}')
test_row_count = df_test.count()
print(f'Number of testing rows: {test_row_count}')

Number of training rows: 1789
Number of testing rows: 802


In [5]:
# Preview the first rows of the training set
df_train.show()

+------+--------------------+--------------------+----+----------------+--------------------+--------------------+--------------------+
|author|            category|            datetime| key|       publisher|   tokenised_content|  tokenised_headline|                 url|
+------+--------------------+--------------------+----+----------------+--------------------+--------------------+--------------------+
|      |       drug offences|2024-01-17T17:35:...|k317|Selangor Journal|[mother, among, t...|[mother, among, t...|https://selangorj...|
|      |       drug offences|2024-01-17T22:10:...|k312|Selangor Journal|[royal, custom, d...|[custom, foil, at...|https://selangorj...|
|      |       drug offences|2024-01-18T22:33:...|k309|Selangor Journal|[police, drug, ap...|[suspected, drug,...|https://selangorj...|
|      |       drug offences|2024-01-21T21:18:...|k300|Selangor Journal|[foreign, suspici...|[foreign, armed, ...|https://selangorj...|
|      |       drug offences|2024-01-23T18:18:..

In [6]:
# Check the sample size of each category in the training set
train_category_counts = df_train.groupBy('category').count()
train_category_counts.show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|       drug offences|  389|
|                scam|  268|
|              others|  119|
|sexual harassment...|   90|
| murder and homicide|  346|
|       physical hurt|  193|
|firearm or terrorism|   97|
|    money laundering|  119|
|    robbery or theft|  168|
+--------------------+-----+



In [7]:
# Check the sample size of each category in the testing set
test_category_counts = df_test.groupBy('category').count()
test_category_counts.show()

+--------------------+-----+
|            category|count|
+--------------------+-----+
|       drug offences|  177|
|                scam|  112|
|              others|   55|
|sexual harassment...|   37|
| murder and homicide|  154|
|       physical hurt|  120|
|firearm or terrorism|   31|
|    money laundering|   55|
|    robbery or theft|   61|
+--------------------+-----+



### 3.2.5 Storing Training Set and Testing Set to Redis and MongoDB

In [6]:
# Train set
# Convert the dataframe into a list of dictionaries ready to be inserted into MongoDB.
# Columns are selected by name rather than by position, so each value stays under the
# right key even if the inferred schema gains or reorders columns. The tuple also
# fixes the key order of every document.
train_document_fields = (
    'key',
    'category',
    'author',
    'url',
    'datetime',
    'publisher',
    'tokenised_content',
    'tokenised_headline',
)

training_documents = [
    row.asDict()
    for row in df_train.select(*train_document_fields).collect()
]

                                                                                

In [7]:
# Store the pickled training documents in Redis under the key 'train_dataset'
# (seconds = 30 * 60 is presumably the key's expiry, i.e. 30 minutes — confirm against RedisClient)
redis_client.set_key_value('train_dataset', pickle.dumps(training_documents), seconds = 30 * 60)

True

In [8]:
# Sanity-check the Redis round trip by reading the cached training set back.
# Only the record count and the first record are displayed, so the notebook output
# is not flooded with every cached document.
# NOTE(security): pickle.loads can execute arbitrary code embedded in the payload;
# only use it against a Redis instance whose contents are trusted.
cached_training_documents = pickle.loads(redis_client.get_value('train_dataset'))
print(f'Cached training documents: {len(cached_training_documents)}')
cached_training_documents[:1]

[{'key': 'k317',
  'category': 'drug offences',
  'author': '',
  'url': 'https://selangorjournal.my/2024/01/mother-son-among-three-held-for-drug-trafficking/',
  'datetime': '2024-01-17T17:35:08+08:00',
  'publisher': 'Selangor Journal',
  'tokenised_content': ['mother',
   'among',
   'three',
   'involved',
   'drug',
   'syndicate',
   'seizure',
   'worth',
   'million',
   'three',
   'district',
   'police',
   'chief',
   'zaman',
   'said',
   'woman',
   'well',
   'male',
   'friend',
   'criminal',
   'investigation',
   'department',
   'special',
   'said',
   'three',
   'involved',
   'drug',
   'syndicate',
   'local',
   'market',
   'since',
   'last',
   'year',
   'syndicate',
   'apartment',
   'rented',
   'drug',
   'storage',
   'room',
   'local',
   'market',
   'syndicate',
   'also',
   'drug',
   'said',
   'press',
   'conference',
   'police',
   'contingent',
   'headquarters',
   'today',
   'said',
   'worth',
   'million',
   'included',
   'ecstasy'

In [9]:
# Insert the training documents into MongoDB ('Split_Dataset' / 'train_set',
# presumably database / collection — confirm against MongoDBClient.insert_many).
# NOTE(review): re-running this cell presumably inserts the same documents again;
# check how MongoDBClient handles repeated inserts before re-executing.
mongodb_client.insert_many('Split_Dataset', 'train_set', training_documents)

Documents successfully inserted: [ObjectId('66d43b1dba3e91b376e12c2f'), ObjectId('66d43b1dba3e91b376e12c30'), ObjectId('66d43b1dba3e91b376e12c31'), ObjectId('66d43b1dba3e91b376e12c32'), ObjectId('66d43b1dba3e91b376e12c33'), ObjectId('66d43b1dba3e91b376e12c34'), ObjectId('66d43b1dba3e91b376e12c35'), ObjectId('66d43b1dba3e91b376e12c36'), ObjectId('66d43b1dba3e91b376e12c37'), ObjectId('66d43b1dba3e91b376e12c38'), ObjectId('66d43b1dba3e91b376e12c39'), ObjectId('66d43b1dba3e91b376e12c3a'), ObjectId('66d43b1dba3e91b376e12c3b'), ObjectId('66d43b1dba3e91b376e12c3c'), ObjectId('66d43b1dba3e91b376e12c3d'), ObjectId('66d43b1dba3e91b376e12c3e'), ObjectId('66d43b1dba3e91b376e12c3f'), ObjectId('66d43b1dba3e91b376e12c40'), ObjectId('66d43b1dba3e91b376e12c41'), ObjectId('66d43b1dba3e91b376e12c42'), ObjectId('66d43b1dba3e91b376e12c43'), ObjectId('66d43b1dba3e91b376e12c44'), ObjectId('66d43b1dba3e91b376e12c45'), ObjectId('66d43b1dba3e91b376e12c46'), ObjectId('66d43b1dba3e91b376e12c47'), ObjectId('66d43b

In [10]:
# Test set
# Convert the dataframe into a list of dictionaries ready to be inserted into MongoDB.
# Columns are selected by name rather than by position, so each value stays under the
# right key even if the inferred schema gains or reorders columns. The tuple also
# fixes the key order of every document.
test_document_fields = (
    'key',
    'category',
    'author',
    'url',
    'datetime',
    'publisher',
    'tokenised_content',
    'tokenised_headline',
)

testing_documents = [
    row.asDict()
    for row in df_test.select(*test_document_fields).collect()
]

In [11]:
# Store the pickled testing documents in Redis under the key 'test_dataset'
# (seconds = 30 * 60 is presumably the key's expiry, i.e. 30 minutes — confirm against RedisClient)
redis_client.set_key_value('test_dataset', pickle.dumps(testing_documents), seconds = 30 * 60)

True

In [12]:
# Sanity-check the Redis round trip by reading the cached testing set back.
# Only the record count and the first record are displayed, so the notebook output
# is not flooded with every cached document.
# NOTE(security): pickle.loads can execute arbitrary code embedded in the payload;
# only use it against a Redis instance whose contents are trusted.
cached_testing_documents = pickle.loads(redis_client.get_value('test_dataset'))
print(f'Cached testing documents: {len(cached_testing_documents)}')
cached_testing_documents[:1]

[{'key': 'k306',
  'category': 'drug offences',
  'author': '',
  'url': 'https://selangorjournal.my/2024/01/police-cripple-syndicate-in-latest-trend-of-drug-laced-juice-trafficking/',
  'datetime': '2024-01-19T17:40:34+08:00',
  'publisher': 'Selangor Journal',
  'tokenised_content': ['police',
   'syndicate',
   'latest',
   'trend',
   'lacing',
   'juice',
   'several',
   'across',
   'valley',
   'netted',
   'mastermind',
   'investigation',
   'department',
   'director',
   'said',
   'syndicate',
   'type',
   'juice',
   'various',
   'beverage',
   'valley',
   'entertainment',
   'revealed',
   'syndicate',
   'operational',
   'since',
   'luxury',
   'headquarters',
   'laced',
   'juice',
   'entertainment',
   'private',
   'discovered',
   'mixed',
   'various',
   'sold',
   'bottle',
   'depending',
   'size',
   'drug',
   'content',
   'syndicate',
   'member',
   'received',
   'month',
   'wage',
   'making',
   'lucrative',
   'profit',
   'said',
   'press',
 

In [13]:
# Insert the testing documents into MongoDB ('Split_Dataset' / 'test_set',
# presumably database / collection — confirm against MongoDBClient.insert_many).
# NOTE(review): re-running this cell presumably inserts the same documents again;
# check how MongoDBClient handles repeated inserts before re-executing.
mongodb_client.insert_many('Split_Dataset', 'test_set', testing_documents)

Documents successfully inserted: [ObjectId('66d43b31ba3e91b376e1332c'), ObjectId('66d43b31ba3e91b376e1332d'), ObjectId('66d43b31ba3e91b376e1332e'), ObjectId('66d43b31ba3e91b376e1332f'), ObjectId('66d43b31ba3e91b376e13330'), ObjectId('66d43b31ba3e91b376e13331'), ObjectId('66d43b31ba3e91b376e13332'), ObjectId('66d43b31ba3e91b376e13333'), ObjectId('66d43b31ba3e91b376e13334'), ObjectId('66d43b31ba3e91b376e13335'), ObjectId('66d43b31ba3e91b376e13336'), ObjectId('66d43b31ba3e91b376e13337'), ObjectId('66d43b31ba3e91b376e13338'), ObjectId('66d43b31ba3e91b376e13339'), ObjectId('66d43b31ba3e91b376e1333a'), ObjectId('66d43b31ba3e91b376e1333b'), ObjectId('66d43b31ba3e91b376e1333c'), ObjectId('66d43b31ba3e91b376e1333d'), ObjectId('66d43b31ba3e91b376e1333e'), ObjectId('66d43b31ba3e91b376e1333f'), ObjectId('66d43b31ba3e91b376e13340'), ObjectId('66d43b31ba3e91b376e13341'), ObjectId('66d43b31ba3e91b376e13342'), ObjectId('66d43b31ba3e91b376e13343'), ObjectId('66d43b31ba3e91b376e13344'), ObjectId('66d43b

In [14]:
# Stop the Redis service first, then stop the Spark session
redis_client.stop_service()
spark.stop()

[sudo] password for student: 