# 3.0 Data Annotation 
## 3.1 Handling Annotated Dataset

###### Author: Gan Yee Jing, Yeap Jie Shen
###### Last Edited: 01/09/2024

### 3.1.1 Importing Libraries 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, col, when, regexp_replace, trim

import sys
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.hdfsClient import HdfsClient
from data_stores.mongodbClient import MongoDBClient
from data_stores.redisClient import RedisClient

import pickle

### 3.1.2 Initialising Spark Session and HDFS Client

In [2]:
# create spark session (getOrCreate returns the already-running session if there is one)
spark = SparkSession.builder.appName('handle annotated dataset').getOrCreate()

# instantiate HDFS client, handing it the Spark session created above
hdfs_client = HdfsClient(spark)

24/09/01 17:38:37 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/01 17:38:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 17:38:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 3.1.3 Retrieving Annotated Dataset from HDFS

In [3]:
# Load the annotated CSV from HDFS through the project's HDFS client.
# header = True / multiline = False: presumably the first row holds the column
# names and every record sits on a single line -- confirm in HdfsClient.read_file.
df = hdfs_client.read_file(
    file_format = 'csv',
    source_path = 'annotated_dataset.csv',
    header = True,
    multiline = False,
)

# Preview the first rows of the annotated dataset
df.show()

+---+--------------------+--------------------+--------------------+
|key|      cleaned_tokens|               label|              label2|
+---+--------------------+--------------------+--------------------+
| k0|port,five,people,...| Murder and Homicide| Murder and Homicide|
| k1|session,court,bus...|    Money Laundering|    Money Laundering|
| k3|former,chief,exec...|    Money Laundering|    Money Laundering|
| k4|pekan,body,late,b...| Murder and Homicide| Murder and Homicide|
| k5|police,uncovered,...|       Drug Offences|       Drug Offences|
| k6|court,fixed,augus...|              Others|              Others|
| k8|former,security,g...| Murder and Homicide| Murder and Homicide|
| k9|federal,court,uph...| Murder and Homicide| Murder and Homicide|
|k10|court,appeal,redu...|Sexual Harassment...|Sexual Harassment...|
|k11|investment,consul...|    Money Laundering|    Money Laundering|
|k12|police,eight,fore...| Murder and Homicide| Murder and Homicide|
|k14|eighteen,foreign,...|        

### 3.1.4 Resolving Inconsistency in Annotated Dataset
#### 3.1.4.1 Fixing Incorrect Label Values

In [4]:
# To check for any inconsistencies:
# stack both annotation columns into a single 'label' column and count
# how often each distinct spelling occurs
df_temp = (
    df.select(col('label'))
    .union(df.select(col('label2').alias('label')))
    .groupBy('label')
    .count()
)

# show every distinct label (not only the default first 20 rows), untruncated
n_distinct_labels = df_temp.count()
df_temp.show(n_distinct_labels, truncate = False)

+---------------------------------------------+-----+
|label                                        |count|
+---------------------------------------------+-----+
|religious Offences                           |4    |
|Scam                                         |760  |
|Sexual Harassment or Sexual Offences or Rape |212  |
|not applicable                               |7    |
|Drug Offences                                |1269 |
|Embezzlement                                 |45   |
|NULL                                         |9    |
|Cyber Crime                                  |137  |
|Cybar Crime                                  |2    |
|Religious Offences                           |68   |
|Physical Hurt                                |886  |
|Smuggling                                    |72   |
|sexual Harassment or Sexual Offences or Rape |9    |
|Smuggling                                    |138  |
|Money Laundering                             |343  |
|Robbery or Theft           

__Rename the following labels__:
1) 'Cybar Crime' to 'Cyber Crime'
2) 'Illegal Fishing' to 'Others'
3) '-' and 'NULL' to 'Not Applicable'
4) 'Terrorism or Firearm' to 'Firearm or Terrorism'
5) 'Dangerous Drugs' to 'Drug Offences'
6) 'Sexual Harrasment or Sexual Offences or Rape' and 'Sexual Harrassment or Sexual Offences or Rape' to 'Sexual Harassment or Sexual Offences or Rape'
7) 'Moner Laundering' to 'Money Laundering'
8) 'Robbery of Theft' to 'Robbery or Theft'
9) 'Smuglging' to 'Smuggling'
10) 'Firearm or Terrorsim' to 'Firearm or Terrorism'

In [5]:
# Known misspellings / synonyms (already lower-cased) -> canonical label.
# Keeping them in a single table guarantees that both annotation columns
# receive exactly the same corrections.
LABEL_CORRECTIONS = {
    'cybar crime': 'cyber crime',
    'illegal fishing': 'others',
    '-': 'not applicable',
    'terrorism or firearm': 'firearm or terrorism',
    'dangerous drugs': 'drug offences',
    'sexual harrasment or sexual offences or rape': 'sexual harassment or sexual offences or rape',
    'sexual harrassment or sexual offences or rape': 'sexual harassment or sexual offences or rape',
    'moner laundering': 'money laundering',
    'robbery of theft': 'robbery or theft',
    'smuglging': 'smuggling',
    'firearm or terrorsim': 'firearm or terrorism',
}


def normalise_label(column_name):
    """Build a Column holding the cleaned, canonical form of a label column.

    Cleaning steps: collapse every whitespace run into a single space,
    lower-case, strip leading/trailing spaces, then map missing values to
    'not applicable' and known misspellings to their canonical label
    (see LABEL_CORRECTIONS). Any other value is kept as cleaned.

    Parameters:
        column_name: name of the label column to normalise.

    Returns:
        A Spark Column expression aliased back to `column_name`.
    """
    # collapse whitespace runs, lower-case and remove leading / trailing spaces
    cleaned = trim(lower(regexp_replace(col(column_name), r'\s+', ' ')))

    # a missing annotation is treated as 'not applicable'
    corrected = when(cleaned.isNull(), 'not applicable')

    # the conditions are mutually exclusive, so their order does not matter
    for wrong, right in LABEL_CORRECTIONS.items():
        corrected = corrected.when(cleaned == wrong, right)

    return corrected.otherwise(cleaned).alias(column_name)


# apply the same normalisation to both annotation columns
df_lc = df.select('key', normalise_label('label'), normalise_label('label2'))

# To check for any inconsistencies
df_temp = (
    df_lc.select('label').union(
        df_lc.select('label2').withColumnRenamed('label2', 'label')
    ).groupby('label')
    .count()
)
df_temp.show(df_temp.count(), truncate = False)

+--------------------------------------------+-----+
|label                                       |count|
+--------------------------------------------+-----+
|drug offences                               |1270 |
|not applicable                              |1935 |
|cyber crime                                 |140  |
|scam                                        |760  |
|smuggling                                   |211  |
|others                                      |854  |
|sexual harassment or sexual offences or rape|317  |
|murder and homicide                         |1206 |
|physical hurt                               |887  |
|firearm or terrorism                        |361  |
|money laundering                            |398  |
|religious offences                          |72   |
|embezzlement                                |45   |
|robbery or theft                            |570  |
+--------------------------------------------+-----+



#### 3.1.4.2 Indicating Mismatch Labels

In [6]:
# Flag the rows on which the two annotations agree
df_lc = df_lc.withColumn('equal', col('label') == col('label2'))

# Derive the category column: the agreed label when both annotations match,
# 'not applicable' when they differ
df_category = df_lc.withColumn(
    'category',
    when(col('equal'), col('label'))
    .when(~col('equal'), 'not applicable')
)

df_category.show()

+---+--------------------+--------------------+-----+--------------------+
|key|               label|              label2|equal|            category|
+---+--------------------+--------------------+-----+--------------------+
| k0| murder and homicide| murder and homicide| true| murder and homicide|
| k1|    money laundering|    money laundering| true|    money laundering|
| k3|    money laundering|    money laundering| true|    money laundering|
| k4| murder and homicide| murder and homicide| true| murder and homicide|
| k5|       drug offences|       drug offences| true|       drug offences|
| k6|              others|              others| true|              others|
| k8| murder and homicide| murder and homicide| true| murder and homicide|
| k9| murder and homicide| murder and homicide| true| murder and homicide|
|k10|sexual harassment...|sexual harassment...| true|sexual harassment...|
|k11|    money laundering|    money laundering| true|    money laundering|
|k12| murder and homicide

### 3.1.5 Retrieving Cleaned Dataset from MongoDB

In [7]:
# Instantiate the project's MongoDB client; judging by the cell output, the
# constructor pings the deployment and reports whether the connection succeeded.
mongodb_client = MongoDBClient()

Pinged your deployment. You successfully connected to MongoDB!


In [8]:
# Read the cleaned dataset from MongoDB and materialise the returned cursor
# into a list of dictionaries. The {'_id': False} argument keeps MongoDB's
# generated id out of the result (no '_id' column appears in the output below).
data_list = list(
    mongodb_client.read_many('Cleaned_Dataset', 'cleaned_data', {'_id': False})
)

# Build a Spark dataframe from the list of dictionaries
df_cleaned = spark.createDataFrame(data_list)

df_cleaned.show()

+------+--------------------+---+----------------+--------------------+--------------------+--------------------+
|author|            datetime|key|       publisher|   tokenised_content|  tokenised_headline|                 url|
+------+--------------------+---+----------------+--------------------+--------------------+--------------------+
|      |2024-07-19T20:15:...| k0|Selangor Journal|[port, five, peop...|[five, causing, d...|https://selangorj...|
|      |2024-07-19T17:30:...| k1|Selangor Journal|[session, court, ...|[businessman, che...|https://selangorj...|
|      |2024-07-18T21:12:...| k3|Selangor Journal|[former, chief, e...|[year, jail, fine...|https://selangorj...|
|      |2024-07-18T19:58:...| k4|Selangor Journal|[pekan, body, lat...|[remains, laid, r...|https://selangorj...|
|      |2024-07-18T19:45:...| k5|Selangor Journal|[police, uncovere...|[police, seize, c...|https://selangorj...|
|      |2024-07-18T17:09:...| k6|Selangor Journal|[court, fixed, au...| [guilty, therapi

### 3.1.6 Joining Annotated Column to Cleaned Dataset

In [9]:
# Columns carried over from the cleaned dataset, in output order
cleaned_columns = [
    df_cleaned[name]
    for name in (
        'key',
        'tokenised_content',
        'tokenised_headline',
        'author',
        'datetime',
        'publisher',
        'url',
    )
]

# Inner join on the article key: keeps only articles that have an annotation,
# then attaches the resolved category to the cleaned columns
df_annotated = (
    df_cleaned
    .join(df_category, on = df_cleaned['key'] == df_category['key'], how = 'inner')
    .select(*cleaned_columns, df_category['category'])
)
df_annotated.show()

+---+--------------------+--------------------+------+--------------------+----------------+--------------------+--------------------+
|key|   tokenised_content|  tokenised_headline|author|            datetime|       publisher|                 url|            category|
+---+--------------------+--------------------+------+--------------------+----------------+--------------------+--------------------+
| k0|[port, five, peop...|[five, causing, d...|      |2024-07-19T20:15:...|Selangor Journal|https://selangorj...| murder and homicide|
| k1|[session, court, ...|[businessman, che...|      |2024-07-19T17:30:...|Selangor Journal|https://selangorj...|    money laundering|
| k3|[former, chief, e...|[year, jail, fine...|      |2024-07-18T21:12:...|Selangor Journal|https://selangorj...|    money laundering|
| k4|[pekan, body, lat...|[remains, laid, r...|      |2024-07-18T19:58:...|Selangor Journal|https://selangorj...| murder and homicide|
| k5|[police, uncovere...|[police, seize, c...|      |2

### 3.1.7 Dropping Not Applicable Data

In [10]:
# Keep only the rows whose category is an actual crime category,
# i.e. drop everything marked 'not applicable'
df_annotated = df_annotated.where(col('category') != 'not applicable')
df_annotated.show()

+---+--------------------+--------------------+------+--------------------+----------------+--------------------+--------------------+
|key|   tokenised_content|  tokenised_headline|author|            datetime|       publisher|                 url|            category|
+---+--------------------+--------------------+------+--------------------+----------------+--------------------+--------------------+
| k0|[port, five, peop...|[five, causing, d...|      |2024-07-19T20:15:...|Selangor Journal|https://selangorj...| murder and homicide|
| k1|[session, court, ...|[businessman, che...|      |2024-07-19T17:30:...|Selangor Journal|https://selangorj...|    money laundering|
| k3|[former, chief, e...|[year, jail, fine...|      |2024-07-18T21:12:...|Selangor Journal|https://selangorj...|    money laundering|
| k4|[pekan, body, lat...|[remains, laid, r...|      |2024-07-18T19:58:...|Selangor Journal|https://selangorj...| murder and homicide|
| k5|[police, uncovere...|[police, seize, c...|      |2

In [11]:
# Check how many rows remain after dropping the 'not applicable' rows
df_annotated.count()

2591

In [12]:
# Check the class distribution: number of rows for each category
# (truncate = False so the long category names are printed in full)
df_annotated.groupBy('category').count().show(truncate = False)

+--------------------------------------------+-----+
|category                                    |count|
+--------------------------------------------+-----+
|drug offences                               |566  |
|cyber crime                                 |20   |
|scam                                        |360  |
|others                                      |154  |
|sexual harassment or sexual offences or rape|127  |
|murder and homicide                         |500  |
|physical hurt                               |313  |
|firearm or terrorism                        |128  |
|money laundering                            |114  |
|religious offences                          |20   |
|robbery or theft                            |229  |
|smuggling                                   |48   |
|embezzlement                                |12   |
+--------------------------------------------+-----+



__Findings__: Some categories have very small sample sizes, so we merge them into closely related, larger categories:
1) 'embezzlement' and 'smuggling' are merged into 'money laundering'
2) 'religious offences' is merged into 'others'
3) 'cyber crime' is merged into 'scam'

### 3.1.8 Merging Categories

In [13]:
# Fold the under-represented categories into a related, larger category;
# every other category is left unchanged
merged_category = (
    when(col('category').isin('embezzlement', 'smuggling'), 'money laundering')
    .when(col('category') == 'religious offences', 'others')
    .when(col('category') == 'cyber crime', 'scam')
    .otherwise(col('category'))
)
df_merged = df_annotated.withColumn('category', merged_category)

# Class distribution after merging
df_merged.groupBy('category').count().show(truncate = False)

+--------------------------------------------+-----+
|category                                    |count|
+--------------------------------------------+-----+
|drug offences                               |566  |
|scam                                        |380  |
|others                                      |174  |
|sexual harassment or sexual offences or rape|127  |
|murder and homicide                         |500  |
|physical hurt                               |313  |
|firearm or terrorism                        |128  |
|money laundering                            |174  |
|robbery or theft                            |229  |
+--------------------------------------------+-----+



### 3.1.9 Storing Annotated Dataset to MongoDB and Redis (for caching)

In [14]:
# Instantiate the project's Redis client for the local instance (port 6379, database 0).
# start_now = True: judging by the sudo prompt in the cell output, this presumably
# starts the Redis service on construction -- confirm in RedisClient.
redis_client = RedisClient(host = 'localhost', port = 6379, db = 0, start_now = True)

[sudo] password for student: 

In [15]:
# Fields stored in every document, in output order
document_fields = [
    'key',
    'tokenised_content',
    'tokenised_headline',
    'author',
    'datetime',
    'publisher',
    'url',
    'category',
]

# Convert the dataframe into a list of dictionaries so it can be inserted into
# MongoDB (and cached in Redis) in one go. Values are looked up by column name
# rather than by position, so a change in the upstream column order can no
# longer silently store a value under the wrong key.
documents = [
    {field: row[field] for field in document_fields}
    for row in df_merged.select(*document_fields).collect()
]

                                                                                

In [16]:
# Cache the pickled list of documents in Redis under the key 'annotated_dataset'.
# seconds = 30 * 60 is presumably a 30-minute expiry -- confirm in RedisClient.set_key_value.
redis_client.set_key_value('annotated_dataset', pickle.dumps(documents), seconds = 30 * 60)

True

In [17]:
# Persist the annotated documents to MongoDB ('Annotated_Dataset' / 'annotated_data').
# NOTE(review): if insert_many wraps pymongo's insert_many, it adds an '_id' field to
# each dictionary in `documents` in place, and re-running this cell would then try to
# insert the same ids again -- confirm in MongoDBClient.insert_many.
mongodb_client.insert_many('Annotated_Dataset', 'annotated_data', documents)

Documents successfully inserted: [ObjectId('66d43635f047a97f234a6255'), ObjectId('66d43635f047a97f234a6256'), ObjectId('66d43635f047a97f234a6257'), ObjectId('66d43635f047a97f234a6258'), ObjectId('66d43635f047a97f234a6259'), ObjectId('66d43635f047a97f234a625a'), ObjectId('66d43635f047a97f234a625b'), ObjectId('66d43635f047a97f234a625c'), ObjectId('66d43635f047a97f234a625d'), ObjectId('66d43635f047a97f234a625e'), ObjectId('66d43635f047a97f234a625f'), ObjectId('66d43635f047a97f234a6260'), ObjectId('66d43635f047a97f234a6261'), ObjectId('66d43635f047a97f234a6262'), ObjectId('66d43635f047a97f234a6263'), ObjectId('66d43635f047a97f234a6264'), ObjectId('66d43635f047a97f234a6265'), ObjectId('66d43635f047a97f234a6266'), ObjectId('66d43635f047a97f234a6267'), ObjectId('66d43635f047a97f234a6268'), ObjectId('66d43635f047a97f234a6269'), ObjectId('66d43635f047a97f234a626a'), ObjectId('66d43635f047a97f234a626b'), ObjectId('66d43635f047a97f234a626c'), ObjectId('66d43635f047a97f234a626d'), ObjectId('66d436

In [18]:
# Read the cached bytes back from Redis and unpickle them to verify the round trip.
# NOTE(review): pickle.loads can execute arbitrary code from a crafted payload; this is
# only acceptable while the Redis instance is local and trusted.
# NOTE(review): presumably get_value returns None once the key has expired, in which
# case pickle.loads would raise -- confirm in RedisClient.get_value.
# NOTE(review): this displays the whole list of documents; consider showing a small slice.
pickle.loads(redis_client.get_value('annotated_dataset'))

[{'key': 'k0',
  'tokenised_content': ['port',
   'five',
   'people',
   'court',
   'today',
   'causing',
   'death',
   'friend',
   'swimming',
   'pool',
   'last',
   'week',
   'accused',
   'guilty',
   'charge',
   'read',
   'magistrate',
   'jointly',
   'accused',
   'causing',
   'death',
   'hotel',
   'swimming',
   'pool',
   'port',
   'charge',
   'section',
   'penal',
   'code',
   'read',
   'together',
   'section',
   'maximum',
   'jail',
   'term',
   'fine',
   'upon',
   'conviction',
   'deputy',
   'public',
   'prosecutor',
   'bail',
   'accused',
   'court',
   'impose',
   'additional',
   'condition',
   'disturb',
   'prosecution',
   'case',
   'resolved',
   'court',
   'accused',
   'bail',
   'surety',
   'lawyer',
   'court',
   'five',
   'accused',
   'guilty',
   'second',
   'charge',
   'allegedly',
   'port',
   'police',
   'headquarters',
   'court',
   'bail',
   'fixed',
   'mention'],
  'tokenised_headline': ['five',
   'causing',
   

In [19]:
# Stop the Spark session now that the annotated dataset has been stored
spark.stop()

In [20]:
# Stop the Redis service (prompts for the sudo password, as seen in the cell output).
# NOTE(review): stopping the service right after caching presumably makes the cached
# 'annotated_dataset' key unavailable to later notebooks -- confirm this is intended.
redis_client.stop_service()

[sudo] password for student: 