# Neo4j Graph Database

Author: Yeap Jie Shen

Last Edited: 02/09/2024

In [1]:
from pyspark.sql import SparkSession
import sys
sys.path.append(r'/home/student/RDS2S3G4_CLO2_B')

from data_stores.mongodbClient import MongoDBClient
from data_stores.redisClient import RedisClient
from data_stores.neo4jClient import Neo4j
from data_stores.vectorArrayConverter import VectorArrayConverter

from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf, col
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.sql.types import FloatType

import numpy as np
import pickle

## Establishing Connection to Data Stores

In [2]:
# Spark session for this notebook (getOrCreate reuses an active session if one exists)
spark = (
    SparkSession.builder
    .appName('Neo4j AuraDB')
    .getOrCreate()
)

# MongoDB client
mongodb_client = MongoDBClient()

# Redis client on the local default port; start_now=True presumably also starts
# the Redis service (the captured output shows a sudo prompt) - TODO confirm
redis_client = RedisClient(host='localhost', port=6379, db=0, start_now=True)

# Neo4j client
neo4j_client = Neo4j()

24/09/02 19:44:16 WARN Utils: Your hostname, Gan. resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/09/02 19:44:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 19:44:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Pinged your deployment. You successfully connected to MongoDB!


[sudo] password for student: 

Connection established


## Getting Data from Data Stores

In [3]:
# Load the feature-engineered test set. Redis is tried first as a short-lived
# cache; on a miss the documents are read from MongoDB (the {'_id': 0} argument
# is passed so the id column is excluded) and cached for five minutes.
cache_key = 'feature_engineered_test_dataset'

# Fetch the payload once and tolerate a missing value: the key may expire
# between the existence check and the read.
cached_payload = redis_client.get_value(cache_key) if redis_client.exists_key(cache_key) else None

if cached_payload is not None:
    # NOTE(review): pickle.loads can execute arbitrary code from a tampered
    # payload; only acceptable while this Redis instance is private to the pipeline.
    test_list = pickle.loads(cached_payload)
else:
    test_list = list(mongodb_client.read_many('Feature_Engineered_Dataset', 'final_test_set', {'_id': 0}))
    # Never cache an empty result: it would hide data that arrives in MongoDB
    # during the next five minutes.
    if test_list:
        redis_client.set_key_value(cache_key, pickle.dumps(test_list), seconds = 5 * 60)

# spark.createDataFrame cannot infer a schema from an empty list, so fail early
# with a message that names the actual problem.
if not test_list:
    raise ValueError(
        "Test dataset is empty: no documents returned for "
        "'Feature_Engineered_Dataset' / 'final_test_set'"
    )

# Keep only the identifier, label, metadata and feature columns used downstream.
df_test = (
    spark.createDataFrame(test_list)
    .select(
        'key', 'category_index', 
        'author', 'url', 'datetime', 'publisher',
        '1tf_idf_content', '2tf_idf_content', '3tf_idf_content', '4tf_idf_content', '5tf_idf_content', 'tf_idf_headline',
        '1gram_word2vec_content', '2gram_word2vec_content', '3gram_word2vec_content', '4gram_word2vec_content', '5gram_word2vec_content',
        'content_token_count')
)

df_test.show()

                                                                                

+----+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------------+
| key|category_index|author|                 url|            datetime|       publisher|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|content_token_count|
+----+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+--------------------

In [4]:
# Convert every array-typed feature column into an ML vector column.
# Columns are visited in the order: n-gram tf-idf and word2vec pairs for
# n = 1..5, then the headline tf-idf column.
vector_columns = [
    f'{n}{suffix}'
    for n in range(1, 6)
    for suffix in ('tf_idf_content', 'gram_word2vec_content')
]
vector_columns.append('tf_idf_headline')

for column_name in vector_columns:
    # withColumn with an existing name replaces that column in place
    df_test = df_test.withColumn(
        column_name,
        VectorArrayConverter.array_to_vector(col(column_name)),
    )

## Load Model and Make Prediction

In [5]:
# Assemble all feature columns into a single 'features' vector.
# VectorAssembler concatenates its inputs in the listed order, so the order
# below is kept exactly: content tf-idf (1..5), headline tf-idf,
# content word2vec (1..5), then the token count.
feature_columns = (
    [f'{n}tf_idf_content' for n in range(1, 6)]
    + ['tf_idf_headline']
    + [f'{n}gram_word2vec_content' for n in range(1, 6)]
    + ['content_token_count']
)

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
df_test = assembler.transform(df_test)

                                                                                

In [7]:
# Load the saved random-forest classification model and score the test set.
rf_model = RandomForestClassificationModel.load(r'../model/best_model')
df_predictions = rf_model.transform(df_test)

# Distinct predicted labels; the upload step below iterates over them.
prediction_rows = df_predictions.select('prediction').distinct().collect()
distinct_predictions = [row['prediction'] for row in prediction_rows]

df_predictions.show()

24/09/02 19:45:30 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:45:32 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:45:32 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


+----+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+----------------------+----------------------+----------------------+----------------------+-------------------+--------------------+--------------------+--------------------+----------+
| key|category_index|author|                 url|            datetime|       publisher|     1tf_idf_content|     2tf_idf_content|     3tf_idf_content|     4tf_idf_content|     5tf_idf_content|     tf_idf_headline|1gram_word2vec_content|2gram_word2vec_content|3gram_word2vec_content|4gram_word2vec_content|5gram_word2vec_content|content_token_count|            features|       rawPrediction|         probability|prediction|
+----+--------------+------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+----

## Uploading Predictions to Neo4j

### Clearing All Data in Neo4j

In [10]:
# Delete all nodes in Neo4j. DETACH DELETE also removes each node's
# relationships, so the graph is empty afterwards.
delete_all_query = """
        MATCH (n)
        DETACH DELETE n
    """

records, summary, keys = neo4j_client.execute_query(delete_all_query, database='_neo4j')

### Inserting All Publishers and Categories into Neo4j

In [11]:
# Insert all publishers into Neo4j.
# MERGE is used instead of CREATE so that re-running this cell does not create
# duplicate Publisher nodes (duplicates would make later MERGE clauses on
# Publisher match several nodes and attach relationships to each of them).
publisher_name_list = [
    'Free Malaysia Today',
    'NST',
    'Selangor Journal',
    'The Borneo Post',
    'The Star',
    'Newsdata io'
]

records, summary, keys = neo4j_client.execute_query(
    """
        UNWIND $publisher_name_list AS publisher_name
        MERGE (:Publisher {publisher_name: publisher_name})
    """,
    database = '_neo4j',
    publisher_name_list = publisher_name_list
)

# Insert all news categories into Neo4j.
# Each category carries its display name and the float index that matches the
# values in the model's 'prediction' column.
news_category_list = [
    {'name': 'drug offences', 'index': 0.0},
    {'name': 'murder and homicide','index': 1.0},
    {'name': 'scam', 'index': 2.0},
    {'name': 'physical hurt','index': 3.0},
    {'name': 'robbery or theft','index': 4.0},
    {'name': 'money laundering','index': 5.0},
    {'name': 'others','index': 6.0},
    {'name': 'firearm or terrorism','index': 7.0},
    {'name': 'sexual harassment or sexual offences or rape','index': 8.0},
]

# MERGE on category_index (the property the upload step matches on) keeps this
# cell idempotent; the name is applied with SET so it is also refreshed on
# nodes that already exist.
records, summary, keys = neo4j_client.execute_query(
    """
        UNWIND $news_category_list AS category
        MERGE (c:Category {category_index: category.index})
        SET c.category_name = category.name
    """,
    database = '_neo4j',
    news_category_list = news_category_list
)

### Uploading News Nodes and Relationships

__Note__: To reduce the computational workload, IS_SIMILAR relationships are only computed between news articles that share the same predicted category.

In [12]:
# UDF to calculate cosine similarity
@udf(returnType = FloatType())
def compute_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two ML vectors as a float.

    Args:
        vec1: First vector; must expose ``toArray()``.
        vec2: Second vector; must expose ``toArray()``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``. Returns ``None`` (SQL NULL)
        when either input is null, and ``0.0`` when either vector has zero
        norm.
    """
    if vec1 is None or vec2 is None:
        return None

    # Convert each vector once; the arrays are reused for the dot product and the norms.
    arr1 = vec1.toArray()
    arr2 = vec2.toArray()

    norm_product = float(np.linalg.norm(arr1) * np.linalg.norm(arr2))
    if norm_product == 0.0:
        # A zero vector would otherwise yield NaN. Spark treats NaN as larger
        # than every other number, so NaN would pass a `>= threshold` filter
        # and be reported as a (bogus) highly similar pair.
        return 0.0

    return float(np.dot(arr1, arr2) / norm_product)

In [13]:
# Minimum cosine similarity for two articles to be linked by IS_SIMILAR.
SIMILARITY_THRESHOLD = 0.85

# Loop-invariant projection of the predictions: only the columns needed for
# the upload and the similarity computation. The tf-idf column is renamed,
# presumably so it can be used as a Neo4j property name without quoting.
df_news_features = (
    df_predictions
    .withColumnRenamed('1tf_idf_content', 'tf_idf_content1')
    .select('key', 'author', 'url', 'datetime', 'publisher', 'prediction', 'tf_idf_content1')
)

for category_index in distinct_predictions:
    # Getting predictions for that particular category
    df_predicted_category = df_news_features.filter(col('prediction') == category_index)

    # Fetching news nodes already stored in Neo4j for this category
    records, summary, keys = neo4j_client.execute_query(
        """
            MATCH (n:News) --> (:Category {category_index: $category_index})
            MATCH (n)-[:PUBLISHED_BY]->(p:Publisher)
            RETURN n, p
        """,
        database = '_neo4j',
        category_index = category_index
    )

    print(f'Fetching news Node of category index ({category_index}) tooked: {summary.result_available_after} ms')

    # Formatting news data fetched from Neo4j into the local column layout
    results = []
    for record in records:
        record_data = record.data()
        news = record_data['n']
        # NOTE(review): pickle.loads can execute arbitrary code if the stored
        # bytes were tampered with; acceptable only for a trusted database.
        news['tf_idf_content1'] = pickle.loads(news['tf_idf_content1'])
        news['publisher'] = record_data['p']['publisher_name']
        news['prediction'] = category_index
        results.append(news)

    # Combining news data from Neo4j with local data.
    # The union is cached because it is used three times below (both join sides
    # and the collect). dropDuplicates keeps an arbitrary row per key, so
    # caching also guarantees every use sees the same rows.
    df_result = spark.createDataFrame(results, schema = df_predicted_category.schema)
    df_union = df_predicted_category.union(df_result).dropDuplicates(subset=['key']).cache()

    # Finding similar articles: every unordered pair (key1 < key2) is scored once
    df_similar = (
        df_union
        .alias('df1')
        .join(df_union.alias('df2'), col('df1.key') < col('df2.key'))
        .withColumn('similarity',
                     compute_cosine_similarity('df1.tf_idf_content1', 'df2.tf_idf_content1'))
    )
    # Keeping only pairs whose cosine similarity is at least SIMILARITY_THRESHOLD
    df_similar_filtered = (
        df_similar
        .select(col('df1.key').alias('key1'), col('df2.key').alias('key2'), 'similarity')
        .filter(col('similarity') >= SIMILARITY_THRESHOLD)
    )

    # Converting News rows to a list of dicts to insert into Neo4j;
    # the vector is pickled to bytes so it can be stored as a property
    news_records = [news.asDict() for news in df_union.collect()]
    for news in news_records:
        news['tf_idf_content1'] = pickle.dumps(news['tf_idf_content1'])

    # Converting new (IS_SIMILAR) relationships to insert into Neo4j
    similarity_list = [similarity.asDict() for similarity in df_similar_filtered.collect()]

    # Uploading News nodes to Neo4j.
    # MERGE matches on the key only and the remaining properties are applied
    # with SET: merging on every property would create a second node for the
    # same key whenever any property differs, and MERGE rejects null property
    # values (e.g. a missing url).
    records, summary, keys = neo4j_client.execute_query(
        """
            UNWIND $news_records AS news_record
            MERGE (publisher: Publisher {publisher_name: news_record.publisher})
            MERGE (category: Category {category_index: news_record.prediction})
            MERGE (news_node:News {key: news_record.key})
            SET news_node.url = news_record.url,
                news_node.author = news_record.author,
                news_node.datetime = news_record.datetime,
                news_node.tf_idf_content1 = news_record.tf_idf_content1
            MERGE (news_node) - [:IS_CATEGORY] -> (category)
            MERGE (news_node) - [:PUBLISHED_BY] -> (publisher)
        """,
        database = '_neo4j',
        news_records = news_records
    )
    print(f'Uploading new news Node tooked: {summary.result_available_after} ms')

    # Uploading new relationships to Neo4j (one in each direction).
    # The score is applied with SET so a changed score updates the existing
    # relationship instead of adding a parallel one.
    records, summary, keys = neo4j_client.execute_query(
        """
            UNWIND $similarity_list AS similarity
            MATCH (news1:News {key: similarity.key1})
            MATCH (news2:News {key: similarity.key2})
            MERGE (news1) -[forward:IS_SIMILAR]-> (news2)
            SET forward.similarity = similarity.similarity
            MERGE (news2) -[backward:IS_SIMILAR]-> (news1)
            SET backward.similarity = similarity.similarity
        """,
        database = '_neo4j',
        similarity_list = similarity_list
    )

    print(f'Uploading new (IS_SIMILAR) relationships tooked: {summary.result_available_after} ms')
    print()

    # Release the cached union before moving on to the next category
    df_union.unpersist()

Fetching news Node of category index (0.0) tooked: 86 ms


24/09/02 19:46:09 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:10 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:11 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:12 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:12 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 176 ms
Uploading new (IS_SIMILAR) relationships tooked: 163 ms

Fetching news Node of category index (1.0) tooked: 1 ms


24/09/02 19:46:16 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:18 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:18 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:19 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:19 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
                                                                                

Uploading new news Node tooked: 374 ms
Uploading new (IS_SIMILAR) relationships tooked: 4411 ms

Fetching news Node of category index (4.0) tooked: 308 ms


24/09/02 19:46:31 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:32 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:33 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:34 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:34 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 422 ms
Uploading new (IS_SIMILAR) relationships tooked: 215 ms

Fetching news Node of category index (3.0) tooked: 3 ms


24/09/02 19:46:38 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:39 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:39 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:40 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:40 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 94 ms
Uploading new (IS_SIMILAR) relationships tooked: 253 ms

Fetching news Node of category index (2.0) tooked: 2 ms


24/09/02 19:46:44 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:45 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:45 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:46 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:47 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 49 ms
Uploading new (IS_SIMILAR) relationships tooked: 337 ms

Fetching news Node of category index (6.0) tooked: 3 ms


24/09/02 19:46:50 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:51 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:51 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:52 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:53 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 23 ms
Uploading new (IS_SIMILAR) relationships tooked: 86 ms

Fetching news Node of category index (5.0) tooked: 2 ms


24/09/02 19:46:56 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:57 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:57 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:58 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:46:58 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 62 ms
Uploading new (IS_SIMILAR) relationships tooked: 302 ms

Fetching news Node of category index (8.0) tooked: 2 ms


24/09/02 19:47:03 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:04 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:04 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:05 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:05 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 20 ms
Uploading new (IS_SIMILAR) relationships tooked: 39 ms

Fetching news Node of category index (7.0) tooked: 5 ms


24/09/02 19:47:08 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:09 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:09 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:10 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
24/09/02 19:47:11 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB


Uploading new news Node tooked: 31 ms
Uploading new (IS_SIMILAR) relationships tooked: 66 ms



## Querying from Neo4j

In [14]:
# Find news linked to the target article by an IS_SIMILAR relationship whose
# similarity is at least 0.9, listed from least to most similar.
target_key = 'k101'

similar_news_query = """
        MATCH (n:News) -[s:IS_SIMILAR]-> (:News {key: $key})
        MATCH (n) -[:PUBLISHED_BY]-> (p: Publisher)
        WHERE s.similarity >= 0.9
        RETURN n, s, p
        ORDER BY s.similarity
    """

records, summary, keys = neo4j_client.execute_query(
    similar_news_query,
    database='_neo4j',
    key=target_key,
)

print(f'Similar news (Fetched in {summary.result_available_after} ms)')
for record in records:
    news_node = record['n']
    # Fall back to a placeholder when the url is missing or empty
    url_text = news_node['url'] or 'MISSING URL'
    publisher_name = record['p']['publisher_name']
    similarity_score = record['s']['similarity']
    print(f" Key: {news_node['key']:10s} Publisher: {publisher_name:20s} URL: {url_text:.80s} Cosine Similarity: {similarity_score:.4f}")

print(f'Total similar articles:{len(records)}')

Similar news (Fetched in 84 ms)
 Key: k212       Publisher: Selangor Journal     URL: https://selangorjournal.my/2024/02/ex-religious-teacher-gets-33-years-jail-canin Cosine Similarity: 0.9014
 Key: k471       Publisher: Selangor Journal     URL: https://selangorjournal.my/2023/11/couples-death-sentence-commuted-to-30-years-j Cosine Similarity: 0.9044
 Key: k5471      Publisher: NST                  URL: MISSING URL Cosine Similarity: 0.9052
 Key: k587       Publisher: Selangor Journal     URL: https://selangorjournal.my/2023/10/two-minors-among-11-individuals-charged-with- Cosine Similarity: 0.9054
 Key: k725       Publisher: Selangor Journal     URL: https://selangorjournal.my/2023/07/man-charged-with-murdering-girlfriends-son/ Cosine Similarity: 0.9072
 Key: k3875      Publisher: The Borneo Post      URL: https://www.theborneopost.com/2020/09/02/cryptocurrency-miners-found-stealing-rm Cosine Similarity: 0.9100
 Key: k327       Publisher: Selangor Journal     URL: https://selangorjou

In [15]:
# Count news per (publisher, predicted category), ordered by publisher and
# then by ascending count.
news_count_query = """
        MATCH (n:News)-[:IS_CATEGORY]->(c:Category)
        MATCH (n)-[:PUBLISHED_BY]->(p:Publisher)
        RETURN p.publisher_name AS publisher, 
            c.category_name AS category, 
            COUNT(n) AS news_count
        ORDER BY publisher, news_count
    """

records, summary, keys = neo4j_client.execute_query(news_count_query, database='_neo4j')

print(f'Number of news published by Category (Fetched in {summary.result_available_after} ms)')
for record in records:
    # Categories without a name are shown with a placeholder label
    category_label = record['category'] or 'Not Applicable'
    print(f"Publisher: {record['publisher']:20s} Category: {category_label:50s} Count: {record['news_count']}")

# Grand total across all publisher/category rows
total = sum(record['news_count'] for record in records)

print(f"Total news count: {total}")

Number of news published by Category (Fetched in 72 ms)
Publisher: Free Malaysia Today  Category: murder and homicide                                Count: 2
Publisher: NST                  Category: drug offences                                      Count: 1
Publisher: NST                  Category: robbery or theft                                   Count: 1
Publisher: NST                  Category: physical hurt                                      Count: 2
Publisher: NST                  Category: scam                                               Count: 2
Publisher: NST                  Category: others                                             Count: 3
Publisher: NST                  Category: firearm or terrorism                               Count: 4
Publisher: NST                  Category: murder and homicide                                Count: 19
Publisher: Newsdata io          Category: murder and homicide                                Count: 1
Publisher: Selangor Journ

In [16]:
# Release external resources. spark.stop() sits in `finally` so the Spark
# session is still shut down when stopping the Redis service raises.
try:
    redis_client.stop_service()
finally:
    spark.stop()

[sudo] password for student: 