# Create a dictionary of adjectives and insert them into the Redis key-value database

## Parse dictionary files (json and custom file format)

In [1]:
import re
import os
import json

# Natural Language Toolkit already offers many libraries regarding computer linguistics
# E.g. "from nltk.corpus import wordnet as wn" could be used to categorise the part of speech of a given word
# For education's sake, we try to create our own list of adjectives using Wordnet and/or a json file

# allowed_specials (syntactic markers)
# p predicate position
# a prenominal (attributive) position
# ip immediately postnominal position
def read_word_net_dictionary(file_path, allowed_specials=None, allow_adjective_satellite=True):
    """Extract single-word adjectives from a WordNet adjective data file.

    A data line is recognised by the shape
    ``<digits> <2 chars> <ss_type> <2 chars> <word> <lex_id> [<word> <lex_id> ...]``
    (see https://wordnet.princeton.edu/documentation/wndb5wn and
    https://wordnet.princeton.edu/documentation/wninput5wn). Lines that do not
    match are skipped.

    Args:
        file_path: Path to the WordNet file, e.g. "./adjectives_wordnet.adj".
        allowed_specials: Syntactic markers whose words are kept: 'p'
            (predicate position), 'a' (prenominal/attributive position),
            'ip' (immediately postnominal position). A word carrying a marker
            that is not listed is dropped. ``None`` (default) allows none.
        allow_adjective_satellite: If False, lines whose ss_type is 's'
            (adjective satellite) are skipped entirely.

    Returns:
        Set of lower-cased adjectives with any syntactic marker removed.
        Words containing "_" (multi-word collocations) are never included.
    """
    # None instead of a mutable [] default; an empty tuple behaves the same for `in`
    if allowed_specials is None:
        allowed_specials = ()

    # compile once instead of re-resolving the patterns for every line/word
                                             # word with opt. info (.) followed by space + hexCode + space
    synset_pattern = re.compile(
        r"\d+ \w{2} (\w) \w{2} ((?:[a-zA-Z_\-.']+(?:\((a|p|ip)\))? [0-9a-fA-F] )+)"
    )
    # between words is a one-digit hex code distinctly identifying a word within a lexicographer's file
    lex_id_pattern = re.compile(r" [0-9a-fA-F] ")
    # trailing "(x)" / "(xx)" syntactic marker
    marker_pattern = re.compile(r"\((.{1,2})\)$")
    marker_strip_pattern = re.compile(r"\(.+\)")

    adjectives_word_net = set()

    with open(file_path, "r") as file:
        # iterate lazily; the whole file never has to be held in memory
        for line in file:
            match = synset_pattern.search(line)
            if match is None:
                continue
            # the satellite check depends on the line only, so decide before looking at words
            if not allow_adjective_satellite and match.group(1) == 's':
                continue

            # drop the lex ids; the remaining text is a space-separated word list
            words = lex_id_pattern.sub(" ", match.group(2)).split()
            for word in words:
                # "_" stands for a space -> multi-word expression, not a single adjective
                if "_" in word:
                    continue
                marker = marker_pattern.search(word)
                # keep the word only if it has no marker or the marker is whitelisted
                if marker is not None and marker.group(1) not in allowed_specials:
                    continue
                adjectives_word_net.add(marker_strip_pattern.sub("", word).lower())
    return adjectives_word_net

def read_nltk_extraction(directory_path):
    """Collect adjectives from every JSON dictionary file in a directory.

    Each regular file directly inside `directory_path` is parsed as a JSON
    object mapping a word to an entry. An entry's 'MEANINGS' value is read as
    a mapping whose values are sequences starting with the part of speech.
    Only the first meaning is considered: the word counts as an adjective
    when that part of speech equals 'adjective' (case-insensitive).

    Args:
        directory_path: Directory containing the JSON files; sub-directories
            are not traversed.

    Returns:
        Set of lower-cased words whose first meaning is an adjective.
    """
    adjectives_nltk = set()

    # traverse directory
    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        # skip sub-directories and other non-files
        if not os.path.isfile(file_path):
            continue

        # JSON is UTF-8 by specification; do not depend on the platform's default encoding
        with open(file_path, "r", encoding="utf-8") as file:
            file_json = json.load(file)

        for key, value in file_json.items():
            # a missing, None or empty 'MEANINGS' all mean "nothing to evaluate"
            meanings = value.get('MEANINGS')
            if not meanings:
                continue
            # only take first meaning into consideration - if not enough adjectives are found,
            # evaluate a better approach, e.g. taking words whose adjective-meanings make up
            # >= 50% of all meanings
            first_meaning = next(iter(meanings.values()))
            if first_meaning[0].lower() == 'adjective':
                adjectives_nltk.add(key.lower())
    return adjectives_nltk

import redis

# create connection to redis db (decode_responses=True -> replies come back as str, not bytes)
redis_conn = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
# remove existing entries (this clears everything stored in db 0)
redis_conn.flushdb()

# all adjectives + markers
#adjectives = read_word_net_dictionary("./adjectives_wordnet.adj", ['a', 'ip', 'p'], True)

# without adjective satellites
#adjectives = read_word_net_dictionary("./adjectives_wordnet.adj", [], False)

# using dictionary json
adjectives = read_nltk_extraction('./Dictionary JSON')

# preview
print(list(adjectives)[0:10])

# Redis is used as a membership set: a key exists <=> the word is an adjective
# (the value is irrelevant). Queue all SETs in one pipeline so they are sent in
# bulk instead of paying one network round-trip per adjective.
with redis_conn.pipeline(transaction=False) as pipe:
    for adjective in adjectives:
        pipe.set(adjective, 1)
    pipe.execute()

# sanity check - the stored 1 is read back as the string '1', a missing key as None
assert redis_conn.get("excited") == '1'
assert redis_conn.get("house") is None

from Kafka_Helpers import Producer, Consumer

# Words that must never be forwarded, even if redis knows them as adjectives.
# An empty `{ }` literal is a dict, so use set() to make the intended type explicit;
# fill it if necessary, e.g. blacklist = {"word_a", "word_b"}
blacklist = set()

# producer used by subscribe_handler to publish the filtered reviews
dictionary_producer = Producer('localhost', 29092)

def subscribe_handler(key, value):
    """Reduce every review of an incoming message to its known adjectives.

    `value` is a JSON document with the fields 'movie_id', 'title' and
    'reviews'; every review is iterated word by word. A word survives when it
    is not in `blacklist` and exists as a key in redis. The filtered reviews
    are sent to the 'adjectives' topic under the unchanged `key`.
    """
    payload = json.loads(value)

    def is_adjective(token):
        # blacklist is checked first, so blacklisted words never cause a redis lookup
        return token not in blacklist and redis_conn.get(token) is not None

    filtered_reviews = []
    for review in payload['reviews']:
        filtered_reviews.append([token for token in review if is_adjective(token)])

    message = {
        'movie_id': payload['movie_id'],
        'title': payload['title'],
        'reviews': filtered_reviews,
    }
    dictionary_producer.send('adjectives', key, message)

# Consume the 'movie_reviews' topic on localhost:29092 with subscribe_handler as callback
# (presumably called once per received message - confirm in Kafka_Helpers.Consumer).
dictionary_consumer = Consumer('localhost', 29092, 'movie_reviews', subscribe_handler)


['celtic', 'ambulacral', 'austere', 'breakaway', 'leafed', 'lobed', 'maternalistic', 'predaceous', 'stillborn', 'young']
Waiting for new events...


New event received


## Get adjectives and fill RedisDB

In [None]:
import redis

# create connection to redis db (decode_responses=True -> replies are returned as str)
redis_conn = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)
# remove existing entries - disabled here, so whatever is already stored in db 0 is kept
# redis_conn.flushdb()

# alternative source: all WordNet adjectives + markers
#adjectives = read_word_net_dictionary("./adjectives_wordnet.adj", ['a', 'ip', 'p'], True)

# alternative source: WordNet without adjective satellites
#adjectives = read_word_net_dictionary("./adjectives_wordnet.adj", [], False)

# active source: the dictionary json files
adjectives  = read_nltk_extraction('./Dictionary JSON')

# preview of (at most) ten adjectives
print(list(adjectives)[0:10])

# Insertion is disabled in this cell; nothing below writes to redis, so the checks
# rely on the adjectives already being stored (presumably by an earlier run).
# for adjective in adjectives:
#     # insert adjective into redis db (value is irrelevant)
#     redis_conn.set(adjective, 1)

# test - value is string if not None (weird, but okay)
assert redis_conn.get("excited") == '1'
assert redis_conn.get("house") is None

# show what is stored for "the"
print(redis_conn.get("the"))

## Subscribe to the 'movie_reviews' topic

In [None]:
from Kafka_Helpers import Producer, Consumer

# Words that must never be forwarded, even if redis knows them as adjectives.
# An empty `{ }` literal is a dict, so use set() to make the intended type explicit;
# fill it if necessary, e.g. blacklist = {"word_a", "word_b"}
blacklist = set()

# producer used by subscribe_handler to publish the filtered reviews
dictionary_producer = Producer('localhost', 29092)

def subscribe_handler(key, value):
    """Filter the reviews of one message down to known adjectives and publish them.

    Args:
        key: Message key; reused unchanged for the outgoing message.
        value: JSON document with the fields 'movie_id', 'title' and
            'reviews', where every review is iterated word by word.

    A word is kept when it is not in `blacklist` and exists as a key in redis.
    The result is sent to the 'adjectives' topic.
    """
    infos_and_reviews = json.loads(value)
    reviews = infos_and_reviews['reviews']
    # blacklist is checked first, so blacklisted words never cause a redis lookup
    reviews_words = [
        [word for word in review if word not in blacklist and redis_conn.get(word) is not None]
        for review in reviews
    ]
    dictionary_producer.send('adjectives', key, {
        'movie_id': infos_and_reviews['movie_id'],
        # was misspelled as 'tile'; the outgoing key must match the incoming 'title' field
        'title': infos_and_reviews['title'],
        'reviews': reviews_words
    })

# Consume the 'movie_reviews' topic on localhost:29092 with subscribe_handler as callback
# (presumably called once per received message - confirm in Kafka_Helpers.Consumer).
dictionary_consumer = Consumer('localhost', 29092, 'movie_reviews', subscribe_handler)

## Simulate receiving movie infos and reviews

In [None]:
import json
import os
import re

import requests

# NOTE(review): this TMDB API key is committed in plain text. Supply the key via the
# TMDB_API_KEY environment variable instead, then rotate and remove the literal fallback.
# (A local `secrets.py` holding the key would shadow the stdlib `secrets` module.)
api_key = os.environ.get("TMDB_API_KEY", "105864a59e519ef281a74ca3af6c1b17")

test_producer = Producer('localhost', 29092)

# requests waits indefinitely unless a timeout is given; fail fast on HTTP errors
# instead of tripping over a missing 'results' key later on
request = requests.get(
    "https://api.themoviedb.org/3/movie/now_playing?language=en-US",
    params={'api_key': api_key},
    timeout=10,
)
request.raise_for_status()
response = request.json()
# only the first movie of the listing is used for the simulation
first_result = response['results'][0]

request = requests.get(
    f"https://api.themoviedb.org/3/movie/{first_result['id']}/reviews?language=en-US",
    params={'api_key': api_key},
    timeout=10,
)
request.raise_for_status()
response = request.json()

reviews = [review['content'] for review in response['results']]
# Strip punctuation but keep every kind of whitespace (newlines, tabs) so words on
# adjacent lines are not glued together; split() then breaks on any whitespace run
# and yields no empty tokens.
result = [re.sub(r"[^\w\s\-]", "", review.lower()).split() for review in reviews]

# one list of lower-cased words per review, in the shape subscribe_handler reads
test_producer.send("movie_reviews", "key", json.dumps({
    "movie_id": first_result['id'],
    "title": first_result['title'],
    "reviews": result
}))