In [1]:
# !pip install rdflib  

In [2]:
from rdflib import Graph, Literal, RDF, RDFS, Namespace, URIRef

# Namespaces: schema.org terms plus the project's own food-nutrition vocabulary
SCHEMA = Namespace("http://schema.org/")
KG = Namespace("http://kg-course.io/food-nutrition/")

g = Graph()
g.bind("schema", SCHEMA)
g.bind("ex", KG)

# Classes: (project class URI, schema.org superclass, human-readable description)
classes = [
    (KG.Recipe, SCHEMA.Recipe, "A food recipe"),
    (KG.Restaurant, SCHEMA.FoodEstablishment, "An establishment that serves food"),
    (KG.Nutrition, SCHEMA.NutritionInformation, "Nutritional facts about a recipe"),
    (KG.Review, SCHEMA.Review, "A user review")
]

# Declare every project class, link it to its schema.org superclass and attach
# its description as an rdfs:comment.
for uri, s_type, comment in classes:
    g.add((uri, RDF.type, RDFS.Class))
    g.add((uri, RDFS.subClassOf, s_type))
    g.add((uri, RDFS.comment, Literal(comment)))


# Let rdflib write the file itself (the same pattern create_kg uses for the KG
# export). This does not depend on whether serialize() returns str or bytes in
# the installed rdflib version, nor on the platform's default text encoding.
g.serialize(destination="vocabulary.ttl", format="turtle")

print("Vocabulary defined")


Vocabulary defined


In [4]:
import pandas as pd
from rdflib import Graph, Literal, RDF, Namespace, URIRef
from rdflib.namespace import XSD

# Vocabulary prefixes used for every URI minted in this cell
SCHEMA = Namespace("http://schema.org/")
KG = Namespace("http://kg-course.io/food-nutrition/")

# Fresh graph for the instance data, with readable prefixes in the Turtle output
g = Graph()
for prefix, namespace in (("schema", SCHEMA), ("ex", KG)):
    g.bind(prefix, namespace)


def _read_sample(csv_path):
    """Read a ';'-separated CSV, skipping malformed lines, and keep the first 10000 rows."""
    table = pd.read_csv(csv_path, sep=';', engine='python', on_bad_lines='skip')
    return table.head(10000)


recipes_df = _read_sample('data/cleaned_Recipes.csv')
restaurants_df = _read_sample('data/cleaned_Restaurants.csv')
nutrition_df = _read_sample('data/cleaned_Nutrition.csv')

# NOTE(review): the files are read from the relative data/ folder; the "Drive"
# wording in the message looks like a leftover — confirm before changing it.
print("Data loaded successfully from Drive!")

def create_kg():
    """Fill the module-level graph ``g`` from the recipe and restaurant tables and save it.

    For every row of ``recipes_df`` a ``schema:Recipe`` node is created with its
    name, category and cook time, plus a linked ``schema:NutritionInformation``
    node. For every row of ``restaurants_df`` a ``schema:FoodEstablishment`` node
    is created with its name, address and cuisines. The graph is then written
    to ``KEN4256-structured-KG-TeamID.ttl`` in Turtle format.

    Rows without an identifier are skipped, and missing cell values are left out
    of the graph instead of being stored as the literal "nan".

    NOTE(review): ``nutrition_df`` is not read here, so the nutrition nodes only
    receive a type. This function is also not invoked in the cell that defines
    it — call ``create_kg()`` to actually build and export the graph.
    """
    def add_literal(subject, predicate, value):
        # pandas represents empty CSV cells as NaN/None; do not emit triples for them
        if pd.notna(value):
            g.add((subject, predicate, Literal(value)))

    for _, row in recipes_df.iterrows():
        recipe_id = row['RecipeId']
        if pd.isna(recipe_id):
            # Without an id, every such row would collapse onto the same URI
            continue

        recipe_uri = KG[f"recipe_{recipe_id}"]
        g.add((recipe_uri, RDF.type, SCHEMA.Recipe))
        add_literal(recipe_uri, SCHEMA.name, row['Name'])
        add_literal(recipe_uri, SCHEMA.recipeCategory, row['RecipeCategory'])
        add_literal(recipe_uri, SCHEMA.cookTime, row['CookTime'])

        # The nutrition node shares the recipe id so the two stay paired
        nutri_uri = KG[f"nutrition_{recipe_id}"]
        g.add((recipe_uri, SCHEMA.nutrition, nutri_uri))
        g.add((nutri_uri, RDF.type, SCHEMA.NutritionInformation))

    for _, row in restaurants_df.iterrows():
        restaurant_id = row['Restaurant.ID']
        if pd.isna(restaurant_id):
            continue

        rest_uri = KG[f"restaurant_{restaurant_id}"]
        g.add((rest_uri, RDF.type, SCHEMA.FoodEstablishment))
        add_literal(rest_uri, SCHEMA.name, row['Restaurant.Name'])
        add_literal(rest_uri, SCHEMA.address, row['Address'])
        add_literal(rest_uri, SCHEMA.servesCuisine, row['Cuisines'])

    g.serialize(destination='KEN4256-structured-KG-TeamID.ttl', format='turtle')



Data loaded successfully from Drive!


## Task 3. Enriching graph with unstructured data and external KGs

In [None]:
import warnings

# Silence every Python warning from here on (this also affects all later cells)
warnings.filterwarnings(action='ignore')

# Cleaned user reviews; read with pandas' default CSV settings
reviews_df = pd.read_csv(filepath_or_buffer="data/cleaned_reviews.csv")


### 3.1. Extracting knowledge from unstructured data
This task turned out to be quite complicated. Initially, we considered three options:
- NER with a spaCy NLP model (e.g. en_core_web_trf)
- NER with an LLM
- NER+RE with an LLM

Models like en_core_web_trf are highly optimized and work well for general-purpose NER, but we only need to extract particular types of entities from reviews, which would require retraining.
This leaves us with LLMs, which are good but are limited by API rate limits that are usually too low to process the amount of data we have. Thus, the most logical option is to download a lightweight LLM and run it locally, which is thankfully a quite straightforward process with spaCy.

The choice of model is the most important part here: some models are too heavy to run with limited hardware resources, while others are simply not 'smart' enough for our task. It seemed that the optimal model for our needs, among those [available from spaCy](https://spacy.io/api/large-language-models#models-hf), would be Mistral-7B-instruct, which is quite fast and performs well. However, due to limited resources we decided to restrict ourselves to NER ONLY; no Relation Extraction was performed, as it would be too computationally expensive to run on our devices.

Unfortunately, due to an internal error in the SpaCy source code, an outdated parameter was passed that the Mistral model did not accept, which led to an error during `assemble`. The only straightforward way of fixing it was to edit [this file](.venv/Lib/site-packages/spacy_llm/models/hf/mistral.py) in the SpaCy source code (deleting the resume_download=true parameter from the \_\_init__() method).

**NOTE: If you want to run LLM inference on your computer, be aware that it requires a GPU, a large amount of RAM, and editing the aforementioned file.
We provide a file with the NER results in the [data](data) folder.**

The following code is commented out to avoid errors during run of the Jupyter notebook

In [None]:
# from spacy_llm.util import assemble
# import spacy
# from transformers import AutoModelForCausalLM, AutoTokenizer

# nlp = assemble("config/ner_rel_llm.cfg")

`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 291/291 [00:00<00:00, 1043.08it/s, Materializing param=model.norm.weight]                              
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# from tqdm import tqdm
# import warnings

# warnings.filterwarnings('ignore')

# docs = nlp.pipe(reviews_df["Review"].head(500), batch_size=1)

# extracted_entities = []
# for doc in tqdm(docs, total=500):
#     extracted_entities.append(doc.ents)

# reviews_df["extracted_ingredients"] = pd.Series(extracted_entities)

# reviews_df.to_csv("data/extracted_reviews.csv")

  0%|          | 0/500 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  0%|          | 1/500 [00:24<3:21:47, 24.26s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/500 [00:57<4:03:25, 29.33s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `atten

In [None]:
# doc = nlp(". ".join(["123: I have made this pie instead of plain ol' pumpkin pie for the last 7 years.  Everyone always raves about it.  The flavor is wonderful and the texture is slightly lighter than traditional pumpin pie\"	 I suspect due to the substitution of light cream instead of canned milk.  	If you try this	\" you won't go back to plain ol' pumkin again!", "I hate this freaking recipe it's the worst thing i've every eaten in my life crazy", "I don't know, it seems fine, but nothing special really. I don't know what to think about it", "I decided to add milk chocolate and it resulted in a more colourful flavour!"]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
# Load the pre-computed NER results shipped in the data folder.
# NOTE(review): the commented-out export cell calls to_csv() without index=False,
# so this frame probably carries an extra "Unnamed: 0" column — confirm.
extracted_ingredients = pd.read_csv(filepath_or_buffer="data/extracted_reviews.csv")


### 3.2. Sentiment Analysis
For the Sentiment analysis we are going to use [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest) classification model. It was trained on posts from Twitter, which are short and often informal, which makes them similar to reviews, so this model is a perfect fit for our needs. The model output has 3 labels: Negative, Neutral, Positive

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np

In [None]:
# Use softmax to normalize the output to probability distribution
def softmax(x, axis=-1):
    """Turn raw scores into probabilities that sum to 1 along ``axis``.

    Args:
        x: array-like of scores. A 1-D vector holds the scores of one sample;
            a 2-D array holds one sample per row.
        axis: axis holding the competing classes. The default (last axis)
            normalises each row of a 2-D batch independently and gives the
            same result as before for 1-D input.

    Returns:
        numpy array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # Subtract the per-sample maximum before exponentiating to avoid overflow;
    # this does not change the result.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    # keepdims lets the per-sample sums broadcast back over the class axis
    return e_x / e_x.sum(axis=axis, keepdims=True)

Example (until the data cleaning code is finished; after that we will use the dataframe from the cleaned_reviews.csv file)

In [None]:
# Hugging Face Hub identifier of the sentiment classifier
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Config (used later for the id -> label mapping), tokenizer and weights all
# come from the same checkpoint
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
import torch

# Reviews per forward pass; keeps memory bounded instead of pushing the whole
# dataset through the model at once
BATCH_SIZE = 32

# Texts to classify, one per row of reviews_df so the labels line up with the frame.
# NOTE(review): the "Review" column name is taken from the NER cell — confirm.
text = reviews_df["Review"].fillna("").astype(str).tolist()

logit_batches = []
for start in range(0, len(text), BATCH_SIZE):
    encoded_input = tokenizer(
        text[start:start + BATCH_SIZE],
        return_tensors='pt', padding=True, truncation=True,
    )
    # Inference only: skip gradient bookkeeping
    with torch.no_grad():
        output = model(**encoded_input)
    logit_batches.append(output[0].detach().cpu().numpy())

if logit_batches:
    logits = np.concatenate(logit_batches, axis=0)
else:
    # No reviews: keep a well-formed (0, n_labels) array so the code below still works
    logits = np.empty((0, len(config.id2label)))

# Normalise each review's scores over the labels (one row at a time)
scores = np.array([softmax(row) for row in logits]).reshape(logits.shape)

# Label indices per review, most probable first
ranking = np.argsort(scores, axis=1)[:, ::-1]

# Keep only the top label and translate its index to the model's label name
reviews_df["Sentiment"] = [config.id2label[int(label_id)] for label_id in ranking[:, 0]]

# Show a preview instead of dumping the whole frame
reviews_df.head()

# for i in range(len(ranking)):
#     for j in range(ranking.shape[1]):
#         l = config.id2label[ranking[i][j]]
#         s = scores[i][ranking[i][j]]
#         print(f"{i+1}) {l} {np.round(float(s), 4)}")
#
#     print()
