In [1]:
!pip install rdflib  



In [2]:

from rdflib import Graph, Literal, RDF, RDFS, Namespace, URIRef

#Namespaces
SCHEMA = Namespace("http://schema.org/")
KG = Namespace("http://kg-course.io/food-nutrition/")

g = Graph()
g.bind("schema", SCHEMA)
g.bind("ex", KG)

#Classes
classes = [
    (KG.Recipe, SCHEMA.Recipe, "A food recipe"),
    (KG.Restaurant, SCHEMA.FoodEstablishment, "An enstablishment that serves food"),
    (KG.Nutrition, SCHEMA.NutritionInformation, "Nutritional facts about a recipe"),
    (KG.Review, SCHEMA.Review, "A user review")
]

for uri, s_type, comment in classes:
    g.add((uri, RDF.type, RDFS.Class))
    g.add((uri, RDFS.subClassOf, s_type))
    g.add((uri, RDFS.comment, Literal(comment)))


with open("vocabulary.ttl", "w") as f:
    f.write(g.serialize(format="turtle"))

print("Vocabulary defined")


Vocabulary defined


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
import pandas as pd
from rdflib import Graph, Literal, RDF, Namespace, URIRef
from rdflib.namespace import XSD

# graph and namespaces
g = Graph()
SCHEMA = Namespace("http://schema.org/")
KG = Namespace("http://kg-course.io/food-nutrition/")
g.bind("schema", SCHEMA)
g.bind("ex", KG)

recipes_df = pd.read_csv('data/Recipes.csv', sep=';', engine='python', on_bad_lines='skip').head(10000)
restaurants_df = pd.read_csv('data/Restaurants.csv', sep=';', engine='python', on_bad_lines='skip').head(10000)
nutrition_df = pd.read_csv('data/Nutrition.csv', sep=';', engine='python', on_bad_lines='skip').head(10000)

print("Data loaded successfully from Drive!")

def create_kg():
    for index, row in recipes_df.iterrows():
        recipe_uri = KG[f"recipe_{row['RecipeId']}"]
        g.add((recipe_uri, RDF.type, SCHEMA.Recipe))
        g.add((recipe_uri, SCHEMA.name, Literal(row['Name'])))
        g.add((recipe_uri, SCHEMA.recipeCategory, Literal(row['RecipeCategory'])))
        g.add((recipe_uri, SCHEMA.cookTime, Literal(row['CookTime'])))

        nutri_uri = KG[f"nutrition_{row['RecipeId']}"]
        g.add((recipe_uri, SCHEMA.nutrition, nutri_uri))
        g.add((nutri_uri, RDF.type, SCHEMA.NutritionInformation))

    for index, row in restaurants_df.iterrows():
        rest_uri = KG[f"restaurant_{row['Restaurant.ID']}"]
        g.add((rest_uri, RDF.type, SCHEMA.FoodEstablishment))
        g.add((rest_uri, SCHEMA.name, Literal(row['Restaurant.Name'])))
        g.add((rest_uri, SCHEMA.address, Literal(row['Address'])))
        g.add((rest_uri, SCHEMA.servesCuisine, Literal(row['Cuisines'])))




    g.serialize(destination='KEN4256-structured-KG-TeamID.ttl', format='turtle')



Data loaded successfully from Drive!


## Task 3. Enriching graph with unstructured data and external KGs

In [6]:
import warnings

warnings.filterwarnings('ignore')
# reviews_df = pd.read_csv("data/cleaned_reviews.csv")


text = ["I have made this pie instead of plain ol' pumpkin pie for the last 7 years.  Everyone always raves about it.  The flavor is wonderful and the texture is slightly lighter than traditional pumpin pie\"	 I suspect due to the substitution of light cream instead of canned milk.  	If you try this	\" you won't go back to plain ol' pumkin again!", "I hate this freaking recipe it's the worst thing i've every eaten in my life crazy", "I don't know, it seems fine, but nothing special really. I don't know what to think about it", "I decided to add milk chocolate and it resulted in a more colourful flavour!"]


### 3.1. Extracting knowledge from unstructured data
WiP: So far I have a few options: using an llm(might be not feasible because on a scale of thousands of reviews we are most likely going to exceed free api limit, unless the OPENAI key provided in the lab includes these costs) or just connect all extracted features to the recipe with a single relation(e.g. ex:extractedIngredients or ex:mentionedIngredients). Second approach is not as high-quality, but might be enough for our needs.

In [None]:
!pip install spacy_llm
!pip install langchain
!pip install langchain-openai
!pip install dotenv
!pip install openai

In [9]:
import os
from dotenv import set_key, load_dotenv

with open("OPENAI_API_KEY.txt", "r") as f:
    api_key = f.read().strip().strip('"')

set_key('.env', 'OPENAI_API_KEY', api_key)

# Bring environment variables from .env into os.environ
load_dotenv(override=True)
print(os.getenv('OPENAI_API_KEY'))

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


In [10]:
from spacy_llm.util import assemble
import spacy

nlp = assemble("config/lab3_config_rel_1.cfg")

In [11]:
# consider adding RECIPE_ID to the review text, so that model can have an object for relation (RECIPE_ID [addition] Chocolate milk)
doc = nlp(". ".join(text))

In [12]:
spacy.displacy.render(doc,jupyter=1, style = "ent")

In [13]:
triples = []
for r in doc._.rel:
        print(f"  - {doc.ents[r.dep]} [{r.relation}] {doc.ents[r.dest]}")
        triples.append((doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text))

  - light cream [substitution] canned milk
  - milk chocolate [addition] pumpkin pie


### 3.2. Sentiment Analysis
For the Sentiment analysis we are going to use [cardiffnlp/twitter-roberta-base-sentiment-latest](https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest) classification model. It was trained on posts from Twitter, which are short and often informal, which makes them similar to reviews, so this model is a perfect fit for our needs. The model output has 3 labels: Negative, Neutral, Positive

In [None]:
!pip install transformers datasets evaluate accelerate

In [15]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np

In [16]:
# Use softmax to normalize the output to probability distribution
def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

Example(until data cleaning code is finished. After that we will be using dataframe from cleaned_reviews.scv file)

In [17]:
reviews_df = pd.DataFrame({"Review": text})
print(reviews_df)

                                              Review
0  I have made this pie instead of plain ol' pump...
1  I hate this freaking recipe it's the worst thi...
2  I don't know, it seems fine, but nothing speci...
3  I decided to add milk chocolate and it resulte...


In [18]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"

# Using tokenizer and config provided by the model
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Loading weights: 100%|██████████| 201/201 [00:00<00:00, 1291.40it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: cardiffnlp/twitter-roberta-base-sentiment-latest
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.pooler.dense.weight     | UNEXPECTED |  | 
roberta.embeddings.position_ids | UNEXPECTED |  | 
roberta.pooler.dense.bias       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [19]:
encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
output = model(**encoded_input)

scores = output[0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores, axis=1)
ranking = np.flip(ranking, axis=1)

reviews_df["Sentiment"] = ranking[:, 0]
reviews_df["Sentiment"] = reviews_df["Sentiment"].map(lambda x: config.id2label[x])
print(reviews_df)

# for i in range(len(ranking)):
#     for j in range(ranking.shape[1]):
#         l = config.id2label[ranking[i][j]]
#         s = scores[i][ranking[i][j]]
#         print(f"{i+1}) {l} {np.round(float(s), 4)}")
#
#     print()


                                              Review Sentiment
0  I have made this pie instead of plain ol' pump...  positive
1  I hate this freaking recipe it's the worst thi...  negative
2  I don't know, it seems fine, but nothing speci...   neutral
3  I decided to add milk chocolate and it resulte...  positive
