In [10]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Load the spaCy model for NLP
nlp = spacy.load("en_core_web_sm")

# Load the cleaned wine dataset
file_path = '/Users/annelise/Documents/GitHub/Wine_tasting_KG/data_kaggle/'
cleaned_wine_data = pd.read_csv(file_path + 'cleaned_wine_data_price_max=50_desc=cleaned.csv')

# Vectorize the descriptions to find the minimal common words
vectorizer = CountVectorizer(stop_words='english', min_df=1)
X = vectorizer.fit_transform(cleaned_wine_data['processed_description'])

# Get the vocabulary and its frequency in the dataset
vocab = vectorizer.get_feature_names_out()
word_counts = X.toarray().sum(axis=0)

# Sort words by frequency
sorted_vocab = [word for _, word in sorted(zip(word_counts, vocab), reverse=True)]

# Initialize a set to track all used words
used_words_set = set()

# Function to keep the minimal common words that allow all wines to have at least 3 words
def minimal_common_words(row, sorted_vocab, min_words=3):
    words = row.split()
    selected_words = [word for word in sorted_vocab if word in words]
    used_words_set.update(selected_words[:min_words])  # Track the words used
    return " ".join(selected_words[:min_words])

# Apply the function to create the new flavor column
cleaned_wine_data['minimal_flavors'] = cleaned_wine_data['processed_description'].apply(
    lambda row: minimal_common_words(row, sorted_vocab))

# Calculate the number of words in the minimal_flavors column
cleaned_wine_data['flavor_word_count'] = cleaned_wine_data['minimal_flavors'].apply(lambda x: len(x.split()))

# Get the total number of unique words used
total_unique_words_used = len(used_words_set)

# Display the first few rows and the total number of unique words used
print(cleaned_wine_data[['processed_description', 'minimal_flavors', 'flavor_word_count']].head())
print(f"Total unique words used across all wines to meet the 3-word minimum: {total_unique_words_used}")

                               processed_description        minimal_flavors  \
0  ripe fruity firm tannin juicy red berry fruit ...   fruit acidity tannin   
1  tart snappy lime flesh green pineapple poke cr...    acidity crisp green   
2  pineapple rind lemon pith orange blossom palat...    palate finish lemon   
3  blackberry raspberry typical navarran whiff gr...   fruit finish acidity   
4  bright informal red candied berry white pepper...  palate acidity tannin   

   flavor_word_count  
0                  3  
1                  3  
2                  3  
3                  3  
4                  3  
Total unique words used across all wines to meet the 3-word minimum: 444


### Create Knowledge-Graph

In [12]:
# Create an empty graph
G = nx.Graph()

# Add nodes and edges to the graph based on the dataset
for _, row in cleaned_wine_data.iterrows():
    wine_node = f"Wine: {row['title']}"
    G.add_node(wine_node, type='wine')

    # Add winery, variety, and country as nodes connected to the wine
    G.add_node(row['winery'], type='winery')
    G.add_node(row['variety'], type='variety')
    G.add_node(row['country'], type='country')
    G.add_node(row['province'], type='province')

    # Connect province to country (subproperty relationship)
    G.add_edge(row['province'], row['country'], relation='Located in')

    # Connect wine to its attributes
    G.add_edge(wine_node, row['winery'], relation='Produced by')
    G.add_edge(wine_node, row['variety'], relation='From Variety')
    G.add_edge(wine_node, row['province'], relation='Produced in')

    # Add flavors as nodes connected to the wine
    for flavor in row['minimal_flavors'].split():
        G.add_node(flavor, type='flavor')
        G.add_edge(wine_node, flavor, relation='Has Flavor')

    # Add judge nodes and connect them with points and flavor profile
    if pd.notnull(row['taster_name']):
        judge_node = f"Judge: {row['taster_name']}"
        G.add_node(judge_node, type='judge')
        G.add_edge(judge_node, wine_node, relation='Rated', points=row['points'])
        for flavor in row['minimal_flavors'].split():
            G.add_edge(judge_node, flavor, relation='Attributed Flavor')

# Draw the graph
plt.figure(figsize=(15, 10))
pos = nx.spring_layout(G, k=0.3, iterations=50)  # Adjust k and iterations for clarity
nx.draw(G, pos, with_labels=True, node_color="skyblue", edge_color="gray", node_size=2000, font_size=8)
plt.title("Knowledge Graph of Wines, Judges, and Attributes")
plt.show()

# Optionally, save the graph for further use
#nx.write_gml(G, "wine_knowledge_graph_with_judges.gml")

KeyboardInterrupt: 

<Figure size 1500x1000 with 0 Axes>