In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import os

In [1]:
# Switch the process working directory to the project folder. Later cells
# open files through relative "data/..." paths, which resolve against it.
project_dir = r"C:\Users\..."
os.chdir(project_dir)

# Echo the directory now in effect so the change can be verified by eye.
print("Current Directory:", os.getcwd())

Current Directory: C:\Users\31615\Desktop\thesis\triples


In [11]:
# Load relation-to-text mappings.
# The file is parsed as tab-separated with no header row; the two columns are
# named "relation" and "text" here.
relation2text = pd.read_csv(
    "data/FB15k237/relation2text.txt",
    sep="\t",
    header=None,
    names=["relation", "text"],
)

# Vectorize relation descriptions using TF-IDF, capping the vocabulary at
# 100 features.
vectorizer = TfidfVectorizer(max_features=100)
X = vectorizer.fit_transform(relation2text['text'])

# Cluster relations using k-means.
n_clusters = 20  # Adjust the number of clusters
# n_init is pinned explicitly. scikit-learn changed its default from 10 to
# "auto" in version 1.4 (and warns about the upcoming change in 1.2-1.3), so
# without this the number of restarts -- and therefore the cluster ids written
# to disk below -- would depend on the installed version despite the fixed
# random_state.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
relation2text['cluster'] = kmeans.fit_predict(X)

# Rank the vocabulary of every cluster by its weight in the cluster centre
# and keep the heaviest few terms as that cluster's label.
top_n = 5  # Number of top words to extract
terms = np.array(vectorizer.get_feature_names_out())
cluster_centers = kmeans.cluster_centers_


def _top_terms(center):
    """Return the `top_n` terms with the largest weights in one centre row."""
    strongest_first = np.argsort(-center)
    return terms[strongest_first[:top_n]].tolist()


# Keys follow the "cluster_<id>" pattern; values are lists of terms.
cluster_words = {
    f"cluster_{idx}": _top_terms(cluster_centers[idx])
    for idx in range(n_clusters)
}


def _label_for(cluster_id):
    """Join a cluster's representative terms into one comma-separated string."""
    return ", ".join(cluster_words[f"cluster_{cluster_id}"])


# Attach each relation's cluster label as a new column.
relation2text['cluster_words'] = relation2text['cluster'].map(_label_for)

# Persist the four columns of interest, then show the same view.
output_columns = ['relation', 'text', 'cluster', 'cluster_words']
relation2text[output_columns].to_csv("data/FB15k237/simplified_relation2text.csv", index=False)

print(relation2text[output_columns])


                                              relation  \
0    /soccer/football_team/current_roster./soccer/f...   
1                                 /music/artist/origin   
2    /ice_hockey/hockey_team/current_roster./sports...   
3    /food/food/nutrients./food/nutrition_fact/nutr...   
4              /film/actor/film./film/performance/film   
..                                                 ...   
232             /base/biblioness/bibs_location/country   
233  /user/ktrueman/default_domain/international_or...   
234  /music/performance_role/track_performances./mu...   
235  /olympics/olympic_games/medals_awarded./olympi...   
236  /base/saturdaynightlive/snl_cast_member/season...   

                                                  text  cluster  \
0    soccer football team current roster. soccer fo...       18   
1                                  music artist origin       11   
2    ice hockey hockey team current roster. sports ...       18   
3    food food nutrients. food nutr

In [12]:
# Read back the per-relation clustering result from disk.
relation_clusters = pd.read_csv("data/FB15k237/simplified_relation2text.csv")

# Collapse to one row per distinct (cluster, cluster_words) pair and expose
# the cluster id under the clearer name "cluster_number".
cluster_mapping = (
    relation_clusters[['cluster', 'cluster_words']]
    .drop_duplicates()
    .rename(columns={'cluster': 'cluster_number'})
)

# Write the cluster description table without the index column.
cluster_mapping.to_csv("data/FB15k237/clusters_description.csv", index=False)

# Show the resulting table.
print("Cluster Mapping:")
print(cluster_mapping)


Cluster Mapping:
     cluster_number                                      cluster_words
0                18           roster, sports, position, football, team
1                11          music, artist, genre, contribution, track
3                10     government, administrative, base, held, county
4                 1              film, by, performance, release, actor
5                15                  award, honor, awards, for, winner
7                 2          measurement, unit, currency, money, value
8                 8                  people, person, place, of, tenure
11               14                 base, current, event, of, location
14               17  location, statistical, region, relationship, p...
15                0           tv, program, producer, regular, personal
18                5               category, award, of, actor, producer
19               19        user, member, olympic, organization, sports
20                9  education, educational, institution, de

In [9]:
# Read back the per-relation clustering result from disk.
relation_clusters = pd.read_csv("data/FB15k237/simplified_relation2text.csv")

# Keep one row per distinct (cluster, cluster_words) pair, then relabel the
# two columns as "cluster_number" and "cluster_word".
cluster_mapping = (
    relation_clusters[['cluster', 'cluster_words']]
    .drop_duplicates()
    .rename(columns={'cluster': 'cluster_number', 'cluster_words': 'cluster_word'})
)

# Registry of every value that ensure_unique has already let through.
used_words = set()
def ensure_unique(word):
    """Return `word` on its first occurrence and None on any repeat.

    The comparison is made on the value exactly as passed in; it is not
    split into individual comma-separated words. Accepted values are
    recorded in the module-level `used_words` set, so results depend on
    every earlier call made since that set was last reset.
    """
    if word not in used_words:
        used_words.add(word)
        return word
    return None  # Repeat occurrence: signal the caller to skip it

# Run every label through ensure_unique (repeats come back as None), then
# discard the rows left with a missing value.
deduplicated_labels = cluster_mapping['cluster_word'].map(ensure_unique)
cluster_mapping = cluster_mapping.assign(cluster_word=deduplicated_labels).dropna()

# Write the filtered table without the index column.
cluster_mapping.to_csv("data/FB15k237/unique_clusters_description.csv", index=False)

# Show the resulting table.
print("Unique Cluster Mapping:")
print(cluster_mapping)


Unique Cluster Mapping:
     cluster_number                         cluster_word
0                 5             roster, sports, position
1                29                 artist, music, track
3                20          county, celebrities, travel
4                 1                film, by, performance
5                10               award, category, honor
6                21        government, held, legislative
7                 2          measurement, unit, currency
8                25                people, person, place
10               24       administrative, division, area
11               18                   base, location, of
14                7        location, statistical, region
15                0                   tv, program, genre
19               19                user, member, olympic
20                4           education, students, field
21               26               business, value, money
25                6             business, tenure, people
27     