In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import spacy
import nltk
from nltk.corpus import stopwords

In [3]:
url = "https://www.ef.edu/english-resources/english-idioms/"

In [4]:
response = requests.get(url)

In [5]:
soup = BeautifulSoup(response.content, 'html.parser')

In [6]:
tables = soup.find_all('table')

In [7]:
all_dfs = []

# Loop through all tables and convert them to pandas DataFrames
for table in tables:
    # Convert the table to a string and wrap it with StringIO
    table_html = StringIO(str(table))
    df = pd.read_html(table_html)[0]
    all_dfs.append(df)

# Concatenate all DataFrames
if all_dfs:
    result = pd.concat(all_dfs, ignore_index=True)
    print("Concatenated table:")
    print(result)
else:
    print("No tables found on the page.")

Concatenated table:
                                                 Idiom  \
0                               A blessing in disguise   
1                                       A dime a dozen   
2                                 Beat around the bush   
3                               Better late than never   
4                                      Bite the bullet   
..                                                 ...   
150                            Well begun is half done   
151                             When it rains it pours   
152  You can catch more flies with honey than you c...   
153  You can lead a horse to water, but you can't m...   
154  You can't make an omelet without breaking some...   

                                               Meaning                  Usage  
0                a good thing that seemed bad at first  as part of a sentence  
1                                     Something common  as part of a sentence  
2    Avoid saying what you mean, usually be

In [8]:
en_idioms = result.iloc[:, :-1]

In [9]:
en_idioms['Transliteration'] = en_idioms['Idiom']

In [10]:
en_idioms

Unnamed: 0,Idiom,Meaning,Transliteration
0,A blessing in disguise,a good thing that seemed bad at first,A blessing in disguise
1,A dime a dozen,Something common,A dime a dozen
2,Beat around the bush,"Avoid saying what you mean, usually because it...",Beat around the bush
3,Better late than never,Better to arrive late than not to come at all,Better late than never
4,Bite the bullet,To get something over with because it is inevi...,Bite the bullet
...,...,...,...
150,Well begun is half done,Getting a good start is important,Well begun is half done
151,When it rains it pours,Everything is going wrong at once,When it rains it pours
152,You can catch more flies with honey than you c...,You'll get what you want by being nice,You can catch more flies with honey than you c...
153,"You can lead a horse to water, but you can't m...",You can't force someone to make the right deci...,"You can lead a horse to water, but you can't m..."


In [31]:
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ridhibandaru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
nltk_stopwords = set(stopwords.words('english'))
combined_stopwords = spacy_stopwords.union(nltk_stopwords)

# Words to keep even if they're in the stop word list
important_words = {'not', 'no', 'very', 'too', 'only', 'just'}

In [33]:
def remove_stopwords(text):
    if not isinstance(text, str):
        return text
    
    # Process the text with spaCy
    doc = nlp(text.lower())
    
    # Filter out stop words, but keep important words and punctuation
    filtered_words = [token.text for token in doc if token.text not in combined_stopwords 
                      or token.text in important_words or token.is_punct]
    
    # Rejoin the words
    return ' '.join(filtered_words)

In [34]:
en_idioms['MeaningStripped'] = en_idioms['Meaning'].apply(remove_stopwords)

In [35]:
en_idioms

Unnamed: 0,Idiom,Meaning,Transliteration,MeaningStripped
0,A blessing in disguise,a good thing that seemed bad at first,A blessing in disguise,good thing bad
1,A dime a dozen,Something common,A dime a dozen,common
2,Beat around the bush,"Avoid saying what you mean, usually because it...",Beat around the bush,"avoid saying mean , usually uncomfortable"
3,Better late than never,Better to arrive late than not to come at all,Better late than never,better arrive late not come
4,Bite the bullet,To get something over with because it is inevi...,Bite the bullet,inevitable
...,...,...,...,...
150,Well begun is half done,Getting a good start is important,Well begun is half done,getting good start important
151,When it rains it pours,Everything is going wrong at once,When it rains it pours,going wrong
152,You can catch more flies with honey than you c...,You'll get what you want by being nice,You can catch more flies with honey than you c...,want nice
153,"You can lead a horse to water, but you can't m...",You can't force someone to make the right deci...,"You can lead a horse to water, but you can't m...",force right decision


In [24]:
from bertopic import BERTopic

In [54]:
docs = en_idioms['Meaning'].tolist()

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=10)

topics, probs = topic_model.fit_transform(docs)

# Get the top topics
print(topic_model.get_topic_info())

# Get the topics for specific documents
print(topic_model.get_document_info(docs))

   Topic  Count                                  Name  \
0     -1     84           -1_good_just_problem_better   
1      0     14                 0_dont_hes_late_youre   
2      1     41              1_things_good_work_doing   
3      2     16  2_people_negative_understand_usually   

                                      Representation  \
0  [good, just, problem, better, dont, secret, st...   
1  [dont, hes, late, youre, thinking, hes dumb, d...   
2  [things, good, work, doing, money, things goin...   
3  [people, negative, understand, usually, proble...   

                                 Representative_Docs  
0  [The big issue, the problem people are avoidin...  
1  [It's too late, It's too late, The son is like...  
2  [Things are going from bad to worse, Things ar...  
3  [Understand the situation (usually negative), ...  
                                              Document  Topic  \
0                a good thing that seemed bad at first      1   
1                           

In [45]:
document_info = topic_model.get_document_info(docs)

# Add the topic assignments to your DataFrame
en_idioms['Topic'] = document_info['Topic']

In [39]:
en_idioms

Unnamed: 0,Idiom,Meaning,Transliteration,MeaningStripped,Topic
0,A blessing in disguise,a good thing that seemed bad at first,A blessing in disguise,good thing bad,0
1,A dime a dozen,Something common,A dime a dozen,common,-1
2,Beat around the bush,"Avoid saying what you mean, usually because it...",Beat around the bush,"avoid saying mean , usually uncomfortable",0
3,Better late than never,Better to arrive late than not to come at all,Better late than never,better arrive late not come,-1
4,Bite the bullet,To get something over with because it is inevi...,Bite the bullet,inevitable,-1
...,...,...,...,...,...
150,Well begun is half done,Getting a good start is important,Well begun is half done,getting good start important,-1
151,When it rains it pours,Everything is going wrong at once,When it rains it pours,going wrong,-1
152,You can catch more flies with honey than you c...,You'll get what you want by being nice,You can catch more flies with honey than you c...,want nice,-1
153,"You can lead a horse to water, but you can't m...",You can't force someone to make the right deci...,"You can lead a horse to water, but you can't m...",force right decision,-1


In [2]:
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the T5 model and tokenizer
model_name = "t5-base"  # You can use "t5-base" or "t5-large" for better results, but they require more memory
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Load your DataFrame
df = pd.read_csv('idioms_en.csv')

# The prompt you want to use
prompt = "Find the equivalent of this idiom in Hindi: "

def generate_response(input_text):
    input_text = prompt + input_text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    outputs = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=0.7)
    
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Apply the function to create a new column
df['T5_Response'] = df['Idiom'].apply(generate_response)

# Save the updated DataFrame
df.to_csv('output_with_t5_responses.csv', index=False)

print(df[['Idiom', 'T5_Response']].head())

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



                    Idiom                                        T5_Response
0  A blessing in disguise  in Hindi: in Hindi: A blessing in disguise. in...
1          A dime a dozen  in Hindi: A dime a dozen. in Hindi: A dime a d...
2    Beat around the bush  in Hindi: in Hindi: in Hindi: in Hindi: in Hin...
3  Better late than never  in Hindi: in Hindi: in Hindi: in Hindi:: Bette...
4         Bite the bullet  in Hindi: in Hindi: in Hindi: in Hindi: Bite t...


In [5]:
import pandas as pd
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (GPU) device")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA (GPU) device")
else:
    device = torch.device("cpu")
    print("GPU not found, using CPU")

# Load the model and tokenizer
model_name = "facebook/m2m100_418M"  # You can also use "facebook/m2m100_1.2B" for better results
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

# Load your DataFrame
df = pd.read_csv('idioms_en.csv')

def translate_text(text, src_lang="en", tgt_lang="hi"):
    # Set the source language
    tokenizer.src_lang = src_lang

    # Tokenize the input text
    encoded_input = tokenizer(text, return_tensors="pt").to(device)

    # Generate translation
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.get_lang_id(tgt_lang)
    )

    # Decode the generated tokens to text
    translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    
    return translation[0]

# Apply the translation function to create a new column
df['Translated'] = df['Idiom'].apply(lambda x: translate_text(x, src_lang="en", tgt_lang="hi"))

# Save the updated DataFrame
df.to_csv('output_with_translations.csv', index=False)

print(df[['Idiom', 'Translated']].head())

Using MPS (GPU) device
                    Idiom             Translated
0  A blessing in disguise  आशीर्वाद में आशीर्वाद
1          A dime a dozen          दसवीं शताब्दी
2    Beat around the bush   बश के चारों ओर घूमना
3  Better late than never  कभी नहीं देर से बेहतर
4         Bite the bullet          गेंद को मारना


In [7]:
from transformers import MarianMTModel, MarianTokenizer

model_name = f"Helsinki-NLP/opus-mt-en-hi"  # Change language pair as needed
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True)
    outputs = model.generate(**inputs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test
idiom = "It's raining cats and dogs"
print(translate(idiom))

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

यह डाकुओं और कुत्तों की बारिश है


In [10]:
from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
from ai4bharat.transliteration import XlitEngine

# Initialize the transliteration engine
e = XlitEngine("hi")  # 'hi' for Hindi, change as needed

def translate_indic(text, src_lang, tgt_lang):
    # This is a simplified example. You'd need to implement the actual translation logic
    # using the IndicTrans model or API
    translated = some_indic_trans_function(text, src_lang, tgt_lang)
    return translated

# Example usage
idiom = "It's raining cats and dogs"
print(translate_indic(idiom, 'en', 'hi'))

Downloading Multilingual model for transliteration


MB[38;2;0;255;0m100%[39m [38;2;0;255;0m(121.0 of 121.0)[39m |################| Elapsed Time: 0:00:37 Time:  0:00:370003


Succefully Downloaded to: /opt/anaconda3/lib/python3.11/site-packages/ai4bharat/transliteration/transformer/models/en2indic/v1.0/model.zip
Models downloaded to: /opt/anaconda3/lib/python3.11/site-packages/ai4bharat/transliteration/transformer/models/en2indic/v1.0
NOTE: When uninstalling this library, REMEMBER to delete the models manually
Downloading language model probablitites dictionaries for rescoring module


MB[38;2;0;255;0m100%[39m [38;2;0;255;0m(812.0 of 812.0)[39m |################| Elapsed Time: 0:04:28 Time:  0:04:280222


Succefully Downloaded to: /opt/anaconda3/lib/python3.11/site-packages/ai4bharat/transliteration/transformer/models/en2indic/v1.0/dicts.zip
Initializing Multilingual model for transliteration


ValueError: mutable default <class 'fairseq.dataclass.configs.CommonConfig'> for field common is not allowed: use default_factory

In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load a pre-trained model (you'd need to fine-tune this on your data)
model_name = "facebook/bart-large"  # or another suitable model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def context_aware_translate(text, idiom, context_window=50):
    # Find the idiom in the text
    idiom_start = text.index(idiom)
    idiom_end = idiom_start + len(idiom)
    
    # Get context before and after the idiom
    context_start = max(0, idiom_start - context_window)
    context_end = min(len(text), idiom_end + context_window)
    
    # Create input with context and special tokens to highlight the idiom
    input_text = (
        f"{text[context_start:idiom_start]}"
        f"<idiom>{idiom}</idiom>"
        f"{text[idiom_end:context_end]}"
    )
    
    # Tokenize and generate translation
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=150)
    
    # Decode and return the translation
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

# Example usage
full_text = "The project was difficult, but John decided to bite the bullet and start working on it immediately."
idiom = "bite the bullet"

translated = context_aware_translate(full_text, idiom)
print(f"Original: {full_text}")
print(f"Translated: {translated}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Original: The project was difficult, but John decided to bite the bullet and start working on it immediately.
Translated: The project was difficult, but John decided to <idiom>bite the bullet</idiom>, and start working on it immediately.


In [3]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments

# Prepare your dataset
data = [
    {"input": "The project was difficult, but John decided to <idiom>bite the bullet</idiom> and start working on it immediately.",
     "target": "प्रोजेक्ट कठिन था, लेकिन जॉन ने हिम्मत करके तुरंत काम शुरू करने का फैसला किया।"},
    # Add more examples...
]
df = pd.DataFrame(data)

# Load a pre-trained model and tokenizer
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up tokenizer for English input and Hindi output
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "hi_IN"

# Tokenization function
def preprocess_function(examples):
    inputs = tokenizer(examples["input"].tolist(), max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(examples["target"].tolist(), max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

# Tokenize the dataset
tokenized_dataset = preprocess_function(df)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

KeyError: 0

In [8]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load your data
df = pd.read_csv('idioms_en.csv')

# Load a pre-trained sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the meanings
embeddings = model.encode(df['Meaning'].tolist())

# Determine optimal number of clusters using silhouette score
silhouette_scores = []
K = range(2, 20)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(embeddings)
    score = silhouette_score(embeddings, kmeans.labels_)
    silhouette_scores.append(score)

optimal_k = K[np.argmax(silhouette_scores)]

# Perform clustering with optimal K
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['cluster'] = kmeans.fit_predict(embeddings)

# Function to find similar idioms
def find_similar_idioms(idiom_index, top_n=2):
    cluster = df.loc[idiom_index, 'cluster']
    cluster_idioms = df[df['cluster'] == cluster].index
    idiom_embedding = embeddings[idiom_index]
    similarities = [np.dot(idiom_embedding, embeddings[i]) / (np.linalg.norm(idiom_embedding) * np.linalg.norm(embeddings[i])) for i in cluster_idioms]
    top_indices = np.argsort(similarities)[::-1][1:top_n+1]
    return df.iloc[cluster_idioms[top_indices]]

# Example usage
example_idiom_index = 0
similar_idioms = find_similar_idioms(example_idiom_index)
print(f"Idiom: {df.iloc[example_idiom_index]['Idiom']}")
print("Similar idioms:")
print(similar_idioms[['Idiom', 'Meaning']])

# Print cluster statistics
print("\nCluster statistics:")
print(df['cluster'].value_counts())



Idiom: A blessing in disguise
Similar idioms:
                    Idiom                       Meaning
40        A perfect storm  the worst possible situation
31  To make matters worse          Make a problem worse

Cluster statistics:
cluster
17    21
1     21
0     20
13    19
11    12
16    11
8      9
2      6
3      6
12     4
18     3
5      3
15     3
6      3
9      3
10     3
7      3
14     3
4      2
Name: count, dtype: int64


In [14]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load your data
df = pd.read_csv('idioms_en.csv')

# Load a pre-trained sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the meanings
embeddings = model.encode(df['Meaning'].tolist())

# Function to find idioms with very similar meanings
def find_similar_idioms(idiom_index, top_n=5, similarity_threshold=0.6):
    idiom_embedding = embeddings[idiom_index].reshape(1, -1)
    similarities = cosine_similarity(idiom_embedding, embeddings)[0]
    
    # Get indices of idioms above the similarity threshold, excluding the input idiom
    similar_indices = np.where((similarities > similarity_threshold) & (np.arange(len(similarities)) != idiom_index))[0]
    
    # Sort by similarity (highest first)
    similar_indices = similar_indices[np.argsort(similarities[similar_indices])[::-1]]
    
    # Take top N results
    top_indices = similar_indices[:top_n]
    
    return df.iloc[top_indices], similarities[top_indices]

# Example usage
example_idiom_index = 13
similar_idioms, similarity_scores = find_similar_idioms(example_idiom_index)

print(f"Original Idiom: {df.iloc[example_idiom_index]['Idiom']}")
print(f"Meaning: {df.iloc[example_idiom_index]['Meaning']}")
print("\nSimilar idioms:")
for (_, row), score in zip(similar_idioms.iterrows(), similarity_scores):
    print(f"\nIdiom: {row['Idiom']}")
    print(f"Meaning: {row['Meaning']}")
    print(f"Similarity score: {score:.2f}")



Original Idiom: Give someone the benefit of the doubt
Meaning: Trust what someone says

Similar idioms:


In [15]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load a pre-trained sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for the phrases
phrase1 = "busy"
phrase2 = "no free time"

embedding1 = model.encode([phrase1])
embedding2 = model.encode([phrase2])

# Calculate cosine similarity
similarity = cosine_similarity(embedding1, embedding2)[0][0]

print(f"Similarity between '{phrase1}' and '{phrase2}': {similarity:.4f}")



Similarity between 'busy' and 'no free time': 0.2436


In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import wordnet
import nltk

nltk.download('wordnet')

def get_expanded_phrases(phrase):
    synonyms = set()
    for syn in wordnet.synsets(phrase):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' '))
    return list(synonyms)

def get_similarity(phrase1, phrase2, model):
    # Get expanded phrases
    expanded1 = get_expanded_phrases(phrase1) + [phrase1]
    expanded2 = get_expanded_phrases(phrase2) + [phrase2]
    
    # Generate embeddings
    embeddings1 = model.encode(expanded1)
    embeddings2 = model.encode(expanded2)
    
    # Calculate max similarity
    similarities = cosine_similarity(embeddings1, embeddings2)
    return np.max(similarities)

# Load model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Define phrases
phrase1 = "busy"
phrase2 = "no free time"

# Calculate similarity
similarity = get_similarity(phrase1, phrase2, model)

print(f"Similarity between '{phrase1}' and '{phrase2}': {similarity:.4f}")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ridhibandaru/nltk_data...


Similarity between 'busy' and 'no free time': 0.2815


In [17]:
import requests
from conceptnet5.language.english import english_filter
import numpy as np

def get_conceptnet_related(concept, limit=10):
    url = f"http://api.conceptnet.io/c/en/{concept}?limit={limit}"
    response = json.loads(requests.get(url).text)
    related = []
    for edge in response['edges']:
        if edge['start']['language'] == 'en' and edge['end']['language'] == 'en':
            if edge['start']['label'] == concept:
                related.append(edge['end']['label'])
            else:
                related.append(edge['start']['label'])
    return list(set(related))

def conceptnet_similarity(phrase1, phrase2):
    related1 = set(get_conceptnet_related(phrase1))
    related2 = set(get_conceptnet_related(phrase2))
    
    # Add the original phrases to their related sets
    related1.add(phrase1)
    related2.add(phrase2)
    
    # Calculate Jaccard similarity
    intersection = len(related1.intersection(related2))
    union = len(related1.union(related2))
    
    return intersection / union if union > 0 else 0

# Example usage
similarity = conceptnet_similarity("busy", "no free time")
print(f"Similarity between 'busy' and 'no free time': {similarity:.4f}")

Similarity between 'busy' and 'no free time': 0.0000


In [18]:
import requests
import json
from collections import Counter

def get_conceptnet_related(concept, limit=20):
    url = f"http://api.conceptnet.io/c/en/{concept.replace(' ', '_')}?limit={limit}"
    response = json.loads(requests.get(url).text)
    related = []
    for edge in response['edges']:
        if edge['start']['language'] == 'en':
            related.append(edge['start']['label'])
        if edge['end']['language'] == 'en':
            related.append(edge['end']['label'])
    return list(set(related))

def get_expanded_concepts(phrase):
    words = phrase.split()
    concepts = set()
    for word in words:
        concepts.update(get_conceptnet_related(word))
    concepts.update(get_conceptnet_related(phrase))
    return list(concepts)

def conceptnet_similarity(phrase1, phrase2):
    concepts1 = get_expanded_concepts(phrase1)
    concepts2 = get_expanded_concepts(phrase2)
    
    # Add original phrases and their words
    concepts1.extend([phrase1] + phrase1.split())
    concepts2.extend([phrase2] + phrase2.split())
    
    # Count occurrences
    counter1 = Counter(concepts1)
    counter2 = Counter(concepts2)
    
    # Calculate cosine similarity
    common = set(counter1.keys()) & set(counter2.keys())
    numerator = sum(counter1[x] * counter2[x] for x in common)
    sum1 = sum(counter1[x]**2 for x in counter1.keys())
    sum2 = sum(counter2[x]**2 for x in counter2.keys())
    denominator = (sum1 * sum2)**0.5
    
    if denominator == 0:
        return 0
    return numerator / denominator

# Test the similarity
similarity = conceptnet_similarity("busy", "no free time")
print(f"Similarity between 'busy' and 'no free time': {similarity:.4f}")

# Let's also print out the related concepts to see what ConceptNet is giving us
print("\nConcepts related to 'busy':")
print(get_expanded_concepts("busy")[:10])  # Showing first 10 for brevity

print("\nConcepts related to 'no free time':")
print(get_expanded_concepts("no free time")[:10])  # Showing first 10 for brevity

Similarity between 'busy' and 'no free time': 0.0000

Concepts related to 'busy':
['crowd', 'diligent', 'labouring', 'busy', 'dabble']

Concepts related to 'no free time':
['hour', 'nary', 'none', 'hours', 'negative', 'uncommitted', 'time', 'unbound', 'clocks', 'No']


In [19]:
import numpy as np
from sentence_transformers import SentenceTransformer
from conceptnet5.language.english import english_filter
import requests
import json

# Custom semantic mapping
custom_semantic_map = {
    "busy": ["occupied", "engaged", "swamped", "no free time", "hectic", "tied up"],
    "no free time": ["busy", "occupied", "swamped", "hectic", "tied up"],
    # Add more mappings as needed
}

# Load a pre-trained sentence transformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_conceptnet_related(concept, limit=10):
    url = f"http://api.conceptnet.io/c/en/{concept.replace(' ', '_')}?limit={limit}"
    response = json.loads(requests.get(url).text)
    related = []
    for edge in response['edges']:
        if edge['start']['language'] == 'en':
            related.append(edge['start']['label'])
        if edge['end']['language'] == 'en':
            related.append(edge['end']['label'])
    return list(set(related))

def custom_similarity(phrase1, phrase2):
    if phrase2 in custom_semantic_map.get(phrase1, []) or phrase1 in custom_semantic_map.get(phrase2, []):
        return 1.0
    return 0.0

def conceptnet_similarity(phrase1, phrase2):
    related1 = set(get_conceptnet_related(phrase1))
    related2 = set(get_conceptnet_related(phrase2))
    
    # Add the original phrases to their related sets
    related1.add(phrase1)
    related2.add(phrase2)
    
    # Calculate Jaccard similarity
    intersection = len(related1.intersection(related2))
    union = len(related1.union(related2))
    
    return intersection / union if union > 0 else 0

def embedding_similarity(phrase1, phrase2):
    emb1 = model.encode([phrase1])[0]
    emb2 = model.encode([phrase2])[0]
    return np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))

def hybrid_similarity(phrase1, phrase2):
    custom_sim = custom_similarity(phrase1, phrase2)
    if custom_sim > 0:
        return custom_sim
    
    cn_sim = conceptnet_similarity(phrase1, phrase2)
    emb_sim = embedding_similarity(phrase1, phrase2)
    
    return max(cn_sim, emb_sim)

# Test the hybrid similarity
similarity = hybrid_similarity("busy", "no free time")
print(f"Hybrid similarity between 'busy' and 'no free time': {similarity:.4f}")

# Function to cluster idioms
def cluster_idioms(idioms, meanings, similarity_threshold=0.7):
    clusters = {}
    
    for i, (idiom, meaning) in enumerate(zip(idioms, meanings)):
        added_to_cluster = False
        
        for cluster_key in clusters:
            if hybrid_similarity(meaning, cluster_key) > similarity_threshold:
                clusters[cluster_key].append((idiom, meaning))
                added_to_cluster = True
                break
        
        if not added_to_cluster:
            clusters[meaning] = [(idiom, meaning)]
    
    return clusters

# Example usage
idioms = [
    "It's raining cats and dogs",
    "Busy as a bee",
    "No time to breathe",
    "Time flies",
    "In hot water"
]
meanings = [
    "raining heavily",
    "very busy",
    "no free time",
    "time passes quickly",
    "in trouble"
]

idiom_clusters = cluster_idioms(idioms, meanings)

for cluster_key, cluster_items in idiom_clusters.items():
    print(f"\nCluster: {cluster_key}")
    for idiom, meaning in cluster_items:
        print(f"  - {idiom} ({meaning})")



Hybrid similarity between 'busy' and 'no free time': 1.0000

Cluster: raining heavily
  - It's raining cats and dogs (raining heavily)

Cluster: very busy
  - Busy as a bee (very busy)

Cluster: no free time
  - No time to breathe (no free time)

Cluster: time passes quickly
  - Time flies (time passes quickly)

Cluster: in trouble
  - In hot water (in trouble)


In [20]:
print(hybrid_similarity("very busy", "no free time"))
print(hybrid_similarity("time passes quickly", "no free time"))

0.21891384
0.38433766
