In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install contractions
!pip install indic-nlp-library

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━

In [None]:
import pandas as pd

# google.colab is only importable inside a Colab runtime
from google.colab import drive
# Mount Google Drive so the dataset under /content/drive/MyDrive is readable
drive.mount('/content/drive')
# Load the idiom dataset; later cells read its 'hindi_idiom' and 'english_idiom' columns
df = pd.read_csv('/content/drive/MyDrive/Copy of Copy of shareddocgog.csv')

Mounted at /content/drive


**Pre-processing**

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import contractions

# Download NLTK resources
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# (3.9+) look it up under 'punkt_tab', while older releases use 'punkt', so both
# are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Set of the ASCII punctuation characters listed in string.punctuation
punctuation_set = {symbol for symbol in string.punctuation}

# Function for expanding contractions
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

# Function for tokenization
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# Function for removing punctuation
def remove_punctuation(tokens):
    """Return ``tokens`` without the tokens that consist solely of punctuation.

    A character counts as punctuation when it is a member of ``punctuation_set``
    (the ASCII ``string.punctuation`` characters) or when Unicode assigns it a
    punctuation general category (``P*``).  Checking the Unicode category is
    what removes marks such as the en dash or the Devanagari danda, and checking
    every character of the token is what removes multi-character tokens such as
    ``...``; an exact-match lookup in the ASCII set misses both.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list holding the remaining tokens in their original order.
        Empty-string tokens are kept.
    """
    import unicodedata  # function-scope import: not part of this cell's import list

    def is_punctuation_char(char):
        return char in punctuation_set or unicodedata.category(char).startswith('P')

    return [
        word for word in tokens
        # `word and ...` keeps empty strings, for which all() would be vacuously True
        if not (word and all(is_punctuation_char(char) for char in word))
    ]

# Walk through every (Hindi, English) idiom pair and print each preprocessing stage.
# The stages are chained in the order they are printed:
# contraction expansion -> tokenization -> lowercasing -> punctuation removal.
for hindi_idiom, english_idiom in zip(df['hindi_idiom'], df['english_idiom']):
    # Expand contractions (applied to the English idiom only)
    english_idiom = expand_contractions(english_idiom)

    # Tokenization
    hindi_tokens = tokenize(hindi_idiom)
    english_tokens = tokenize(english_idiom)

    # Lowercasing: this stage previously printed its heading without doing any work.
    # str.lower() leaves caseless scripts untouched, so it is applied to both
    # token lists for symmetry.
    hindi_tokens_lower = [token.lower() for token in hindi_tokens]
    english_tokens_lower = [token.lower() for token in english_tokens]

    # Punctuation removal (runs on the lowercased tokens)
    hindi_tokens_no_punc = remove_punctuation(hindi_tokens_lower)
    english_tokens_no_punc = remove_punctuation(english_tokens_lower)

    print("Hindi idiom:", hindi_idiom)
    print("English idiom:", english_idiom)
    print("\nTokenization:")
    print("Hindi tokens:", hindi_tokens)
    print("English tokens:", english_tokens)
    print("\nLowercasing:")
    print("Lowercased Hindi tokens:", hindi_tokens_lower)
    print("Lowercased English tokens:", english_tokens_lower)
    print("\nPunctuation Removal:")
    print("Hindi tokens without punctuation:", hindi_tokens_no_punc)
    print("English tokens without punctuation:", english_tokens_no_punc)
    print("\n--------------------------------------------\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Hindi tokens without punctuation: ['चोर', '–', 'चोर', 'मौसेरे', 'भाई']
English tokens without punctuation: ['Birds', 'of', 'same', 'feather', 'flock', 'together']

--------------------------------------------

Hindi idiom: आप भले तो जग भला
English idiom: Good mind, good find

Tokenization:
Hindi tokens: ['आप', 'भले', 'तो', 'जग', 'भला']
English tokens: ['Good', 'mind', ',', 'good', 'find']

Lowercasing:

Punctuation Removal:
Hindi tokens without punctuation: ['आप', 'भले', 'तो', 'जग', 'भला']
English tokens without punctuation: ['Good', 'mind', 'good', 'find']

--------------------------------------------

Hindi idiom: एक हाथ से ताली नहीं बजती
English idiom: It takes two to make a quarrel

Tokenization:
Hindi tokens: ['एक', 'हाथ', 'से', 'ताली', 'नहीं', 'बजती']
English tokens: ['It', 'takes', 'two', 'to', 'make', 'a', 'quarrel']

Lowercasing:

Punctuation Removal:
Hindi tokens without punctuation: ['एक', 'हाथ', 'से', 'ताली', 

**Using Sequence Matcher**

In [None]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from difflib import SequenceMatcher

# Read the CSV file containing Hindi idioms.
# This is the same Drive path the earlier load cell reads, so Drive must already be
# mounted at /content/drive. The code below uses only the 'hindi_idiom' column.
df = pd.read_csv('/content/drive/MyDrive/Copy of Copy of shareddocgog.csv')

# Define a function to preprocess sentences
def preprocess_sentence(sentence):
    """Tokenize ``sentence`` and keep only the word-like tokens.

    A token is kept when every character is a Unicode letter, combining mark or
    number (general categories ``L*``, ``M*``, ``N*``) and at least one of them
    is a letter or number.  Combining marks have to be accepted because
    Devanagari vowel signs and the virama are marks, not letters:
    ``str.isalnum()`` returns False for words such as 'चोर' or 'की', so an
    ``isalnum`` filter throws away most Hindi words.  Every token that
    ``isalnum`` accepts is still accepted.

    Args:
        sentence: Text to tokenize with NLTK's ``word_tokenize``.

    Returns:
        List of the kept tokens in their original order.
    """
    import unicodedata  # function-scope import: not part of this cell's import list

    def is_word(token):
        major_categories = [unicodedata.category(char)[0] for char in token]
        # any() is False for an empty token, so empty strings are dropped
        return (
            any(major in 'LN' for major in major_categories)
            and all(major in 'LMN' for major in major_categories)
        )

    # Tokenization
    tokens = word_tokenize(sentence)

    # Remove punctuation and any other non-word tokens
    return [word for word in tokens if is_word(word)]

# Define a function to compute similarity between two strings
def similarity(a, b):
    """Return the ``difflib.SequenceMatcher`` ratio of ``a`` and ``b``.

    No junk filter is used.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Define a function to detect Hindi idioms in a sentence
def detect_hindi_idioms(sentence, hindi_idioms):
    """Return the idiom whose tokens are most similar to the tokens of ``sentence``.

    The sentence and every idiom are tokenized with ``preprocess_sentence`` and
    scored with ``similarity``.  Tokens are compared as ordered lists:
    ``SequenceMatcher`` indexes into its inputs, which sets do not support, so
    comparing sets raises ``TypeError`` once the sentence has tokens.  Lists
    also keep word order, which is part of what identifies an idiom.

    Args:
        sentence: Input sentence to search.
        hindi_idioms: Iterable of candidate idiom strings.

    Returns:
        The first idiom with the highest similarity, or ``None`` when no idiom
        scores above 0 (including when ``hindi_idioms`` is empty).
    """
    most_matching_idiom = None
    max_similarity = 0

    # Tokenize the input sentence once; it is the same for every idiom
    sentence_tokens = preprocess_sentence(sentence)

    # Iterate through Hindi idioms
    for idiom in hindi_idioms:
        # Tokenize the Hindi idiom
        idiom_tokens = preprocess_sentence(idiom)

        # Calculate similarity between tokens of the sentence and tokens of the idiom
        sim = similarity(sentence_tokens, idiom_tokens)

        # Strict '>' keeps the earliest idiom when several share the top score
        if sim > max_similarity:
            most_matching_idiom = idiom
            max_similarity = sim

    return most_matching_idiom  # Return the most matching idiom

# Example input sentence
# The commented-out assignments that follow are alternative inputs; uncomment one
# of them (and comment this one out) to try a different sentence.
example_input_sentence = "उसने अपने साथी के प्रति संदेह जताने के लिए कभी भी चोर की दाढ़ी में तिनका बना दिया।"
#example_input_sentence = "वह लोगों को मुह में राम बगल में छुरी बताता है, पर असल में उसके कारनामे कभी नहीं सुधरते।"
#example_input_sentence = "वह अपनी गलतियों को छुपाने के लिए उल्टा चोर कोतवाल को डांट रहा है।"
#example_input_sentence = "उसका व्यवहार देखकर लगता है, जैसा देश वैसा भेष है।"
#example_input_sentence = "वह सबका मार्गदर्शन करता है, अंधों में काना राजा बन गया है।"
#example_input_sentence = "उसके पास पैसे होने के कारण, वह हमेशा जिसकी लाठी उसकी भैंस बनता है।"
#example_input_sentence = "उसे उसकी जिम्मेदारी का महत्व नहीं समझाई जा सकती, उसके लिए घर की मुर्गी दाल बराबर है।"


# Track the highest-scoring idiom seen so far together with its score
best_idiom, best_similarity = None, 0

# Score every idiom in the dataset against the raw input sentence
for index, row in df.iterrows():
    hindi_idiom = row['hindi_idiom']  # Hindi idiom stored in this CSV row

    # Character-level similarity between the sentence and this idiom
    sim = similarity(example_input_sentence, hindi_idiom)

    # Only a strictly higher score replaces the current best, so ties keep the earlier idiom
    if sim <= best_similarity:
        continue
    best_idiom, best_similarity = hindi_idiom, sim

# Report the outcome
print("Input Sentence:", example_input_sentence)
if not best_idiom:
    print("Hindi idiom not detected in the sentence.")
else:
    print("Detected Hindi Idiom:", best_idiom)


Input Sentence: उसने अपने साथी के प्रति संदेह जताने के लिए कभी भी चोर की दाढ़ी में तिनका बना दिया।
Detected Hindi Idiom: चोर की दाढ़ी में तिनका
