/kaggle/input/midddd2/mid.csv


In [2]:
!pip install pyiwn
!pip install googletrans==4.0.0-rc1
!pip install datasets



In [3]:
from enum import Enum, unique
import re
import logging
import glob
import ntpath
import os

import pandas as pd

import pyiwn.constants as constants


logging.basicConfig(format='[%(filename)s:%(lineno)d] %(message)s',
    datefmt='%Y-%m-%d:%H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)


@unique
class Language(Enum):
    ASSAMESE = 'assamese'
    BENGALI = 'bengali'
    BODO = 'bodo'
    GUJARATI = 'gujarati'
    HINDI = 'hindi'
    KANNADA = 'kannada'
    KASHMIRI = 'kashmiri'
    KONKANI = 'konkani'
    MALAYALAM = 'malayalam'
    MARATHI = 'marathi'
    MEITEI = 'meitei'
    NEPALI = 'nepali'
    ORIYA = 'oriya'
    PUNJABI = 'punjabi'
    SANSKRIT = 'sanskrit'
    TAMIL = 'tamil'
    TELUGU = 'telugu'
    URDU = 'urdu'


class IndoWordNet:
    def __init__(self, lang=Language.HINDI):
        logger.info(f'Loading {lang.value} language synsets...')
        self._synset_idx_map = {}
        self._synset_df = self._load_synset_file(lang.value)
        self._synset_relations_dict = self._load_synset_relations()

    def _load_synset_file(self, lang):
        filename = os.path.join(*[constants.IWN_DATA_PATH, 'synsets', 'all.{}'.format(lang)])
        f = open(filename, encoding='utf-8')
        synsets = list(map(lambda line: self._load_synset(line), f.readlines()))
        synset_df = pd.DataFrame(synsets, columns=['synset_id', 'synsets', 'pos'])
        synset_df = synset_df.dropna()
        synset_df = synset_df.set_index('synset_id')
        return synset_df

    def _load_synset_relations(self):
        relations_dict = {}
        for file_path, relation_name in self._relation_list():
            relations_dict[relation_name] = []
            d = {}
            for line in open(file_path):
                line_parts = line.split('\t')
                synset_id, synset_ids = line_parts
                synset_id = int(synset_id)
                synset_ids = list(map(int, synset_ids.split(',')))
                synset_ids = list(filter(lambda x: True if x in self._synset_df.index else False, synset_ids))
                if synset_id in d:
                    d[synset_id].extend(synset_ids)
                else:
                    if synset_ids:
                        d[synset_id] = synset_ids
            relations_dict[relation_name] = d
        return relations_dict

    def _relation_list(self, type='synset_relations'):
        relations = []
        path_parts = '{},{},*'.format(constants.IWN_DATA_PATH, type).split(',')
        for file_path in glob.glob(os.path.join(*path_parts)):
            file_name = ntpath.basename(file_path)
            file_name_parts = file_name.split('.')
            if len(file_name_parts) != 2:
                continue
            relation_name, pos_tag = file_name_parts
            relations.append((file_path, relation_name))
        return relations

    def _update_synset_idx_map(self, synset):
        synset_id = synset.synset_id()
        for word in synset.lemma_names():
            if word in self._synset_idx_map:
                self._synset_idx_map[word].append(synset_id)
            else:
                self._synset_idx_map[word] = [synset_id]
        return True

    def _load_synset(self, synset_string):
        if 'null' in synset_string:
            return None, None, None

        synset_string = synset_string.replace('\n', '').strip()
        synset_pattern = '([0-9]+)\t(.+)\t(.+)\t([a-zA-Z]+)'
        try:
            matches = re.findall(synset_pattern, synset_string)
            synset_id, synset_words, gloss_examples, pos = matches[0]
        except Exception as e:
            return None, None, None

        synset_id = int(synset_id)
        synset_words = list(filter(lambda x: False if x == '' else True, synset_words.split(',')))
        if not synset_words:
            return None, None, None
        head_word = synset_words[0]
        if gloss_examples != '':
            if ':"' in gloss_examples:
                ge_list = gloss_examples.split(':')
                gloss = ge_list[0]
                if len(ge_list) > 1:
                    examples = ''.join(ge_list[1:])
                    examples = re.sub('["]', '', examples)
                    examples = examples.split('  /  ')
                else:
                    examples = []
            else:
                gloss = gloss_examples
                examples = []
        else:
            return None, None, None
        synset = Synset(synset_id, head_word, synset_words, pos, gloss, examples)

        self._update_synset_idx_map(synset)

        return synset_id, synset, pos

    def all_synsets(self, pos=None):
        if pos is None:
            result = self._synset_df
        else:
            mask = (self._synset_df.pos == pos.value)
            result = self._synset_df[mask]
        return list(result['synsets'].values)

    def synsets(self, word, pos=None):
        synset_id_list = self._synset_idx_map[word]

        synsets = []
        if pos is not None:
            for synset_id in synset_id_list:
                synset = self._synset_df.loc[[synset_id]]['synsets'].values[0]
                if synset.pos() == pos:
                    synsets.append(synset)
        else:
            for synset_id in synset_id_list:
                synset = self._synset_df.loc[[synset_id]]['synsets'].values[0]
                synsets.append(synset)

        return synsets

    def all_words(self, pos=None):
        if pos is None:
            return list(self._synset_idx_map.keys())

        words = set()
        mask = (self._synset_df.pos == pos.value)
        for synset in self._synset_df[mask]['synsets'].values:
            for word in synset.lemma_names():
                words.add(word)
        words = list(words)
        return words

    def synset_relation(self, synset, relation):
        return list(self._synset_df[self._synset_df.index.isin(self._synset_relations_dict[relation.value].get(synset.synset_id(), []))]['synsets'])


class Synset:
    def __init__(self, synset_id, head_word, lemma_names, pos, gloss, examples):
        self._synset_id = synset_id
        self._head_word = head_word
        self._lemma_names = lemma_names
        self._pos = pos
        self._gloss = gloss
        self._examples = examples

    def __repr__(self):
        return 'Synset(\'{}.{}.{}\')'.format(self._head_word, self._pos, self._synset_id)

    def synset_id(self):
        return self._synset_id

    def head_word(self):
        return self._head_word

    def lemma_names(self):
        return self._lemma_names

    def lemmas(self):
        return [Lemma(self, lemma) for lemma in self._lemma_names]

    def pos(self):
        return self._pos  

    def gloss(self):
        return self._gloss

    def examples(self):
        return self._examples

    def ontology_nodes(self):
        raise NotImplementedError("This method will be implemented soon.")


class Lemma:
    def __init__(self, synset, name):
        self._synset = synset
        self._name = name

    def __repr__(self):
        return 'Lemma(\'{}.{}.{}.{}\')'.format(self._synset.head_word(), self._synset.pos(), self._synset.synset_id(), self._name)

    def name(self):
        return self._name

    def synset(self):
        return self._synset

    def gradation(self):
        raise NotImplementedError("This method will be implemented soon.")

    def antonym(self):
        raise NotImplementedError("This method will be implemented soon.")


@unique
class PosTag(Enum):
    NOUN = 'noun'
    VERB = 'verb'
    ADVERB = 'adverb'
    ADJECTIVE = 'adjective'


class IndoWordNetError(Exception):
    """ An exception class for IndoWordNet-related errors. """


@unique
class SynsetRelations(Enum):
    MERO_MEMBER_COLLECTION = 'mero_member_collection'
    ABILITY_VERB = 'ability_verb'
    CAUSATIVE = 'causative'
    CAPABILITY_VERB = 'capability_verb'
    MERO_COMPONENT_OBJECT = 'mero_component_object'
    HOLO_PORTION_MASS = 'holo_portion_mass'
    FUNCTION_VERB = 'function_verb'
    HOLO_COMPONENT_OBJECT = 'holo_component_object'
    HYPERNYMY = 'hypernymy'
    ENTAILMENT = 'entailment'
    ALSO_SEE = 'also_see'
    MERO_FEATURE_ACTIVITY = 'mero_feature_activity'
    HOLO_PLACE_AREA = 'holo_place_area'
    MODIFIES_VERB = 'modifies_verb'
    ATTRIBUTES = 'attributes'
    MERO_PORTION_MASS = 'mero_portion_mass'
    MODIFIES_NOUN = 'modifies_noun'
    HOLO_FEATURE_ACTIVITY = 'holo_feature_activity'
    MERO_STUFF_OBJECT = 'mero_stuff_object'
    TROPONYMY = 'troponymy'
    MERO_PLACE_AREA = 'mero_place_area'
    HOLO_MEMBER_COLLECTION = 'holo_member_collection'
    HYPONYMY = 'hyponymy'
    SIMILAR = 'similar'
    MERO_POSITION_AREA = 'mero_position_area'
    HOLO_POSITION_AREA = 'holo_position_area'
    HOLO_STUFF_OBJECT = 'holo_stuff_object'


In [6]:
!pip install indic-nlp-library


Collecting indic-nlp-library


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
                                              0.0/40.3 kB ? eta -:--:--
     -------------------------------------- 40.3/40.3 kB 968.4 kB/s eta 0:00:00
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.4.0-py3-none-any.whl (12 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-2.0.0-py2.py3-none-any.whl (2.8 MB)
                                              0.0/2.8 MB ? eta -:--:--
     --                                       0.2/2.8 MB 5.0 MB/s eta 0:00:01
     -------                                  0.5/2.8 MB 6.3 MB/s eta 0:00:01
     -----------                              0.8/2.8 MB 6.6 MB/s eta 0:00:01
     ------------------                       1.3/2.8 MB 7.4 MB/s eta 0:00:01
     ---------------------                    1.5/2.8 MB 6.9 MB/s eta 0:00:01
     ---------------------------              1.9/2.8 MB 7.3 MB/s eta 0:00:01
     --------

In [2]:
import pandas as pd
df=pd.read_csv('mid.csv')
df.head()

Unnamed: 0,0,1,2,4,3
0,Woman in a black dress walking on the street.,A woman watches a dog jump down the stairs.,0.8,['एक महिला एक कुत्ता को सीढ़ियों से नीचे उछलने...,['सड़क पर घूमते हुए काले कपड़े पहने हुए एक महि...
1,A man and a woman looking at the camera.,A man and a woman laughing.,2.33,['एक आदमी और एक औरत हंस रहे हैं।'],['एक पुरुष और एक महिला कैमरा देख रही हैं।']
2,a brown dog running through the dirty muddy grass,The large brown dog is running outside in the ...,3.6,['बड़े भूरे कुत्ते बाहर रेत में दौड़ रहे हैं।'],['एक भूरा कुत्ता गंदगी में दौड़ता है']
3,Three dogs pulling a man on a bicycle through ...,"The dogs are pulling a man on a type of bike, ...",5.0,['कुत्ते बर्फ के बीच एक आदमी को एक प्रकार के स...,['तीन कुत्ते बर्फ के बीच एक आदमी को सायकल पर ख...
4,A baby in a red hat sitting in a stroller is h...,A man in a gray shirt sitting in a field of fl...,0.0,['फूलों के मैदान में बैठा एक धूसर कुर्ता पहना ...,['एक लाल टोपी पहने हुए बच्चे जो स्ट्रोलर में ब...


In [3]:
df=df.drop(columns=['1','0'])

In [4]:
stopwords_hi = ['तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी','क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी','श्री','वैसा','आपका','अंदर', 'अत', 'अपना', 'अपनी', 'अपने', 'अभी', 'आदि', 'आप', 'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी', 'इसके', 'इसमें', 'इसी', 'इसे', 'उन', 'उनका', 'उनकी', 'उनके', 'उनको', 'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसी', 'उसे', 'एक', 'एवं', 'एस', 'ऐसे', 'और', 'कई', 'कर','करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा', 'का', 'काफ़ी', 'कि', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे', 'की', 'कुछ', 'कुल', 'के', 'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ', 'जा', 'जितना', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो', 'तक', 'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था', 'थी', 'थे', 'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नहीं', 'ना', 'निहायत', 'नीचे', 'ने', 'पर', 'पर', 'पहले', 'पूरा', 'पे', 'फिर', 'बनी', 'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल', 'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या', 'यिह', 'ये', 'रखें', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वर्ग', 'वह', 'वह', 'वहाँ', 'वहीं', 'वाले', 'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभी', 'साथ', 'साबुत', 'साभ', 'सारा', 'से', 'सो', 'ही', 'हुआ', 'हुई', 'हुए', 'है', 'हैं', 'हो', 'होता', 'होती', 'होते', 'होना', 'होने', 'अपनि', 'जेसे', 'होति', 'सभि', 'तिंहों', 'इंहों', 'दवारा', 'इसि', 'किंहें', 'थि', 'उंहों', 'ओर', 'जिंहें', 'वहिं', 'अभि', 'बनि', 'हि', 'उंहिं', 'उंहें', 'हें', 'वगेरह', 'एसे', 'रवासा', 'कोन', 'निचे', 'काफि', 'उसि', 'पुरा', 'भितर', 'हे', 'बहि', 'वहां', 'कोइ', 'यहां', 'जिंहों', 'तिंहें', 'किसि', 'कइ', 'यहि', 'इंहिं', 'जिधर', 'इंहें', 'अदि', 'इतयादि', 'हुइ', 'कोनसा', 'इसकि', 'दुसरे', 'जहां', 'अप', 'किंहों', 'उनकि', 'भि', 'वरग', 'हुअ', 'जेसा', 'नहिं']
punctuations = ['nn','n', '।','/', '`', '+', '\\', '"', '?', '▁(', '$', '@', '[', '_', "'", '!', ',', ':', '^', '|', ']', '=', '%', '&', '.', ')', '(', '#', '*', '', ';', '-', '}','|','"']

In [5]:
from indicnlp.tokenize import indic_tokenize
def tokenization(indic_string):
    tokens = []
    for t in indic_tokenize.trivial_tokenize(indic_string):
        tokens.append(t)
    return tokens


In [6]:
df['3']=df['3'].apply(lambda x: tokenization(x))
df['4']=df['4'].apply(lambda x: tokenization(x))

In [7]:
to_be_removed = stopwords_hi + punctuations 


In [8]:
mask = df['3'].apply(lambda x: x not in to_be_removed)
df.loc[mask, '3'] = df.loc[mask, '3'].apply(lambda x: [ele for ele in x if ele not in to_be_removed])

df.tail(10)

Unnamed: 0,2,4,3
1840,3.333,"[[, ', एक, आदमी, एक, डिजिटल, शुष्क, मिटाने, के...","[आदमी, रेखांकन]"
1841,2.0,"[[, ', बंदर, branch, to, branch, से, झुका, हुआ...","[बंदर, पेड़ों, झुक]"
1842,3.0,"[[, ', दो, लड़के, टर्मिनल, पर, खेल, रहे, हैं, ...","[बच्चे, टर्मोलाइन, चढ़]"
1843,4.0,"[[, ', एक, महिला, जो, हरी, पीपल, का, टुकड़ा, क...","[मादा, चाकू, चावल, टुकड़ा, काट]"
1844,2.0,"[[, ', एक, आदमी, एक, कार, में, anti, -, freeze...","[आदमी, कार, तेल, जोड़]"
1845,0.4,"[[, ', एक, आदमी, रस्सी, पर, चढ़, रहा, है, ।, '...","[आदमी, मशीन, carrot, टुकड़ा]"
1846,3.2,"[[, ', एक, व्यक्ति, एक, 피아노, कीबोर्ड, बजा, रहा...","[लड़के, कुंजीपट, बजाया]"
1847,0.4,"[[, ', एक, लड़का, मिट्टी, में, खेल, रहा, है, ।...","[बिल्ली, एंटिना, खेल]"
1848,2.0,"[[, ', एक, व्यक्ति, कंटालूप, का, टुकड़ा, कर, र...","[महिला, मक्खन, काट]"
1849,4.5,"[[, ', कुछ, लोग, चट्टान, से, एक, dummy, को, फे...","[लोगों, डमी, चट्टान, किनारे, फेंक]"


In [9]:
df.head()

Unnamed: 0,2,4,3
0,0.8,"[[, ', एक, महिला, एक, कुत्ता, को, सीढ़ियों, से...","[सड़क, घूमते, काले, कपड़े, पहने, महिला]"
1,2.33,"[[, ', एक, आदमी, और, एक, औरत, हंस, रहे, हैं, ।...","[पुरुष, महिला, कैमरा, देख]"
2,3.6,"[[, ', बड़े, भूरे, कुत्ते, बाहर, रेत, में, दौड...","[भूरा, कुत्ता, गंदगी, दौड़ता]"
3,5.0,"[[, ', कुत्ते, बर्फ, के, बीच, एक, आदमी, को, एक...","[तीन, कुत्ते, बर्फ, बीच, आदमी, सायकल, खींचते]"
4,0.0,"[[, ', फूलों, के, मैदान, में, बैठा, एक, धूसर, ...","[लाल, टोपी, पहने, बच्चे, स्ट्रोलर, बैठा, पुतली..."


In [10]:

mask = df['4'].apply(lambda x: x not in to_be_removed)
df.loc[mask, '4'] = df.loc[mask, '4'].apply(lambda x: [ele for ele in x if ele not in to_be_removed])

df.tail(10)

Unnamed: 0,2,4,3
1840,3.333,"[आदमी, डिजिटल, शुष्क, मिटाने, बोर्ड, चित्र, बना]","[आदमी, रेखांकन]"
1841,2.0,"[बंदर, branch, to, branch, झुका]","[बंदर, पेड़ों, झुक]"
1842,3.0,"[लड़के, टर्मिनल, खेल]","[बच्चे, टर्मोलाइन, चढ़]"
1843,4.0,"[महिला, हरी, पीपल, टुकड़ा, काटती]","[मादा, चाकू, चावल, टुकड़ा, काट]"
1844,2.0,"[आदमी, कार, anti, freeze, डाल]","[आदमी, कार, तेल, जोड़]"
1845,0.4,"[आदमी, रस्सी, चढ़]","[आदमी, मशीन, carrot, टुकड़ा]"
1846,3.2,"[व्यक्ति, 피아노, कीबोर्ड, बजा]","[लड़के, कुंजीपट, बजाया]"
1847,0.4,"[लड़का, मिट्टी, खेल]","[बिल्ली, एंटिना, खेल]"
1848,2.0,"[व्यक्ति, कंटालूप, टुकड़ा]","[महिला, मक्खन, काट]"
1849,4.5,"[लोग, चट्टान, dummy, फेंक]","[लोगों, डमी, चट्टान, किनारे, फेंक]"


In [11]:
def joining(lixt):
    return " ".join(lixt)

In [12]:
df['5']=df['3'].apply(joining)
df['6']=df['4'].apply(joining)

In [13]:
df.head()

Unnamed: 0,2,4,3,5,6
0,0.8,"[महिला, कुत्ता, सीढ़ियों, उछलने, देखती]","[सड़क, घूमते, काले, कपड़े, पहने, महिला]",सड़क घूमते काले कपड़े पहने महिला,महिला कुत्ता सीढ़ियों उछलने देखती
1,2.33,"[आदमी, औरत, हंस]","[पुरुष, महिला, कैमरा, देख]",पुरुष महिला कैमरा देख,आदमी औरत हंस
2,3.6,"[बड़े, भूरे, कुत्ते, बाहर, रेत, दौड़]","[भूरा, कुत्ता, गंदगी, दौड़ता]",भूरा कुत्ता गंदगी दौड़ता,बड़े भूरे कुत्ते बाहर रेत दौड़
3,5.0,"[कुत्ते, बर्फ, बीच, आदमी, प्रकार, साइकिल, खींच]","[तीन, कुत्ते, बर्फ, बीच, आदमी, सायकल, खींचते]",तीन कुत्ते बर्फ बीच आदमी सायकल खींचते,कुत्ते बर्फ बीच आदमी प्रकार साइकिल खींच
4,0.0,"[फूलों, मैदान, बैठा, धूसर, कुर्ता, पहना, आदमी]","[लाल, टोपी, पहने, बच्चे, स्ट्रोलर, बैठा, पुतली...",लाल टोपी पहने बच्चे स्ट्रोलर बैठा पुतली पकड़,फूलों मैदान बैठा धूसर कुर्ता पहना आदमी


In [8]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')

model = AutoModel.from_pretrained('ai4bharat/indic-bert')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def word_em(text):    
    encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    word_embeddings = model(**encoded_input).last_hidden_state
    sentence_embedding = word_embeddings.mean(dim=1)
    return sentence_embedding

In [16]:
df['5']=df['5'].apply(word_em)
df['6']=df['6'].apply(word_em)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
df.head()

Unnamed: 0,2,4,3,5,6
0,0.8,"[महिला, कुत्ता, सीढ़ियों, उछलने, देखती]","[सड़क, घूमते, काले, कपड़े, पहने, महिला]","[[tensor(0.1311, grad_fn=<UnbindBackward0>), t...","[[tensor(0.0086, grad_fn=<UnbindBackward0>), t..."
1,2.33,"[आदमी, औरत, हंस]","[पुरुष, महिला, कैमरा, देख]","[[tensor(0.3081, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.0399, grad_fn=<UnbindBackward0>), ..."
2,3.6,"[बड़े, भूरे, कुत्ते, बाहर, रेत, दौड़]","[भूरा, कुत्ता, गंदगी, दौड़ता]","[[tensor(0.2829, grad_fn=<UnbindBackward0>), t...","[[tensor(0.2920, grad_fn=<UnbindBackward0>), t..."
3,5.0,"[कुत्ते, बर्फ, बीच, आदमी, प्रकार, साइकिल, खींच]","[तीन, कुत्ते, बर्फ, बीच, आदमी, सायकल, खींचते]","[[tensor(0.1059, grad_fn=<UnbindBackward0>), t...","[[tensor(0.2275, grad_fn=<UnbindBackward0>), t..."
4,0.0,"[फूलों, मैदान, बैठा, धूसर, कुर्ता, पहना, आदमी]","[लाल, टोपी, पहने, बच्चे, स्ट्रोलर, बैठा, पुतली...","[[tensor(0.0699, grad_fn=<UnbindBackward0>), t...","[[tensor(0.0499, grad_fn=<UnbindBackward0>), t..."


In [18]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example sentences
sentence1 = "The cat sat on the mat."
sentence2 = "the"

# Tokenize sentences and obtain input IDs
inputs1 = tokenizer(sentence1, return_tensors='pt', padding=True, truncation=True)
inputs2 = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)

# Obtain BERT embeddings
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

# Extract [CLS] token embeddings (sentence embeddings)
sentence_embedding1 = outputs1.last_hidden_state.mean(dim=1)
sentence_embedding2 = outputs2.last_hidden_state.mean(dim=1)

# Compute cosine similarity between sentence embeddings
cosine_sim = torch.nn.functional.cosine_similarity(sentence_embedding1, sentence_embedding2, dim=1)

print("Cosine Similarity:", cosine_sim.item())

tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 16.0kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 6.88MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.29MB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 201kB/s]
model.safetensors: 100%|██████████| 440M/440M [00:33<00:00, 13.1MB/s] 


Cosine Similarity: 0.28673091530799866


In [19]:
def min_max_normalize(column):
    return (column - column.min()) / (column.max() - column.min())

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Create a scaler object
scaler = MinMaxScaler()
# Fit the scaler to the 'similarity_score' column and transform it
df['2'] = scaler.fit_transform(df[['2']])

In [21]:
df.head()

Unnamed: 0,2,4,3,5,6
0,0.16,"[महिला, कुत्ता, सीढ़ियों, उछलने, देखती]","[सड़क, घूमते, काले, कपड़े, पहने, महिला]","[[tensor(0.1311, grad_fn=<UnbindBackward0>), t...","[[tensor(0.0086, grad_fn=<UnbindBackward0>), t..."
1,0.466,"[आदमी, औरत, हंस]","[पुरुष, महिला, कैमरा, देख]","[[tensor(0.3081, grad_fn=<UnbindBackward0>), t...","[[tensor(-0.0399, grad_fn=<UnbindBackward0>), ..."
2,0.72,"[बड़े, भूरे, कुत्ते, बाहर, रेत, दौड़]","[भूरा, कुत्ता, गंदगी, दौड़ता]","[[tensor(0.2829, grad_fn=<UnbindBackward0>), t...","[[tensor(0.2920, grad_fn=<UnbindBackward0>), t..."
3,1.0,"[कुत्ते, बर्फ, बीच, आदमी, प्रकार, साइकिल, खींच]","[तीन, कुत्ते, बर्फ, बीच, आदमी, सायकल, खींचते]","[[tensor(0.1059, grad_fn=<UnbindBackward0>), t...","[[tensor(0.2275, grad_fn=<UnbindBackward0>), t..."
4,0.0,"[फूलों, मैदान, बैठा, धूसर, कुर्ता, पहना, आदमी]","[लाल, टोपी, पहने, बच्चे, स्ट्रोलर, बैठा, पुतली...","[[tensor(0.0699, grad_fn=<UnbindBackward0>), t...","[[tensor(0.0499, grad_fn=<UnbindBackward0>), t..."


In [19]:
from sklearn.model_selection import train_test_split
import ast


In [2]:
df['5'][0]

NameError: name 'df' is not defined

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset

class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.fc1 = nn.Linear(768, 128)  
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward_once(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        return output1, output2
class SiameseDataset(Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]


NameError: name 'model' is not defined

In [34]:
import numpy as np
import torch

In [12]:
!pip install torchmetrics
!pip install sentence-transformers




[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from sentence_transformers import SentenceTransformer

# Initialize sentence transformer model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Suppose we have the following sentences:
sentences = ['This is the first sentence.', 'This is the second sentence.']

# Generate sentence embeddings
embeddings = sbert_model.encode(sentences)

# Convert embeddings to tensors
input1 = torch.tensor(embeddings[0])
input2 = torch.tensor(embeddings[1])

# Create an instance of SiameseNetwork
model = SiameseNetwork()

# Pass the tensors to the model
output1, output2 = model(input1, input2)

modules.json: 100%|██████████| 229/229 [00:00<00:00, 115kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 60.9kB/s]
README.md: 100%|██████████| 3.99k/3.99k [00:00<00:00, 1.98MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
config.json: 100%|██████████| 625/625 [00:00<00:00, 313kB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:33<00:00, 13.2MB/s] 
tokenizer_config.json: 100%|██████████| 399/399 [00:00<00:00, 199kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.07MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.28MB/s]
added_tokens.json: 100%|██████████| 2.00/2.00 [00:00<00:00, 999B/s]
special_tokens_map.json: 100%|██████████| 112/1

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x768 and 300x128)

In [1]:
input1.shape

NameError: name 'input1' is not defined

In [43]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    output1, output2 = model(input1, input2)
    loss = criterion(output1, output2, target_scores)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [46]:
type(df['5'])

pandas.core.series.Series

In [1]:
df.head()

NameError: name 'df' is not defined

In [3]:
import pandas as pd
import torch

# Assuming df is your DataFrame
df = pd.DataFrame({
    'col1': [1, 2, 3, 4],
    'col2': [5, 6, 7, 8],
    'col3': [9, 10, 11, 12]
})

# Convert DataFrame to numpy array, then to PyTorch Tensor
tensor = torch.from_numpy(df.values)

# Now you can pass this tensor to your model

In [4]:
tensor

tensor([[ 1,  5,  9],
        [ 2,  6, 10],
        [ 3,  7, 11],
        [ 4,  8, 12]])