<a href="https://colab.research.google.com/github/akshaygopan/Akku/blob/main/Akku_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install sentence-transformers
!pip install "tensorflow-text==2.11.*"
!pip install torch==2.1.0
!pip install transformers

In [None]:
%%capture
import pandas as pd
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import torch
import transformers as t
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
#import senteval
import time
import requests
import numpy as np
#from sentence_transformers import SentenceTransformer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask tensor with one entry per token; it is given a
            trailing dimension and expanded to the shape of the embeddings.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the summed mask.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask so it can be multiplied element-wise with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_total = (token_embeddings * mask).sum(dim=1)
    # The lower bound keeps the division defined when a mask is all zeros.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total

In [None]:
# Hugging Face checkpoints to load. Every entry ends with a comma so that
# uncommenting an alternative adds a new list item instead of being silently
# concatenated onto the preceding string literal.
model_list = [
    'sentence-transformers/bert-base-nli-mean-tokens',
    #'sentence-transformers/all-MiniLM-L6-v2',
    #'sentence-transformers/all-mpnet-base-v2',
    #'jinaai/jina-embeddings-v2-base-en',
    #'hkunlp/instructor-xl',
    #'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
    #'SupstarZh/whitenedcse-bert-base',
    #'SupstarZh/whitenedcse-bert-large',
    #'bert-base-uncased',
]

# Map each checkpoint name to its loaded model and matching tokenizer.
# The loop variable is called `model_name` (not `model`) so it cannot be
# confused with the global `model` that later holds the network itself.
models = {
    model_name: {
        'Model': AutoModel.from_pretrained(model_name),
        'Tokenizer': AutoTokenizer.from_pretrained(model_name),
    }
    for model_name in model_list
}

In [None]:
# Display the checkpoint names that are available in `models`.
models.keys()

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import nltk

In [2]:

# Fetch the NLTK data that preprocess_text relies on.
# Newer NLTK releases load the 'punkt_tab' resource inside word_tokenize;
# 'punkt' is still downloaded for older releases that use the original files.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Data Preprocessing
# Filled on first use, so defining these helpers does not require the NLTK
# stop-word corpus to be available yet.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Clean a raw string and return its lowercase tokens without stop words.

    Steps, in order: strip HTML tags, replace every non-letter character with
    a space, lowercase, tokenize with ``word_tokenize``, and drop English
    stop words.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Remaining tokens in their original order (possibly empty).
    """
    # Remove HTML tags. Tags are replaced by nothing, so text on either side
    # of a tag ends up joined together.
    text = re.sub('<.*?>', '', text)
    # Remove punctuation and other non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words. The set is cached so the corpus is not re-read for
    # every row this function is applied to.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

In [None]:
# Select the checkpoint used from here on and bind its network and tokenizer
# to the global names `model` and `tokenizer`, which get_encoding reads.
active_checkpoint = models['sentence-transformers/bert-base-nli-mean-tokens']
model, tokenizer = active_checkpoint['Model'], active_checkpoint['Tokenizer']

In [None]:
def get_encoding(text):
    """Embed pre-tokenized text with the global ``tokenizer`` and ``model``.

    Args:
        text: Input that is already split into words (it is passed to the
            tokenizer with ``is_split_into_words=True``).

    Returns:
        The result of ``mean_pooling`` applied to the model output and the
        tokenizer's attention mask.
    """
    # Padding and truncation are enabled, with at most 128 tokens per sequence.
    batch = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
        is_split_into_words=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**batch)

    return mean_pooling(outputs, batch['attention_mask'])

In [None]:
def embed_row(text):
    """Return the embedding of one raw text.

    The text is cleaned and tokenized by ``preprocess_text`` and the resulting
    tokens are encoded by ``get_encoding``.
    """
    return get_encoding(preprocess_text(text))

In [None]:
# Sanity check: embed one sample sentence and display the result.
# A bare `embedding` fails here with NameError on a fresh kernel, because that
# name only exists as a local variable inside embed_row.
embed_row("This is a sample sentence.")

In [None]:
# Read the corpus: every line of data.txt is one text to embed.
# The encoding is stated explicitly so the result does not depend on the
# platform's default.
with open('data.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Build one single-element row per line. Lines are deliberately NOT split on
# commas: the frame has exactly one column ('Text'), so a line containing a
# comma would produce several values and make the DataFrame constructor fail.
data = [[line.strip()] for line in lines]

# Convert the list of rows to a pandas DataFrame
df = pd.DataFrame(data, columns=['Text'])

# Preview the first rows instead of printing the whole frame.
df.head()

In [None]:
# Run embed_row on every value of the 'Text' column and store the results in a
# new 'Embedding' column of the same frame.
df['Embedding'] = df['Text'].apply(embed_row)