In [13]:
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import sessionmaker, scoped_session
from sqlalchemy import create_engine

uri = f"postgresql://erik:erik@localhost:5432/erik_db"
engine = create_engine(uri)
session_factory = sessionmaker(bind=engine)
Session = scoped_session(session_factory)

Base = declarative_base()

In [14]:
from sqlalchemy import Column, Integer, String, ARRAY, Float

class Sentence(Base):
    __tablename__ = 'sentences'
    id = Column(Integer, primary_key=True)
    text = Column(String(255), nullable=False)
    vector = Column(ARRAY(Float), nullable=False)

In [32]:
import random
import string

# Drop the existing "sentences" table
Base.metadata.drop_all(engine)

# Recreate the "sentences" table
Base.metadata.create_all(engine)

def generate_random_text(length):
    letters = string.ascii_letters
    random_text = ''.join(random.choice(letters) for _ in range(length))
    return random_text

def generate_random_vector(n):
    return [random.random() for _ in range(n)]

# Create a session and add the dummy data
with Session() as session:
    for i in range(5000):
        s = Sentence()
        s.text = generate_random_text(100)
        s.vector = generate_random_vector(100)
        session.add(s)
    session.commit()

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
# add wikipedia to database
import re
from nltk.tokenize import sent_tokenize
import nltk

def clean_text(text):

    THRESHOLD = 60

    text = remove_brackets(text)

    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    if len(sentences) == 0:
        return None

    if len(sentences) == 1:
        if len(sentences[0]) < THRESHOLD:
            return None

    # Ensure one space between each word in each sentence
    formatted_sentences = [' '.join(sentence.split()) for sentence in sentences]
    return formatted_sentences

def remove_brackets(string):
    # Remove brackets and their contents (including nested brackets)
    pattern = r'\([^()]*\)|\[[^\]]*\]'
    while re.search(pattern, string):
        string = re.sub(pattern, '', string)
    return string


all_data = []
with open("data.txt", "r", encoding="utf-8") as f:
    raw_text = f.readlines()
    sentences = []

    vectorizer = TfidfVectorizer()

    for line in raw_text:
        text = clean_text(line)
        if text is None:
            continue
        # with Session() as session:

        for sent in text:
            vectorized_question = vectorizer.transform([sent])
            print(len(vectorized_question))
                # print(sent)
            # print(text)
            # all_data.extend(text)

print(all_data)

NotFittedError: The TF-IDF vectorizer is not fitted

In [36]:
import numpy as np
from scipy.spatial.distance import cosine

def find(target_vector, threshold=0.8, batch_size=32):
    close = []
    with Session() as session:
        offset = 0
        batch_count = 0
        while True:
            sentences = session.query(Sentence).offset(offset).limit(batch_size).all()
            if not sentences:
                break  # No more sentences, end the loop

            for sentence in sentences:
                sentence_vector = np.array(sentence.vector)
                similarity = 1 - cosine(target_vector, sentence_vector)

                if similarity >= threshold:
                    close.append(sentence)
            offset += batch_size
            batch_count += 1
        print(f"Total batches processed: {batch_count}")
    return close

In [24]:
import math

def cosine_similarity(vector1, vector2):
    dot_product = sum(vector1[i] * vector2[i] for i in range(len(vector1)))
    magnitude1 = math.sqrt(sum(vector1[i] ** 2 for i in range(len(vector1))))
    magnitude2 = math.sqrt(sum(vector2[i] ** 2 for i in range(len(vector2))))

    print("dot", dot_product)
    print("m1", magnitude1)
    print("m2", magnitude2)

    if magnitude1 == 0 or magnitude2 == 0:
        return 0  # To avoid division by zero if any vector has zero magnitude

    return dot_product / (magnitude1 * magnitude2)

print(cosine_similarity([0.1,0.2,0.3],[0.3,0.2,0.1]))

dot 0.1
m1 0.37416573867739417
m2 0.37416573867739417
0.7142857142857143
