In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import KDTree
import pandas as pd
import sqlite3
import spacy
from tqdm.notebook import tqdm
import joblib

In [37]:
# Read data from Database
conn = sqlite3.connect("scraped_data.db")

df = pd.read_sql_query("SELECT Preprocessed_Long_Text, Title, URL FROM reviews", conn)


In [38]:
corpus = df['Preprocessed_Long_Text'].tolist()

v = TfidfVectorizer()

vec = v.fit_transform(corpus)

joblib.dump(v, 'data/Ttfidf_vectorizer.joblib')
joblib.dump(vec, 'data/tfidf_matrix.joblib')

['data/tfidf_matrix.joblib']

In [39]:
# Load datasets
v = joblib.load('data/Ttfidf_vectorizer.joblib')
vec = joblib.load('data/tfidf_matrix.joblib')


In [None]:
# Suchmaschine

def search(query, top_k=3):
    query_vec = v.transform([query])

    # Kosinus-Ähnlichkeit berechnen
    similarities = cosine_similarity(query_vec, vec).flatten()
    ranked_indices = similarities.argsort()[::-1]

    print(ranked_indices)
    
    results = []
    for idx in ranked_indices[:top_k]:
        results.append({
            "doc_id": idx,
            "similarity": similarities[idx],
            "Title": df.iloc[idx]['Title'], 
            #"URL": df.iloc[idx]['URL']
        })
    
    return pd.DataFrame(results)

In [None]:
# Suchmaschine mit KDtree
# Normieren, damit euklidische Distanz der Cosine-Similarity entspricht
tree = KDTree(vec.toarray())

def search_kdtree(query, top_k=10):
    query_vec = v.transform([query])
    query_vec = normalize(query_vec)

    dist, ind = tree.query(query_vec.toarray(), k=top_k)

    results = []
    for i in range(top_k):
        idx = ind[0][i]
        results.append({
            "doc_id": idx,
            "similarity": 1 - dist[0][i],  
            "Title": df.iloc[idx]['Title'],
            # "URL": df.iloc[idx]['URL'],
        })
    
    return pd.DataFrame(results)
    

In [None]:
# Preprocessing for Queries
tqdm.pandas()
nlp = spacy.load("en_core_web_sm")

def queryPreprocessing(text): 
    if not isinstance(text, str):
        return []

    
    doc = nlp(text.lower()) 
    tokens = [
            token.lemma_ 
            for token in doc
            if token.is_alpha and not token.is_stop 
        ]
    string_list =" ".join(tokens)
    return string_list

In [43]:
# Suchmaschine testen 

query = "Resident has Evil"
query = queryPreprocessing(query)

print(query)
results = search(query, top_k=20)
print(results)

resident evil
[ 241 2394  281 ... 4243 4242 4219]
    doc_id  similarity                                              Title
0      241    0.786363               Resident Evil 3 Single-Player Review
1     2394    0.686827                  Resident Evil 7: Biohazard Review
2      281    0.593735                             Resident Evil 2 Review
3      188    0.579148                          Resident Evil 4 VR Review
4     5861    0.566460                               Resident Evil Gaiden
5     2784    0.540514                             Resident Evil 6 Review
6     5166    0.515354  Resident Evil Archives: Resident Evil Zero Review
7     1693    0.510899                               Resident Evil Review
8     3197    0.492573              Resident Evil Zero HD Remaster Review
9     1211    0.485018                       Resident Evil Village Review
10     821    0.481816               Resident Evil 5: Gold Edition Review
11    1033    0.479182                   Resident Evil 4 Remak

In [44]:
# search with KDTree 

query = "Resident has Evil"
query = queryPreprocessing(query)

print(query)
results = search_kdtree(query,top_k=20)
print(results)


resident evil
    doc_id  similarity                                              Title
0      241    0.346338               Resident Evil 3 Single-Player Review
1     2394    0.208579                  Resident Evil 7: Biohazard Review
2      281    0.098595                             Resident Evil 2 Review
3      188    0.082555                          Resident Evil 4 VR Review
4     5861    0.068829                               Resident Evil Gaiden
5     2784    0.041370                             Resident Evil 6 Review
6     5166    0.015474  Resident Evil Archives: Resident Evil Zero Review
7     1693    0.010959                               Resident Evil Review
8     5540    0.000000                            Error 502 - Bad Gateway
9     4394    0.000000                 Metal Gear Solid Touch iPad Review
10    2853    0.000000                            Error 502 - Bad Gateway
11    4395    0.000000                        Resident Evil 4 iPad Review
12    5417    0.000000  