<a href="https://colab.research.google.com/github/chabryl/AI-Redirect-Mapping/blob/main/redirect_mapping_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers
!pip install qdrant-client
from sentence_transformers import SentenceTransformer
import pandas as pd
import io
import time
from google.colab import files
from qdrant_client import QdrantClient
from qdrant_client.http import models

In [None]:
import time
import io
import pandas as pd
from google.colab import files
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Instantiate the SentenceTransformer model once at module level; encode_texts()
# reads this shared `model` object.
MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(MODEL_NAME)

def encode_texts(df, selected_columns):
    """
    Encode the rows of ``df`` with the module-level SentenceTransformer model.

    For every row, the values of ``selected_columns`` are converted to strings
    and joined with a single space. The joined text is stored in a
    ``combined_text`` column on ``df`` itself (the DataFrame is modified in
    place) and then passed to ``model.encode``.

    Returns:
        A tuple ``(embeddings, seconds)`` where ``seconds`` is the wall-clock
        time spent in the encoding step.
    """
    def _join_row(row):
        # Stringify every cell of the row and glue the pieces together.
        return ' '.join(row.values.astype(str))

    df['combined_text'] = df[selected_columns].apply(_join_row, axis=1)

    started = time.time()
    texts = df['combined_text'].tolist()
    vectors = model.encode(texts, show_progress_bar=True)
    finished = time.time()

    return vectors, finished - started

def detect_delimiter(file_content):
    """
    Guess the column delimiter of a CSV file held in memory.

    The first non-blank line is inspected and the candidate (``,`` or ``;``)
    that occurs more often is returned; on a tie the comma is preferred.
    Counting instead of "first match wins" keeps semicolon-separated files
    from being misdetected when their header also contains a comma.

    Args:
        file_content: Binary in-memory buffer exposing ``getvalue()`` that
            returns ``bytes`` (e.g. ``io.BytesIO``). Only ``getvalue()`` is
            called, so the buffer's read position is left untouched.

    Returns:
        ``','`` or ``';'``.

    Raises:
        ValueError: If the file is empty/blank or the inspected line contains
            neither a comma nor a semicolon.
    """
    # 'utf-8-sig' strips a leading BOM. errors='replace' keeps detection
    # working for files in other encodings: only the ASCII characters ',' and
    # ';' matter here, and those survive the replacement.
    text = file_content.getvalue().decode('utf-8-sig', errors='replace')

    # Skip leading blank lines so they do not hide the real header line.
    first_line = next((line for line in text.splitlines() if line.strip()), '')

    comma_count = first_line.count(',')
    semicolon_count = first_line.count(';')
    if comma_count == 0 and semicolon_count == 0:
        raise ValueError("Unbekanntes Trennzeichen in der CSV-Datei.")

    return ';' if semicolon_count > comma_count else ','

def upload_and_read_csv(file_name):
    """
    Ask the user to upload a CSV file and parse it into a DataFrame.

    Args:
        file_name: Label of the expected file (e.g. ``"origin.csv"``). It is
            only shown in the prompt; the first uploaded file is read
            regardless of its actual name.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when nothing was
        uploaded. Blank lines are skipped and malformed rows are dropped
        (``on_bad_lines='skip'``).

    Raises:
        ValueError: Propagated from ``detect_delimiter`` when the delimiter
            cannot be determined.
    """
    # Tell the user which of the expected files this upload widget is for.
    print(f"Bitte '{file_name}' hochladen:")
    uploaded = files.upload()
    if not uploaded:
        # Nothing was uploaded: signal this with None instead of failing with
        # an IndexError on the empty mapping.
        return None

    # Only the first uploaded file is used.
    raw_bytes = next(iter(uploaded.values()))
    file_content = io.BytesIO(raw_bytes)
    delimiter = detect_delimiter(file_content)
    # Rewind so pandas parses from the beginning of the buffer.
    file_content.seek(0)

    return pd.read_csv(
        file_content,
        sep=delimiter,
        skip_blank_lines=True,
        on_bad_lines='skip',
    )

def upsert_to_qdrant(client, collection_name, embeddings, ids, batch_size=100):
    """
    Insert embeddings into a Qdrant collection in batches.

    Each point gets a unique, 1-based integer ID derived from its position in
    ``embeddings`` and carries the matching entry of ``ids`` as its ``url``
    payload. The IDs continue across batches; restarting them per batch would
    make every batch overwrite the points of the previous one.

    Args:
        client: Qdrant client used for the ``upsert`` calls.
        collection_name: Name of the target collection.
        embeddings: Sequence of vectors; each element must provide
            ``tolist()``.
        ids: Sequence of payload values (URLs), one per embedding.
        batch_size: Maximum number of points sent per ``upsert`` call.

    Raises:
        ValueError: If ``ids`` and ``embeddings`` differ in length or
            ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size muss mindestens 1 sein.")
    if len(ids) != len(embeddings):
        # zip() would otherwise silently drop the surplus entries.
        raise ValueError("Anzahl der IDs und der Embeddings stimmt nicht überein.")

    for start in range(0, len(embeddings), batch_size):
        batch_embeddings = embeddings[start:start + batch_size]
        batch_urls = ids[start:start + batch_size]
        points = [
            models.PointStruct(
                id=point_id,
                vector=embedding.tolist(),
                payload={"url": url},
            )
            # Offset the counter by the batch start so IDs are unique overall.
            for point_id, (url, embedding) in enumerate(
                zip(batch_urls, batch_embeddings), start=start + 1
            )
        ]
        client.upsert(collection_name=collection_name, points=points)

def query_qdrant(client, collection_name, query_embeddings, top_k=1):
    """
    Run one search against the Qdrant collection per query embedding.

    Returns a list with one entry per query, in input order; each entry is the
    value returned by ``client.search`` for that query with ``limit=top_k``.
    """
    def _search_one(vector):
        # The vector is converted to a plain list before being sent.
        return client.search(
            collection_name=collection_name,
            query_vector=vector.tolist(),
            limit=top_k,
        )

    return [_search_one(vector) for vector in query_embeddings]

def main():
    """
    Run the complete redirect-mapping workflow and write a runtime report.

    Steps:
        1. Upload and read the origin and destination CSV files.
        2. Encode the ``url`` column of both files into embeddings.
        3. Connect to Qdrant and (re)create the ``url-mapping`` collection.
        4. Upsert the destination embeddings.
        5. Search the best destination match for every origin embedding.

    The timing of every step is written to ``runtime_report.txt`` and the
    matches to ``redirect_mapping_results.csv``; both files are offered for
    download at the end. If the inputs are unusable, an error message is
    written to the report and printed, and the function returns early without
    downloads.

    The Qdrant connection can be configured through the ``QDRANT_URL`` and
    ``QDRANT_API_KEY`` environment variables.
    """
    import os  # local import: only needed for the Qdrant connection settings

    # Start time of the whole script.
    total_start_time = time.time()

    # File that receives the runtime report. UTF-8 is set explicitly because
    # the report text contains umlauts and the platform default may differ.
    report_file = 'runtime_report.txt'
    with open(report_file, 'w', encoding='utf-8') as f:

        f.write("### Laufzeitbericht ###\n\n")

        # Module 1: file upload
        f.write("Modul 1: Datei-Upload und Einlesen\n")
        upload_start_time = time.time()
        origin_df = upload_and_read_csv("origin.csv")
        destination_df = upload_and_read_csv("destination.csv")
        upload_end_time = time.time()
        f.write(f"Zeit für Modul 1: {upload_end_time - upload_start_time:.2f} Sekunden\n\n")

        if origin_df is None or destination_df is None:
            error_message = "Fehler beim Einlesen der Dateien. Beenden des Programms.\n"
            f.write(error_message)
            # The report is not downloaded on this path, so show the error too.
            print(error_message.strip())
            return

        selected_columns = ['url']  # assumes the 'url' column is the one to use

        # Fail with a clear message instead of a KeyError (missing column) or
        # an IndexError (no destination rows to derive the vector size from).
        missing_columns = [
            column for column in selected_columns
            if column not in origin_df.columns or column not in destination_df.columns
        ]
        if missing_columns or destination_df.empty:
            error_message = (
                "Fehler: Beide Dateien müssen die Spalte 'url' enthalten und die "
                "Ziel-Datei darf nicht leer sein. Beenden des Programms.\n"
            )
            f.write(error_message)
            print(error_message.strip())
            return

        # Module 2: text encoding
        f.write("Modul 2: Kodierung der Ursprungs- und Ziel-Texte\n")
        encoding_start_time = time.time()
        origin_embeddings, origin_encoding_time = encode_texts(origin_df, selected_columns)
        destination_embeddings, destination_encoding_time = encode_texts(destination_df, selected_columns)
        encoding_end_time = time.time()
        f.write(f"Zeit für Modul 2: {encoding_end_time - encoding_start_time:.2f} Sekunden\n")
        f.write(f"  - Kodierungszeit Ursprungs-Texte: {origin_encoding_time:.2f} Sekunden\n")
        f.write(f"  - Kodierungszeit Ziel-Texte: {destination_encoding_time:.2f} Sekunden\n\n")

        # Module 3: Qdrant initialisation
        f.write("Modul 3: Qdrant Initialisierung\n")
        qdrant_start_time = time.time()
        # SECURITY NOTE(review): the fallback values below are credentials
        # committed to source. They are kept only so existing runs keep
        # working; the key should be rotated and supplied exclusively through
        # QDRANT_API_KEY (and the endpoint through QDRANT_URL).
        client = QdrantClient(
            url=os.environ.get(
                "QDRANT_URL",
                "https://b80e8e91-bbbe-4a9b-9b36-d2ab69344ba1.europe-west3-0.gcp.cloud.qdrant.io",
            ),
            api_key=os.environ.get(
                "QDRANT_API_KEY",
                "Upi_Hx819u4hFYNKaUH5np9BNFFHd44Vsey3qS_WI_tHGjGnIP4ULw",
            ),
        )
        collection_name = "url-mapping"
        # The vector size is taken from the first destination embedding.
        client.recreate_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=len(destination_embeddings[0]), distance=models.Distance.COSINE),
        )
        qdrant_end_time = time.time()
        f.write(f"Zeit für Modul 3: {qdrant_end_time - qdrant_start_time:.2f} Sekunden\n\n")

        # Module 4: insert the destination embeddings into Qdrant
        f.write("Modul 4: Einfügen der Ziel-Embeddings in Qdrant\n")
        upsert_start_time = time.time()
        upsert_to_qdrant(client, collection_name, destination_embeddings, destination_df['url'].tolist())
        upsert_end_time = time.time()
        f.write(f"Zeit für Modul 4: {upsert_end_time - upsert_start_time:.2f} Sekunden\n\n")

        # Module 5: query the Qdrant database
        f.write("Modul 5: Ähnlichkeitssuche in Qdrant\n")
        query_start_time = time.time()
        results = query_qdrant(client, collection_name, origin_embeddings)
        query_end_time = time.time()
        f.write(f"Zeit für Modul 5: {query_end_time - query_start_time:.2f} Sekunden\n\n")

        # Result processing: keep the top hit per origin URL, or a
        # placeholder (None / score 0) when the search returned nothing.
        matched_urls = []
        similarity_scores = []
        for result in results:
            if result:
                matched_urls.append(result[0].payload['url'])
                similarity_scores.append(result[0].score)
            else:
                matched_urls.append(None)
                similarity_scores.append(0)

        report = pd.DataFrame({
            'origin_url': origin_df['url'],
            'matched_url': matched_urls,
            'similarity_score': similarity_scores,
        })

        # Save the results as CSV.
        report.to_csv('redirect_mapping_results.csv')

        # Measure the total runtime.
        total_end_time = time.time()
        f.write(f"Gesamtlaufzeit des Skripts: {total_end_time - total_start_time:.2f} Sekunden\n")

    # Downloads happen after the `with` block so the report file is closed.
    print(f"Laufzeitbericht wurde in '{report_file}' gespeichert.")
    files.download(report_file)
    files.download('redirect_mapping_results.csv')

# Entry point: run the workflow only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()
