# In [None]:
import pandas as pd
import os
from docx import Document
import PyPDF2
import pytesseract
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Metadata fields to predict for every document. main() passes this list to
# predict_metadata as the labels, so each name becomes a key of the prediction
# dict and therefore a column of the output CSV.
metadata_fields = [
    "Agreement Start Date", "Agreement End Date",
    "Party One", "Party Two",
    "Renewal Notice",
]

def extract_docx_text(path):
    """Return the non-blank paragraphs of a Word document as one string.

    Args:
        path: Location of the document, passed straight to ``Document``.

    Returns:
        The text of every paragraph whose text is not empty or
        whitespace-only, in the order ``paragraphs`` yields them, joined
        with newline characters.
    """
    kept = []
    for para in Document(path).paragraphs:
        content = para.text
        # Skip paragraphs that contain nothing but whitespace.
        if content.strip():
            kept.append(content)
    return "\n".join(kept)

def extract_pdf_text(path):
    """Return the extractable text of a PDF, pages joined by newlines.

    Args:
        path: Location of the PDF file; it is opened in binary mode.

    Returns:
        The text of every page for which ``extract_text()`` returns a
        truthy value, in page order, joined with newline characters.
        Pages that yield no text are skipped.
    """
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # Extract each page exactly once: text extraction is the expensive
        # step, and calling it for both the filter and the value doubled
        # the work. The join must stay inside the ``with`` block because the
        # generator reads pages lazily from the open file.
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(text for text in page_texts if text)

def extract_image_text(path):
    """Run OCR on an image file and return the recognised text.

    Args:
        path: Location of the image, passed to ``Image.open``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR finishes instead of lingering until garbage
    # collection (matters when many scans are processed in one run).
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)

def extract_text(files, base_dir="test"):
    """Return the text of the first file in *files* with a supported type.

    Files are tried in order; the first one whose name ends in ``.docx``,
    ``.pdf``, ``.png``, ``.jpg`` or ``.jpeg`` is extracted and its text is
    returned immediately. Later files are not read.

    Args:
        files: File names relative to *base_dir*.
        base_dir: Directory that contains the files.

    Returns:
        The extracted text, or ``""`` when *files* is empty or none of the
        names has a supported extension.
    """
    for f in files:
        full_path = os.path.join(base_dir, f)
        # Compare extensions case-insensitively so names such as
        # "Contract.PDF" or "scan.JPG" are not silently skipped.
        lowered = f.lower()
        if lowered.endswith('.docx'):
            return extract_docx_text(full_path)
        elif lowered.endswith('.pdf'):
            return extract_pdf_text(full_path)
        elif lowered.endswith(('.png', '.jpg', '.jpeg')):
            return extract_image_text(full_path)
    return ""

def match_files(name, all_files):
    """Return the entries of *all_files* that begin with *name*.

    Args:
        name: Prefix to look for.
        all_files: Iterable of file-name strings.

    Returns:
        A new list with the matching names, in their original order.
    """
    matches = []
    for candidate in all_files:
        if candidate.startswith(name):
            matches.append(candidate)
    return matches

def predict_metadata(text, labels):
    """Pick, for each label, the line of *text* most similar to it.

    The text is split on newlines; lines are stripped and only those longer
    than 10 characters are kept as candidate sentences. Candidates and labels
    are vectorised with one shared TF-IDF vocabulary, and each label is
    mapped to the candidate with the highest cosine similarity.

    Args:
        text: Document text, one candidate sentence per line.
        labels: Iterable of label strings (e.g. metadata field names).

    Returns:
        A dict mapping every label to its best-matching sentence, or to
        ``""`` when there are no candidate sentences or when no candidate
        has a non-zero similarity to that label.
    """
    # Materialise once so any iterable works and list concatenation is safe.
    labels = list(labels)

    stripped_lines = (line.strip() for line in text.split('\n'))
    sentences = [s for s in stripped_lines if len(s) > 10]
    if not sentences:
        return {label: "" for label in labels}

    # Fit on sentences and labels together so both share one vocabulary.
    vectorizer = TfidfVectorizer().fit(sentences + labels)
    sent_vecs = vectorizer.transform(sentences)
    label_vecs = vectorizer.transform(labels)

    sim = cosine_similarity(label_vecs, sent_vecs)
    predictions = {}

    for label, scores in zip(labels, sim):
        best = scores.argmax()
        # argmax of an all-zero row is 0, which would report the first
        # sentence as a match although it has no terms in common with the
        # label. Treat a zero best score as "nothing found".
        predictions[label] = sentences[best] if scores[best] > 0 else ""
    return predictions

def main():
    """Predict metadata for every file listed in test.csv and save it as CSV.

    Reads the "File Name" column of ``test.csv``, finds the entries of the
    ``test`` directory that start with each name, extracts their text,
    predicts the fields in ``metadata_fields`` and writes one row per CSV
    entry to ``torch_free_metadata_output.csv``.
    """
    listing = pd.read_csv("test.csv")
    available = os.listdir("test")
    rows = []

    for _, record in listing.iterrows():
        file_name = record["File Name"]
        candidates = match_files(file_name, available)
        print(f"Processing: {file_name} => {candidates}")

        prediction = predict_metadata(extract_text(candidates), metadata_fields)
        # Added last so the file name ends up after the metadata columns.
        prediction["File Name"] = file_name
        rows.append(prediction)

    pd.DataFrame(rows).to_csv("torch_free_metadata_output.csv", index=False)
    print("Metadata saved to torch_free_metadata_output.csv")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
