In [None]:
import os
import pickle
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlsplit

import fitz
import pandas as pd
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Read `DataSet.xlsx`

In [None]:
# Pull both sheets out of the workbook in a single read_excel call
file_path = "DataSet.xlsx"
sheets = pd.read_excel(file_path, sheet_name=["train_data", "test_data"])
train_data = sheets["train_data"]
test_data = sheets["test_data"]
train_data, test_data

# Download PDF files to `dataset/` directory

In [None]:
# Function to download a PDF and map its path to the label
def download_pdf_with_label(url, label, folder, mapping):
    """Download the PDF at ``url`` into ``folder`` and record its label.

    The response body is streamed to disk in chunks. On success a
    ``{"file_path": ..., "label": ...}`` dict is appended to ``mapping``.
    On any failure an error is printed, a partially written file is removed
    and ``mapping`` is left untouched; no exception is propagated.

    Args:
        url: Address of the PDF. Non-string values (e.g. NaN from an empty
            spreadsheet cell) are reported and skipped.
        label: Label stored alongside the saved file path.
        folder: Existing directory the file is written into.
        mapping: List that receives one dict per successful download.
    """
    if not isinstance(url, str):
        print(f"Error downloading {url}: URL is not a string")
        return

    # Use only the path component so a query string or fragment
    # ("file.pdf?v=2") does not end up in the file name.
    file_name = os.path.basename(urlsplit(url).path)
    if not file_name:
        print(f"Error downloading {url}: URL does not contain a file name")
        return

    save_path = os.path.join(folder, file_name)
    started_writing = False
    try:
        # The context manager releases the connection even when streaming fails.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Raise error for bad status codes
            with open(save_path, "wb") as f:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        if started_writing:
            # Do not leave a truncated PDF behind for later steps to trip over.
            try:
                os.remove(save_path)
            except OSError:
                pass  # Best effort: the file may already be gone.
        return

    print(f"Downloaded: {save_path}")
    mapping.append({"file_path": save_path, "label": label})


# Function to process a dataset
def process_dataset(data, folder, max_threads=100):
    """
    Downloads every file referenced by a dataset into ``folder`` and returns
    a DataFrame mapping each saved file path to its label.

    Rows appear in the same order as in ``data`` (rows whose download failed
    are omitted), so repeated runs produce the same mapping. Nothing is
    written to disk here besides the downloaded files; persisting the mapping
    is left to the caller.

    Args:
        data: DataFrame with ``datasheet_link`` and ``target_col`` columns.
        folder: Existing directory the files are saved into.
        max_threads: Maximum number of concurrent download threads.

    Returns:
        DataFrame with columns ``file_path`` and ``label``.
    """
    records = data[["datasheet_link", "target_col"]].to_dict(
        orient="records"
    )  # One dict per row

    def _download_one(record):
        # Each task fills its own list so results can be merged in input
        # order rather than in thread-completion order.
        downloaded = []
        download_pdf_with_label(
            record["datasheet_link"], record["target_col"], folder, downloaded
        )
        return downloaded

    mapping = []  # File path / label mappings, in input order
    with ThreadPoolExecutor(max_threads) as executor:
        futures = [executor.submit(_download_one, record) for record in records]
        for record, future in zip(records, futures):
            try:
                mapping.extend(future.result())
            except Exception as e:
                # Surface worker failures instead of losing them in the pool,
                # but keep processing the remaining rows.
                print(f"Error processing {record['datasheet_link']}: {e}")

    # Explicit columns keep the schema stable even when nothing was downloaded
    return pd.DataFrame(mapping, columns=["file_path", "label"])


# Make sure both target folders exist before any download starts
for split_dir in ("dataset/train", "dataset/test"):
    os.makedirs(split_dir, exist_ok=True)

# Fetch the training PDFs and persist their path/label mapping
train_df = process_dataset(data=train_data, folder="dataset/train")
train_df.to_csv("dataset/train.csv", index=False)
print("Mapping saved as dataset/train.csv")

# Same for the test PDFs
test_df = process_dataset(data=test_data, folder="dataset/test")
test_df.to_csv("dataset/test.csv", index=False)
print("Mapping saved as dataset/test.csv")

# Extracting and preprocessing data from PDF files

In [None]:
# Function to extract text from a PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``, concatenated.

    Any failure while opening or reading the document is printed and an empty
    string is returned instead.
    """
    print("Reading content of:", file_path)
    try:
        with fitz.open(file_path) as document:
            # Pages are joined without a separator, in document order
            return "".join(page.get_text() for page in document)
    except Exception as error:
        print(f"Error extracting text from {file_path}: {error}")
        return ""


def preprocess_text(text):
    """Normalise raw text for vectorisation.

    Lowercases the input, replaces punctuation and digit runs with spaces,
    drops English stop words and returns the remaining tokens joined by
    single spaces.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    without_digits = re.sub(r"\d+", " ", without_punctuation)
    kept_words = (
        word for word in without_digits.split() if word not in ENGLISH_STOP_WORDS
    )
    return " ".join(kept_words)


# Load the path/label mappings written by the download step
train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")
splits = (train_df, test_df)

# Raw text of every PDF, train first and then test
for frame in splits:
    frame["text"] = frame["file_path"].apply(extract_text_from_pdf)

# Cleaned text used for vectorisation
for frame in splits:
    frame["clean_text"] = frame["text"].apply(preprocess_text)

# Persist both splits so the training cell can start from here
train_df.to_pickle("dataset/train_cleaned.pkl")
test_df.to_pickle("dataset/test_cleaned.pkl")
print("Saved cleaned train and test files!")

# Train a model

In [None]:
# Load the cleaned splits written by the preprocessing step
train_cleaned = pd.read_pickle("dataset/train_cleaned.pkl")
test_cleaned = pd.read_pickle("dataset/test_cleaned.pkl")

# Targets
y_train, y_test = train_cleaned["label"], test_cleaned["label"]

# TF-IDF features: fitted on the training split, then applied to the test split
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_cleaned["clean_text"])
X_test = vectorizer.transform(test_cleaned["clean_text"])

# Random Forest with a fixed seed
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Saving Trained Model

In [None]:
os.makedirs("models", exist_ok=True)


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


# Classifier, then vectorizer, then the class labels
_save_pickle(model, "models/product_classifier_model.pkl")
_save_pickle(vectorizer, "models/tfidf_vectorizer.pkl")
_save_pickle(list(model.classes_), "models/labels.pkl")

print("Model, labels and vectorizer saved successfully!")

# Classify PDFs

In [None]:
def classify_pdf(file_path):
    """Predict the label of the PDF at ``file_path``.

    The file's text is extracted and cleaned, vectorised with the
    module-level ``vectorizer`` and passed to the module-level ``model``;
    the single predicted label is returned.
    """
    cleaned = preprocess_text(extract_text_from_pdf(file_path))

    # One-document batch, projected into the training feature space
    feature_matrix = vectorizer.transform([cleaned])

    return model.predict(feature_matrix)[0]


# Example usage (point file_path at a real PDF)
file_path = "dataset/test/some_new_pdf.pdf"
if os.path.isfile(file_path):
    predicted_label = classify_pdf(file_path)
    print(f"Predicted label: {predicted_label}")
else:
    # extract_text_from_pdf() returns "" for unreadable files, so a missing
    # path would otherwise still produce a (meaningless) prediction.
    predicted_label = None
    print(f"File not found, nothing to classify: {file_path}")