# Install & Import Libraries

In [None]:
pip install pymupdf requests scikit-learn pandas tqdm nltk

In [1]:
import pandas as pd
import numpy as np
import requests
import fitz
from tqdm import tqdm
from collections import Counter
from nltk.stem import PorterStemmer
import re
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")


# Read PDFs

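The `get_text_from_url` function uses the `requests` library to download the PDF at a URL and PyMuPDF (`fitz`) to extract the text of its pages. The 10-second timeout prevents the run from hanging on webpages that never finish loading. When a download or parse fails, the error message is stored in place of the text so that unread PDFs can be identified during data validation.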

In [24]:
def get_text_from_url(url):
    """Download the PDF at ``url`` and return the text of all its pages.

    Parameters
    ----------
    url : str
        Address of the PDF document to fetch.

    Returns
    -------
    str
        The text of every page joined with newlines on success. On failure a
        short error message is returned instead ("The request timed out.",
        "HTTP error occurred: ..." or "An error occurred: ..."); exceptions
        are not propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        pdf_doc = fitz.open(stream=response.content, filetype="pdf")
        try:
            page_texts = [
                pdf_doc.load_page(page_num).get_text()
                for page_num in range(len(pdf_doc))
            ]
        finally:
            # Release the document even when a page fails to load or parse.
            pdf_doc.close()
    except requests.exceptions.Timeout:
        return "The request timed out."
    except requests.exceptions.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except Exception as err:
        # Return only the message (no partially extracted pages) so the
        # result always starts with the error text and can be recognised.
        return f"An error occurred: {err}"
    return "\n".join(page_texts)  # Join all the pages' text into a single string


In [None]:
train_path = r"mle-1-assign-dataset - train_data.csv"
test_path = r"mle-1-assign-dataset - test_data.csv"

In [None]:
train_data = pd.read_csv(train_path)
train_data.head()

In [None]:
test_data = pd.read_csv(test_path)
test_data.head()

In [None]:
# tqdm to display progress bar
tqdm.pandas(desc="Reading PDF from URLs")

train_data['pdf_text'] = train_data['datasheet_link'].progress_apply(get_text_from_url)

In [None]:
# tqdm to display progress bar
tqdm.pandas(desc="Reading PDF from URLs")

test_data['pdf_text'] = test_data['datasheet_link'].progress_apply(get_text_from_url)

In [None]:
train_data.head()

In [None]:
test_data.head()

The text extracted from the web PDFs is saved to CSV so that later stages can reload it instead of downloading and parsing every PDF again.

In [None]:
train_data.to_csv("pdf_extracted_train_data.csv",index=False)

In [None]:
test_data.to_csv("pdf_extracted_test_data.csv",index=False)

# Clean PDF data

In [25]:
def retain_only_alphabets(input_string):
    """Lowercase the text and keep only purely alphabetic words of 4+ letters.

    Every character that is not an ASCII letter or whitespace is removed
    (so "LED-Light" becomes "ledlight"), the result is lowercased, and words
    shorter than four characters are discarded.

    Parameters
    ----------
    input_string : str
        Raw text. Non-string values (None, or the NaN that pandas produces
        for an empty CSV field) are treated as having no text.

    Returns
    -------
    str
        The surviving words joined by single spaces; "" when nothing remains.
    """
    # Missing values have no text to clean; avoid a TypeError from re.sub.
    if not isinstance(input_string, str):
        return ""

    # Retain only alphabets and whitespace, and convert to lowercase
    letters_only = re.sub(r'[^a-zA-Z\s]', '', input_string).lower()

    # Remove words that are less than length 4
    return ' '.join(word for word in letters_only.split() if len(word) >= 4)

In [None]:
train_data = pd.read_csv("pdf_extracted_train_data.csv")

In [None]:
test_data = pd.read_csv("pdf_extracted_test_data.csv")

In [None]:
tqdm.pandas(desc="Cleaning PDF data")

train_data['cleaned_text'] = train_data['pdf_text'].progress_apply(retain_only_alphabets)

In [None]:
tqdm.pandas(desc="Cleaning PDF data")

test_data['cleaned_text'] = test_data['pdf_text'].progress_apply(retain_only_alphabets)

In [None]:
train_data.to_csv("pdf_cleaned_train_data.csv",index=False)

In [None]:
test_data.to_csv("pdf_cleaned_test_data.csv",index=False)

# Remove unread PDFs

In [26]:
# Openings of cleaned text that identify a PDF that could not be read
error_prefixes = ("request timed", "http error occurred", "error occurred")


def starts_with_any_prefix(pdf):
    """Return True when `pdf` begins with one of the `error_prefixes`."""
    # str.startswith accepts a tuple and succeeds if any member is a prefix
    return pdf.startswith(error_prefixes)

def remove_unread_pdfs(df, prefixes):
    """Drop rows whose PDF text is missing or is an error message.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'cleaned_text' column.
    prefixes : str or iterable of str
        Leading text that marks a row as an error message rather than real
        PDF content. A row is dropped when its 'cleaned_text' starts with
        any of them.

    Returns
    -------
    pandas.DataFrame
        A new frame without the missing/error rows, re-indexed from 0.
        The input frame is not modified.
    """
    # A bare string would otherwise be split into single-character prefixes.
    if isinstance(prefixes, str):
        prefixes = (prefixes,)
    prefixes = tuple(prefixes)

    text = df['cleaned_text']

    # Rows with no text at all (e.g. empty CSV fields read back as NaN)
    is_missing = text.isna()

    # Rows whose text begins with one of the given error prefixes;
    # non-string cells can never match and are left alone.
    is_error = text.apply(
        lambda doc: isinstance(doc, str) and doc.startswith(prefixes)
    ).astype(bool)

    return df[~is_missing & ~is_error].reset_index(drop=True)


In [None]:
train_data = pd.read_csv("pdf_cleaned_train_data.csv")

In [None]:
test_data = pd.read_csv("pdf_cleaned_test_data.csv")

In [None]:
train_data = remove_unread_pdfs(train_data, error_prefixes)

In [None]:
test_data = remove_unread_pdfs(test_data, error_prefixes)

In [None]:
len(train_data),len(test_data)

In [None]:
train_data.to_csv("preprocessed_train_data.csv",index=False)

In [None]:
test_data.to_csv("preprocessed_test_data.csv",index=False)

# Predict labels

### Brute Force Approach

In [2]:
stemmer = PorterStemmer()

# Predefined target root words (stemmed form) and the labels they map to.
# NOTE(review): the stemmer presumably reduces "others" to "other", in which
# case the 'others' entry never matches a stemmed word and "others" is only
# ever produced by the fallback below — confirm before changing.
trgts = ['light', 'fuse', 'cabl', 'others', 'lamp']
replacements = {
    'light': 'lighting',
    'cabl': 'cable',
    'fuse': 'fuses',
    'lamp': 'lighting'
}

def get_predictions(doc):
    """Predict a label for `doc` from the most frequent target root word.

    The document is split on whitespace, each word is stemmed, and the
    occurrences of the stems listed in `trgts` are counted. The most frequent
    target stem is translated to its label through `replacements`; when no
    target stem occurs at all the label is "others". On equal counts the
    stem listed first in `trgts` wins.

    Parameters
    ----------
    doc : str
        Cleaned, whitespace-separated text.

    Returns
    -------
    str
        The predicted label.
    """
    # Stem every distinct word once and weight it by its number of
    # occurrences; this yields the same counts as stemming each token.
    freq = Counter()
    for word, count in Counter(doc.split()).items():
        freq[stemmer.stem(word)] += count

    # Keep only the target stems, inserted in `trgts` order so that ties are
    # broken by that order.
    filtered_freq = Counter({
        word: freq[word] for word in trgts if word in freq
    })

    prediction = filtered_freq.most_common(1)[0][0] if filtered_freq else "others"

    return replacements.get(prediction, prediction)

In [3]:
train_data = pd.read_csv("preprocessed_train_data.csv")

In [4]:
test_data = pd.read_csv("preprocessed_test_data.csv")

In [5]:
tqdm.pandas(desc="Predicting labels")

train_data['preds'] = train_data['cleaned_text'].progress_apply(get_predictions)

Predicting labels: 100%|██████████| 982/982 [00:11<00:00, 82.45it/s] 


In [6]:
tqdm.pandas(desc="Predicting labels")

test_data['preds'] = test_data['cleaned_text'].progress_apply(get_predictions)

Predicting labels: 100%|██████████| 250/250 [00:04<00:00, 52.10it/s] 


In [7]:
set(train_data['preds'])

{'cable', 'fuses', 'lighting', 'others'}

In [None]:
# index=False keeps the row index out of the file, like the other saved CSVs
train_data.to_csv("predicted_train_data.csv", index=False)

In [None]:
# index=False keeps the row index out of the file, like the other saved CSVs
test_data.to_csv("predicted_test_data.csv", index=False)

In [8]:
def get_false_preds(df):
    """Return the rows of `df` where 'preds' differs from 'target_col'.

    The result is a new frame re-indexed from 0; `df` itself is unchanged.
    """
    is_wrong = df['preds'].ne(df['target_col'])
    return df.loc[is_wrong].reset_index(drop=True)

In [9]:
chk_df1 = get_false_preds(train_data)

In [10]:
chk_df2 = get_false_preds(test_data)

In [11]:
len(chk_df1), len(chk_df2)

(91, 19)

# Evaluation

In [12]:
def construct_confusion_matrix(df, pred_col='preds', target_col='target_col'):
    """Build a labelled confusion matrix and a classification report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictions and the true labels.
    pred_col : str, default 'preds'
        Name of the column with predicted labels.
    target_col : str, default 'target_col'
        Name of the column with true labels.

    Returns
    -------
    tuple
        ``(cm_df, cr)`` where ``cm_df`` is the confusion matrix as a
        DataFrame (rows: true label, columns: predicted label) and ``cr`` is
        the text returned by ``classification_report``.
    """
    # Extract predictions and targets from the DataFrame
    y_true = df[target_col]
    y_pred = df[pred_col]

    # Use one sorted label set (every class seen in either column) for both
    # axes, so the frame always matches the matrix shape even when a class is
    # never predicted or never occurs in the targets.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))

    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Create a DataFrame for better visualization
    cm_df = pd.DataFrame(cm, index=labels, columns=labels)

    cr = classification_report(y_true, y_pred)

    return cm_df, cr

In [13]:
conf_matrix_df,cr = construct_confusion_matrix(train_data)
print(conf_matrix_df)
print(cr)

          cable  fuses  lighting  others
cable       102      0         5       0
fuses         0    464         0       0
lighting     24      2       301       8
others        1      1        50      24
              precision    recall  f1-score   support

       cable       0.80      0.95      0.87       107
       fuses       0.99      1.00      1.00       464
    lighting       0.85      0.90      0.87       335
      others       0.75      0.32      0.44        76

    accuracy                           0.91       982
   macro avg       0.85      0.79      0.80       982
weighted avg       0.90      0.91      0.90       982



In [14]:
conf_matrix_df,cr = construct_confusion_matrix(test_data)
print(conf_matrix_df)
print(cr)

          cable  fuses  lighting  others
cable        55      0         5       0
fuses         0     49         0       0
lighting      9      0        64       0
others        4      1         0      63
              precision    recall  f1-score   support

       cable       0.81      0.92      0.86        60
       fuses       0.98      1.00      0.99        49
    lighting       0.93      0.88      0.90        73
      others       1.00      0.93      0.96        68

    accuracy                           0.92       250
   macro avg       0.93      0.93      0.93       250
weighted avg       0.93      0.92      0.93       250



# Machine Learning models  

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [16]:
X_train = train_data['cleaned_text'].tolist()
y_train = train_data['target_col'].tolist()
X_test = test_data['cleaned_text'].tolist()
y_test = test_data['target_col'].tolist()

In [17]:
# Fit the TF-IDF vectorizer on the training text only, then apply the
# already-fitted vectorizer to the test text so both share one feature space.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [18]:
(X_train_tfidf).shape, (X_test_tfidf).shape

((982, 11266), (250, 11266))

In [19]:
# Train model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

y_pred_proba_rf = rf_model.predict_proba(X_test_tfidf) 

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Label both axes with every class seen in the truth or the predictions so the
# frame always matches the matrix shape (rows: true label, columns: predicted).
labels = np.unique(np.concatenate([y_test, y_pred_rf]))
cm = confusion_matrix(y_test, y_pred_rf, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

Random Forest Classification Report:
              precision    recall  f1-score   support

       cable       1.00      0.95      0.97        60
       fuses       1.00      1.00      1.00        49
    lighting       0.51      1.00      0.67        73
      others       0.00      0.00      0.00        68

    accuracy                           0.72       250
   macro avg       0.63      0.74      0.66       250
weighted avg       0.58      0.72      0.63       250



Unnamed: 0,cable,fuses,lighting,others
cable,57,0,3,0
fuses,0,49,0,0
lighting,0,0,73,0
others,0,0,68,0


In [20]:
# Train model
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train_tfidf, y_train)

y_pred_proba_knn = knn_model.predict_proba(X_test_tfidf) 

# Predict and evaluate
y_pred_knn = knn_model.predict(X_test_tfidf)
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

# Label both axes with every class seen in the truth or the predictions so the
# frame always matches the matrix shape (rows: true label, columns: predicted).
labels = np.unique(np.concatenate([y_test, y_pred_knn]))
cm = confusion_matrix(y_test, y_pred_knn, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df

KNN Classification Report:
              precision    recall  f1-score   support

       cable       0.72      1.00      0.84        60
       fuses       0.72      1.00      0.84        49
    lighting       0.79      1.00      0.88        73
      others       1.00      0.10      0.19        68

    accuracy                           0.76       250
   macro avg       0.81      0.78      0.69       250
weighted avg       0.82      0.76      0.67       250



Unnamed: 0,cable,fuses,lighting,others
cable,60,0,0,0
fuses,0,49,0,0
lighting,0,0,73,0
others,23,19,19,7


In [21]:
# Train model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train_tfidf, y_train)

y_pred_proba_gb = gb_model.predict_proba(X_test_tfidf) 

# Predict and evaluate
y_pred_gb = gb_model.predict(X_test_tfidf)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Label both axes with every class seen in the truth or the predictions so the
# frame always matches the matrix shape (rows: true label, columns: predicted).
labels = np.unique(np.concatenate([y_test, y_pred_gb]))
cm = confusion_matrix(y_test, y_pred_gb, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
cm_df


Gradient Boosting Classification Report:
              precision    recall  f1-score   support

       cable       1.00      0.98      0.99        60
       fuses       1.00      1.00      1.00        49
    lighting       0.54      1.00      0.71        73
      others       1.00      0.12      0.21        68

    accuracy                           0.76       250
   macro avg       0.89      0.78      0.73       250
weighted avg       0.87      0.76      0.70       250



Unnamed: 0,cable,fuses,lighting,others
cable,59,0,1,0
fuses,0,49,0,0
lighting,0,0,73,0
others,0,0,60,8


# Model Pipeline

In [27]:
def get_label_from_url(url):
    """Download the PDF at `url`, clean its text and predict its label.

    Parameters
    ----------
    url : str
        Address of the PDF datasheet.

    Returns
    -------
    str
        The label from `get_predictions`, or "Error in reading PDF." when the
        PDF could not be read or yielded no usable text.
    """
    pdf_text = get_text_from_url(url)
    clean_text = retain_only_alphabets(pdf_text)

    # Treat empty text as unreadable too: `remove_unread_pdfs` drops rows with
    # missing text from the datasets, so an empty document should not be
    # silently labelled by the keyword fallback.
    unreadable = (
        not isinstance(clean_text, str)
        or not clean_text
        or starts_with_any_prefix(clean_text)
    )
    if unreadable:
        return "Error in reading PDF."

    return get_predictions(clean_text)
        
        

In [28]:
url = 'https://lfillumination.com/files/specsheets/EF408B-Light-Unit.pdf'
get_label_from_url(url)

'lighting'