## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fitz
import requests
from io import BytesIO

## Data loading

In [2]:
# Load the training sheet from the working directory; later cells read its 'datasheet_link' column.
df = pd.read_excel('train_data.xlsx')

In [55]:
# (rows, columns) of df.
# NOTE(review): this cell's recorded execution count (55) is out of sequence with its
# neighbours, so the displayed shape probably reflects df after 'extracted_text' was added — confirm on a fresh run.
df.shape

(1895, 3)

In [3]:
# Preview the first rows of the freshly loaded training frame.
df.head()

Unnamed: 0,datasheet_link,target_col
0,https://lfillumination.com/files/specsheets/EF...,lighting
1,https://lfillumination.com/files/specsheets/EF...,lighting
2,https://lfillumination.com/files/specsheets/EF...,lighting
3,https://www.waclighting.com/storage/waclightin...,lighting
4,https://www.acuitybrands.com/api/products/geta...,lighting


In [None]:
# NOTE(review): unexecuted repeat of the df.head() preview in the previous cell — candidate for removal.
df.head()

In [5]:
def text_from_url(df):
    """Download a datasheet PDF listed in ``df`` and return its full text.

    Rows are visited in order. For each row the URL in the ``datasheet_link``
    column is fetched with ``requests.get`` and the response body is opened
    in memory with PyMuPDF. The text of every page is concatenated, and the
    result is returned for the first document that has at least one page.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datasheet_link`` column holding PDF URLs.

    Returns
    -------
    str or None
        Concatenated text of all pages of the first document that has pages,
        or ``None`` when ``df`` has no rows (or no document has any page).

    Raises
    ------
    Exception
        If a download responds with a status code other than 200.
    """
    for _, row in df.iterrows():
        url = row['datasheet_link']
        response = requests.get(url)
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF. Status code: {response.status_code}")

        document = fitz.open(stream=BytesIO(response.content), filetype='pdf')
        try:
            page_count = len(document)
            # Gather every page before returning; returning from inside the
            # page loop would hand back only the first page of the document.
            text = "".join(
                document.load_page(page).get_text() for page in range(page_count)
            )
        finally:
            # Release the PyMuPDF document even if text extraction fails.
            document.close()

        # A document without pages yields nothing; move on to the next row.
        if page_count:
            return text

In [14]:
# Preview df once more; this repeats the df.head() preview from the data-loading section.
df.head()

Unnamed: 0,datasheet_link,target_col
0,https://lfillumination.com/files/specsheets/EF...,lighting
1,https://lfillumination.com/files/specsheets/EF...,lighting
2,https://lfillumination.com/files/specsheets/EF...,lighting
3,https://www.waclighting.com/storage/waclightin...,lighting
4,https://www.acuitybrands.com/api/products/geta...,lighting


## Data extraction from URL

In [15]:
import pandas as pd
import fitz  # PyMuPDF
import requests
from io import BytesIO
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Function to make requests with retry logic
def make_request_with_retries(url, retries=3, backoff_factor=0.3, timeout=10):
    """GET ``url`` through a session that retries transient failures.

    Parameters
    ----------
    url : str
        Address to fetch.
    retries : int, default 3
        Retry budget, applied to the ``total``, ``read`` and ``connect``
        counters of the ``Retry`` policy.
    backoff_factor : float, default 0.3
        Backoff factor handed to ``Retry``.
    timeout : float, default 10
        Timeout passed to ``session.get``.

    Returns
    -------
    requests.Response
        The response as returned by ``session.get``. The status code is not
        checked here, so callers decide how to treat error statuses.
    """
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        # Only these server-side statuses trigger a retry.
        status_forcelist=[500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)

    # Use the session as a context manager so its connection pool is closed
    # after every call instead of lingering until garbage collection.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session.get(url, timeout=timeout)

# Initialize the list to store extracted text.
# It receives exactly one entry per row of df, in row order; "" marks a row whose
# download failed or whose PDF could not be opened, so the list stays aligned with df.
extracted_text = []

# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    url = row['datasheet_link']

    try:
        response = make_request_with_retries(url)
        response.raise_for_status()  # Ensure we raise an error for bad status codes

        pdf_data = BytesIO(response.content)

        # Try to open the PDF document
        try:
            document = fitz.open(stream=pdf_data, filetype='pdf')
        except Exception as e:
            print(f"Error opening PDF for URL: {url}. Error: {e}")
            extracted_text.append("")  # Append an empty string if there's an error
            continue

        # Extract text from all pages of the PDF, then always release the
        # document so open PDFs do not accumulate over the whole loop.
        try:
            text = "".join(
                document.load_page(page).get_text() for page in range(len(document))
            )
        finally:
            document.close()

        # Append the extracted text to the list
        extracted_text.append(text)

    except requests.exceptions.RequestException as e:
        print(f"Failed to download PDF from {url}. Error: {e}")
        extracted_text.append("")  # Append an empty string if the download fails

# Add the extracted text as a new column in the DataFrame
df['extracted_text'] = extracted_text

# Display the first few rows of the DataFrame
print(df.head())


Failed to download PDF from https://lfillumination.com/files/specsheets/EF411B.PDF. Error: 404 Client Error: Not Found for url: https://lfillumination.com/files/specsheets/EF411B.PDF
Failed to download PDF from https://www.acuitybrands.com/api/products/getasset/holophane/1649332/4408a251-09be-4b61-813f-177b3ebcf10c/holophane-care222-hldmpc-cylinder-pendant-with-cord.pdf?abl_version=01%2f12%2f2023+16%3a54%3a24&DOC_Type=SPEC_SHEET. Error: 404 Client Error: Not Found for url: https://www.acuitybrands.com/api/products/getasset/holophane/1649332/4408a251-09be-4b61-813f-177b3ebcf10c/holophane-care222-hldmpc-cylinder-pendant-with-cord.pdf?abl_version=01%2F12%2F2023+16%3A54%3A24&DOC_Type=SPEC_SHEET
Failed to download PDF from https://beghelliusa.com/wp-content/uploads/2022/05/testa_spec.pdf. Error: 404 Client Error: Not Found for url: https://beghelliusa.com/wp-content/uploads/2023/08/testa_spec.pdf
Error opening PDF for URL: https://www.acuitybrands.com/api/products/getasset/aculux/655780/0f1

In [16]:
# Preview df, which now carries the 'extracted_text' column filled by the extraction loop.
df.head()

Unnamed: 0,datasheet_link,target_col,extracted_text
0,https://lfillumination.com/files/specsheets/EF...,lighting,EF400 System # EF408B\nDIE CAST CYLINDRICAL LI...
1,https://lfillumination.com/files/specsheets/EF...,lighting,
2,https://lfillumination.com/files/specsheets/EF...,lighting,EF400 System # EF407B\nDIE CAST CYLINDRICAL LI...
3,https://www.waclighting.com/storage/waclightin...,lighting,ADJUSTABLE BEAM WALL WASH 12V\n5221\nORDERING ...
4,https://www.acuitybrands.com/api/products/geta...,lighting,Type:\nProject:\nHDMC\nSurface Mount with FAR-...


In [17]:
# Column dtypes and non-null counts. Failed rows store "" rather than NaN in
# 'extracted_text', so they still count as non-null here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1895 entries, 0 to 1894
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   datasheet_link  1895 non-null   object
 1   target_col      1895 non-null   object
 2   extracted_text  1895 non-null   object
dtypes: object(3)
memory usage: 44.5+ KB


In [18]:
# Inspect the extracted text; empty strings mark rows whose download or PDF open failed.
df['extracted_text']

0       EF400 System # EF408B\nDIE CAST CYLINDRICAL LI...
1                                                        
2       EF400 System # EF407B\nDIE CAST CYLINDRICAL LI...
3       ADJUSTABLE BEAM WALL WASH 12V\n5221\nORDERING ...
4       Type:\nProject:\nHDMC\nSurface Mount with FAR-...
                              ...                        
1890                                                     
1891                                                     
1892                                                     
1893                                                     
1894                                                     
Name: extracted_text, Length: 1895, dtype: object

In [19]:
# Write df (including 'extracted_text') to CSV in the working directory, without the index column.
df.to_csv('extracted_data.csv', index=False)

In [20]:
# Reload the CSV written by the previous cell.
# NOTE(review): read_csv parses empty fields as NaN by default, so rows saved with an
# empty 'extracted_text' presumably come back as NaN rather than "" — confirm before filtering.
data = pd.read_csv('extracted_data.csv')

In [21]:
# Preview the frame reloaded from 'extracted_data.csv'.
data.head()

Unnamed: 0,datasheet_link,target_col,extracted_text
0,https://lfillumination.com/files/specsheets/EF...,lighting,EF400 System # EF408B\nDIE CAST CYLINDRICAL LI...
1,https://lfillumination.com/files/specsheets/EF...,lighting,
2,https://lfillumination.com/files/specsheets/EF...,lighting,EF400 System # EF407B\nDIE CAST CYLINDRICAL LI...
3,https://www.waclighting.com/storage/waclightin...,lighting,ADJUSTABLE BEAM WALL WASH 12V\n5221\nORDERING ...
4,https://www.acuitybrands.com/api/products/geta...,lighting,Type:\nProject:\nHDMC\nSurface Mount with FAR-...


In [46]:
# Function to make requests with retry logic
# NOTE(review): this redefines the function of the same name from the earlier
# train-set extraction cell; the later definition shadows the earlier one.
def make_request_with_retries(url, retries=3, backoff_factor=0.3, timeout=10):
    """GET ``url`` through a session that retries transient failures.

    Parameters
    ----------
    url : str
        Address to fetch.
    retries : int, default 3
        Retry budget, applied to the ``total``, ``read`` and ``connect``
        counters of the ``Retry`` policy.
    backoff_factor : float, default 0.3
        Backoff factor handed to ``Retry``.
    timeout : float, default 10
        Timeout passed to ``session.get``.

    Returns
    -------
    requests.Response
        The response as returned by ``session.get``. The status code is not
        checked here, so callers decide how to treat error statuses.
    """
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        # Only these server-side statuses trigger a retry.
        status_forcelist=[500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry)

    # Use the session as a context manager so its connection pool is closed
    # after every call instead of lingering until garbage collection.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session.get(url, timeout=timeout)

# Initialize the list to store extracted text.
# It receives exactly one entry per row of test_df, in row order; "" marks a row whose
# download failed or whose PDF could not be opened, so the list stays aligned with test_df.
extracted_text_test = []

# Iterate over each row in the DataFrame
for index, row in test_df.iterrows():
    url = row['datasheet_link']

    try:
        response = make_request_with_retries(url)
        response.raise_for_status()  # Ensure we raise an error for bad status codes

        pdf_data = BytesIO(response.content)

        # Try to open the PDF document
        try:
            document = fitz.open(stream=pdf_data, filetype='pdf')
        except Exception as e:
            print(f"Error opening PDF for URL: {url}. Error: {e}")
            extracted_text_test.append("")  # Append an empty string if there's an error
            continue

        # Extract text from all pages of the PDF, then always release the
        # document so open PDFs do not accumulate over the whole loop.
        try:
            text = "".join(
                document.load_page(page).get_text() for page in range(len(document))
            )
        finally:
            document.close()

        # Append the extracted text to the list
        extracted_text_test.append(text)

    except requests.exceptions.RequestException as e:
        print(f"Failed to download PDF from {url}. Error: {e}")
        extracted_text_test.append("")  # Append an empty string if the download fails

# Add the extracted text as a new column in the DataFrame
test_df['extracted_text'] = extracted_text_test

# Display the first few rows of the frame this cell just filled (test_df, not the training df)
print(test_df.head())


Failed to download PDF from https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SQUARE-SQ405-JUL2022-REV6.pdf. Error: 404 Client Error: Not Found for url: https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SQUARE-SQ405-JUL2022-REV6.pdf
Failed to download PDF from https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SQUARE-SQ510-JUL2022-REV6.pdf. Error: 404 Client Error: Not Found for url: https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SQUARE-SQ510-JUL2022-REV6.pdf
Failed to download PDF from https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SY305-JUL2022-REV6.pdf. Error: 404 Client Error: Not Found for url: https://www.luminis.com/assets/Uploads/SPECS-SYRIOS-SY305-JUL2022-REV6.pdf
Failed to download PDF from https://www.luminis.com/assets/Uploads/SPECS-SCOPO-SC350-JUL2022-REV4.pdf. Error: 404 Client Error: Not Found for url: https://www.luminis.com/assets/Uploads/SPECS-SCOPO-SC350-JUL2022-REV4.pdf
Failed to download PDF from https://www.luminis.com/assets/Uploads/SPECS-LUMISTIK-

In [47]:
# Preview test_df with its newly added 'extracted_text' column.
test_df.head()

Unnamed: 0,datasheet_link,target_col,extracted_text
0,https://lumenart.com/images/alume/awl-01_specs...,lighting,AWL.01\nSPECIFICATIONS\nMaterial\nMachined alu...
1,https://lumenart.com/images/fabric/rdc/rdc_spe...,lighting,RDC Series\nSPECIFICATIONS\nConstruction\nFabr...
2,https://lumenart.com/images/fabric/cyp/cyp_spe...,lighting,CYP Series\nSPECIFICATIONS\nConstruction\nFabr...
3,https://lumenart.com/images/designer/wlp_specs...,lighting,WLP\nSPECIFICATIONS\nConstruction\nExtruded al...
4,https://lumenart.com/images/designer/wcp/wcp-s...,lighting,WCP-S\nSPECIFICATIONS\nConstruction\nReal oak ...


In [48]:
# Column dtypes and non-null counts. Failed rows store "" rather than NaN in
# 'extracted_text', so they still count as non-null here.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   datasheet_link  400 non-null    object
 1   target_col      400 non-null    object
 2   extracted_text  400 non-null    object
dtypes: object(3)
memory usage: 9.5+ KB


In [None]:
# Inspect the extracted text for the test set. The column is named 'extracted_text';
# the previous key 'extracted_test' does not exist on test_df and raises KeyError.
test_df['extracted_text']

In [54]:
# Write test_df (including 'extracted_text') to CSV in the working directory, without the index column.
test_df.to_csv('extracted_test_data.csv', index=False)