# Importing Libraries

In [3]:
import fitz  # PyMuPDF
import re
import spacy
import pandas as pd

# NLP Model

In [4]:
# Load the "en_core_web_sm" spaCy pipeline once at module level; the shared
# `nlp` object is what extract_polymer_properties calls to tokenise text.
nlp = spacy.load("en_core_web_sm") # spaCy model

# Extracting text from a PDF

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        str: The text of each page, joined in page order with no separator
        ("" when the document has no pages).
    """
    # Open through the context manager so the document handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with += for every page.
        return "".join(
            doc.load_page(page_num).get_text()
            for page_num in range(doc.page_count)
        )

# PDF paths

In [None]:
# Absolute paths of the thirteen input papers: paper_1.pdf through paper_13.pdf.
pdf_paths = [
    f"/Users/amaanqureshi/Desktop/TB2/MSc Project/paper_{paper_no}.pdf"
    for paper_no in range(1, 14)
]

# Regex Function for polymers

In [6]:
def identify_potential_polymer_names(text):
    """Find every substring of ``text`` that looks like a polymer name.

    Matching is case-insensitive. It covers generic words starting with or
    containing "poly", copolymer spellings, and a fixed list of named natural
    and synthetic polymers.

    Args:
        text: The string to scan.

    Returns:
        list[str]: All matches in order of appearance, duplicates included,
        spelled exactly as they occur in ``text``.
    """
    alternatives = (
        r'poly[a-zA-Z]*\b',                  # synthetic polymers starting with 'poly'
        r'[a-zA-Z]+poly[a-zA-Z]*\b',         # words containing 'poly'
        r'[a-zA-Z]*co[ -]?poly[a-zA-Z]*\b',  # copolymers
        r'polysaccharide\b',                 # natural polymers
        r'cellulose\b',
        r'starch\b',
        r'glycogen\b',
        r'polyethylene\b',                   # specific synthetic polymers
        r'polypropylene\b',
        r'polystyrene\b',
        r'silicone\b',
        r'nylon\b',
        r'polyester\b',
        r'acrylic\b',
        r'aramid\b',
        r'kevlar\b',
        r'epoxy\b',
        r'phenolic resin\b',
        r'melamine\b',
    )
    # Every alternative sits inside one non-capturing group that must start
    # on a word boundary, so findall returns whole matches.
    name_regex = re.compile(
        r'\b(?:' + '|'.join(alternatives) + r')',
        re.IGNORECASE,
    )
    return name_regex.findall(text)

# Properties Extraction

In [9]:
def identify_properties_and_values(text):
    """Collect property keywords and numeric values-with-units from ``text``.

    Both searches are case-insensitive. Property keywords come from a fixed
    list (melting point, density, tensile strength, ...). Values are an
    unsigned number followed by one of the recognised units.

    Args:
        text: The string to scan.

    Returns:
        list[tuple[str, str]]: ``(property, value)`` pairs built by zipping
        the property matches with the value matches in order of appearance.
        The shorter of the two lists determines how many pairs are returned.

    NOTE(review): pairing is purely positional (first property with first
    value, and so on), not by proximity in the text, so a value that precedes
    its property, or an unrelated value, can shift the pairs.
    """
    property_pattern = re.compile(
        r'\b(?:melting point|boiling point|density|tensile strength|elastic modulus|thermal conductivity|glass transition temperature|hardness|viscosity|permeability|degradation temperature|solubility)\b',
        re.IGNORECASE
    )
    # The trailing \b applies only to units that end in a word character.
    # '%' is a non-word character, so a trailing \b after it would require a
    # letter or digit to follow and would reject "50% ", "50%." and "50%" at
    # the end of the text.
    value_pattern = re.compile(
        r'\b\d+\.?\d*\s*(?:(?:°C|°F|K|g/cm³|MPa|GPa|W/mK|Pa·s|m/s|mmHg)\b|%)',
        re.IGNORECASE
    )
    properties = property_pattern.findall(text)
    values = value_pattern.findall(text)
    return list(zip(properties, values))

def extract_polymer_properties(text):
    """Map each polymer name found in ``text`` to nearby property/value pairs.

    Candidate names come from ``identify_potential_polymer_names``. The text
    is then tokenised with the module-level spaCy pipeline ``nlp``. For every
    token whose text is one of the candidates, the slice of ``text`` from 100
    characters before ``token.idx`` to 100 characters after it is searched
    with ``identify_properties_and_values``.

    Args:
        text: Full text of one document.

    Returns:
        dict[str, list[tuple[str, str]]]: Polymer name -> accumulated
        ``(property, value)`` pairs over all of its occurrences. A name that
        occurs with no pairs nearby maps to an empty list.

    NOTE(review): only single tokens are compared, so candidates spanning
    several spaCy tokens (e.g. "phenolic resin") presumably never match.
    Confirm against the tokenizer's output.
    """
    # The regex returns a list with one entry per occurrence; a set makes the
    # per-token membership test O(1) instead of a linear scan of that list.
    potential_names = set(identify_potential_polymer_names(text))
    polymer_data = {}

    doc = nlp(text)

    for token in doc:
        if token.text not in potential_names:
            continue

        polymer_name = token.text.strip()
        # Context window, clamped to the bounds of the text.
        window_start = max(0, token.idx - 100)
        window_end = min(len(text), token.idx + 100)
        properties_and_values = identify_properties_and_values(
            text[window_start:window_end]
        )

        # Create the entry on first sight, then keep appending for repeats.
        polymer_data.setdefault(polymer_name, []).extend(properties_and_values)

    return polymer_data

def convert_to_dataframe(polymer_data):
    """Flatten a polymer -> [(property, value), ...] mapping into a table.

    Args:
        polymer_data: Mapping of polymer name to a list of
            ``(property, value)`` pairs (possibly empty).

    Returns:
        pandas.DataFrame: Columns "Polymer", "Property", "Value", with one
        row per pair. A polymer with no pairs gets a single row with empty
        strings in "Property" and "Value". The generic names "polymer",
        "polym" and "polymers" (any casing) are skipped.
    """
    generic_terms = {"polymer", "polym", "polymers"}
    table_rows = []
    for name, pairs in polymer_data.items():
        if name.lower() in generic_terms:
            continue  # too generic to be a real polymer name
        if not pairs:
            # Keep the polymer visible even though nothing was found for it.
            table_rows.append([name, "", ""])
            continue
        table_rows.extend([name, prop, value] for prop, value in pairs)
    return pd.DataFrame(table_rows, columns=["Polymer", "Property", "Value"])


# Extract and process polymer properties from all PDFs.
# Each PDF in pdf_paths contributes one DataFrame to all_data, in list order.
all_data = []
for pdf_path in pdf_paths:
    # Raw text of the whole document, all pages concatenated.
    text = extract_text_from_pdf(pdf_path)
    # Mapping of polymer name -> list of (property, value) pairs.
    polymer_properties = extract_polymer_properties(text)
    # Flatten the mapping to Polymer/Property/Value rows for this PDF.
    all_data.append(convert_to_dataframe(polymer_properties))

# Saving to CSV

In [None]:
# Combine the per-PDF DataFrames into a single DataFrame and save it to CSV.
# ignore_index=True renumbers the rows 0..n-1 across the concatenated frames.
final_df = pd.concat(all_data, ignore_index=True)
# index=False leaves the row index out of the file.
final_df.to_csv('Extracted_Polymer_Properties_Table.csv', index=False)

# Print confirmation
print("Polymer properties extracted and saved to 'Extracted_Polymer_Properties_Table.csv'")
