In [None]:
import re
import requests
from bs4 import BeautifulSoup
import csv
import urllib3

# Silence urllib3's warnings for the whole session. The requests in this
# notebook are sent with verify=False, which would otherwise emit an
# insecure-request warning on every call.
# NOTE(review): called without a category, so this hides every urllib3 warning,
# not only the TLS one.
urllib3.disable_warnings()

def _extract_category(soup, default="Could not find"):
    """Return the top-level category shown in the page's sidebar, or `default`.

    The lookup walks sidebar cell -> description div -> nested cell -> first
    link, and returns `default` as soon as any of those elements is missing.
    """
    # NOTE(review): the link text is assumed to start with a fixed 29-character
    # prefix that is not part of the category path - confirm against the site.
    prefix_length = 29

    sidebar_cell = soup.find('td', attrs={'valign': 'top', 'style': 'width:200px;'})
    if not sidebar_cell:
        return default

    description_div = sidebar_cell.find('div', class_='c_tp_description')
    if not description_div:
        return default

    # The nested cell can be absent; guard it so a missing category does not
    # raise and cost the caller the whole record.
    inner_cell = description_div.find('td')
    first_link = inner_cell.find('a') if inner_cell else None
    if not first_link:
        return default

    category_path = first_link.text.strip()[prefix_length:]
    # Categories are written as "Top > Sub > ..."; keep only the top level.
    return category_path.split(' > ')[0]


def extract_invention_details(url, timeout=30):
    """Download one technology page and pull out its key fields.

    Parameters
    ----------
    url : str
        Address of the invention page.
    timeout : float, optional
        Seconds to wait for the server (default 30) so that a stalled
        connection cannot hang a long crawl indefinitely.

    Returns
    -------
    dict
        Keys 'title', 'description', 'background' and 'category', all
        whitespace-stripped strings. Fields that cannot be located are '',
        except the category, which falls back to "Could not find".

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept because the notebook relies on it; re-enable verification if the
    # site's certificate chain allows it.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    heading = soup.find('h1')
    title = heading.get_text(strip=True) if heading else ''

    description_section = soup.find('div', class_='c_tp_description')
    # Work on the raw HTML of the section; an absent section yields no matches.
    description_html = str(description_section) if description_section is not None else ''

    # Summary: contents of the first paragraph, up to its end or first line break.
    description_match = re.search(r'<p>(.*?)(?:</p>|<br/>)', description_html, re.DOTALL)
    description = description_match.group(1).strip() if description_match else ''

    # Background: everything after the bold "Background:" label, up to the next
    # bold label or the end of the section.
    background_match = re.search(r'<strong>Background:\s*<\/strong>(.*?)(?=<strong>|$)', description_html, re.DOTALL)
    background = background_match.group(1).strip() if background_match else ''

    # Drop paragraph / line-break tags left inside the captured fragment.
    background = re.sub(r'<(?:br|p|/p|/br|br/)[^>]*>', '', background)

    category = _extract_category(soup)

    return {
        'title': title.strip(),
        'description': description.strip(),
        'background': background.strip(),
        'category': category.strip()
    }

# Smoke test: scrape one known technology page and show each extracted field.
url = 'https://inventions.arizona.edu/tech/New_Sunscreens_Based_on_Particles_Prepared_from_Higher_Aldehydes_and_Phenols'
details = extract_invention_details(url)
print(f"Title: {details['title']}")
print(f"\nDescription: {details['description']}")
print(f"\nBackground: {details['background']}")
print(f"\nCategory: {details['category']}")

In [None]:
def get_urls_on_page(page_num, timeout=30):
    """Return the invention-page URLs listed on one search-results page.

    Parameters
    ----------
    page_num : int
        Zero-based index of the results page (sorted by creation date,
        newest first).
    timeout : float, optional
        Seconds to wait for the server (default 30).

    Returns
    -------
    list of str
        One absolute URL per table row that contains a link; the first link
        of each row is used.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import urljoin

    base_url = "https://inventions.arizona.edu"
    search_url = (
        f"{base_url}/searchresults.aspx?q=&type=&page={page_num}"
        "&sort=datecreated&order=desc"
    )
    # NOTE(security): verify=False disables TLS certificate validation.
    response = requests.get(search_url, verify=False, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    invention_urls = []
    for row in soup.find_all('tr'):
        link = row.find('a')
        if link and 'href' in link.attrs:
            # urljoin resolves site-relative hrefs against the host and leaves
            # absolute ones intact; plain concatenation would mangle both
            # slash-less and absolute hrefs.
            invention_urls.append(urljoin(base_url + "/", link['href']))

    return invention_urls

# Preview: collect the links found on results page 0 and print one per line.
invention_urls = get_urls_on_page(0)
for url in invention_urls:
    print(url)

In [None]:
import pandas as pd
import time

def create_invention_dataset(start_page, end_page, output_path='inventions_dataset1.csv', delay=0.25):
    """Scrape a range of result pages into a DataFrame and save it as CSV.

    Parameters
    ----------
    start_page, end_page : int
        First and last results page to crawl (both inclusive).
    output_path : str, optional
        Where the CSV is written (default 'inventions_dataset1.csv').
    delay : float, optional
        Seconds to pause after each successfully scraped invention, to avoid
        hammering the server (default 0.25).

    Returns
    -------
    pandas.DataFrame
        Columns 'Invention', 'Description', 'Background', 'Category'; one row
        per invention that could be scraped.

    Notes
    -----
    The crawl is best-effort: a results page or invention page that fails is
    reported on stdout and skipped, so one bad request cannot discard the
    records already collected.
    """
    columns = ['Invention', 'Description', 'Background', 'Category']
    all_inventions = []

    for page_num in range(start_page, end_page + 1):
        try:
            urls = get_urls_on_page(page_num)
        except Exception as e:
            # Losing one listing page must not abort the whole crawl.
            print(f"Error fetching page {page_num}: {str(e)}")
            continue

        for url in urls:
            try:
                details = extract_invention_details(url)
                invention_data = {
                    'Invention': details['title'],
                    'Description': details['description'],
                    'Background': details['background'],
                    'Category': details['category']
                }
            except Exception as e:
                print(f"Error processing {url}: {str(e)}")
                continue

            all_inventions.append(invention_data)
            time.sleep(delay)

    # Passing the columns explicitly keeps the schema even when nothing was scraped.
    df = pd.DataFrame(all_inventions, columns=columns)

    df.to_csv(output_path, index=False)
    print(f"Dataset saved to {output_path}")

    return df

# Crawl result pages 0 through 152 (inclusive) and keep the resulting frame.
start_page, end_page = 0, 152
dataset1 = create_invention_dataset(start_page, end_page)

print(dataset1.head())

In [None]:
# Export a second copy of the scraped frame (dataset1, returned by
# create_invention_dataset) encoded as 'utf-8-sig', i.e. UTF-8 with a leading
# byte-order mark, which lets spreadsheet tools detect the encoding.
dataset1.to_csv('inventions_dataset.csv', index=False, encoding='utf-8-sig')

In [None]:
import urllib3

# Same call as at the top of the notebook: silences urllib3 warnings, which the
# verify=False requests below would otherwise trigger on every call.
urllib3.disable_warnings()

def get_titles_on_page(page_num, timeout=30):
    """Return the link texts (invention titles) on one search-results page.

    Parameters
    ----------
    page_num : int
        Zero-based index of the results page.
    timeout : float, optional
        Seconds to wait for the server (default 30).

    Returns
    -------
    list of str
        Text of the first link in every table row that has one.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    """
    url = "https://inventions.arizona.edu/searchresults.aspx?q=&type=&page=" + str(page_num) + "&sort=datecreated&order=desc"
    # NOTE(security): verify=False disables TLS certificate validation.
    response = requests.get(url, verify=False, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    titles = []
    for tr in soup.find_all('tr'):
        a_tag = tr.find('a')
        if a_tag and 'href' in a_tag.attrs:
            title = a_tag.string
            if title is None:
                # .string is None when the link wraps nested markup (or is
                # empty); fall back to the concatenated text so no None ends
                # up in the list.
                title = a_tag.get_text()
            # Store a plain str rather than a node tied to the parse tree.
            titles.append(str(title))

    return titles



In [None]:
# Gather the titles from every results page (0 through 152) into one flat list.
titles = []
for page_num in range(153):
    titles.extend(get_titles_on_page(page_num))
print(titles[:30])

In [None]:
# Persist the scraped titles as a single-column CSV (no index column).
df = pd.DataFrame(data=titles)
df.to_csv(path_or_buf='titles.csv', index=False)

In [None]:
# Load the raw scraped dataset, i.e. the CSV that create_invention_dataset
# writes by default.
file_path = 'inventions_dataset1.csv' 
df = pd.read_csv(file_path)

def clean_text(text):
    """Strip encoding debris and leftover <span> markup from a scraped field.

    Parameters
    ----------
    text : object
        The cell value. Anything that is not a str (for example NaN for a
        missing field) is returned unchanged.

    Returns
    -------
    object
        The cleaned string, or the original value for non-string input.
    """
    if not isinstance(text, str):
        return text

    # Stray 'Â' characters are removed everywhere, so no word-specific
    # variant needs separate handling.
    cleaned = text.replace('Â', '')
    # A run of garbled invisible characters seen in the scraped text.
    cleaned = cleaned.replace('â€‹â€‹â€‹â€‹â€‹â€‹â€', '')

    # Remove span tags, keeping the text they wrap.
    cleaned = cleaned.replace('<span style="font-family:null">', '')
    cleaned = re.sub(r'</?span[^>]*>', '', cleaned)

    # Order matters: the longer sequences must be rewritten before the bare
    # 'â€' catch-all, which turns every remaining occurrence into a quote.
    for garbled, replacement in (('â€™', "'"), ('â€œ', '"'), ('â€', '"')):
        cleaned = cleaned.replace(garbled, replacement)

    return cleaned

# Clean every text column, discard rows that have any missing field, and save.
for column in ('Description', 'Background', 'Invention', 'Category'):
    df[column] = df[column].apply(clean_text)
df = df.dropna()

cleaned_file_path = 'cleaned_inventions_dataset1.csv'
df.to_csv(cleaned_file_path, index=False)

In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+)
# load it from the 'punkt_tab' package, older ones from 'punkt'; fetch both so
# tokenization works whichever version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a word -> vector dictionary.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token (str) to a float32 numpy array of its components.
    """
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. trailing newline padding).
                continue
            word = values[0]
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

# Load the word vectors once for the whole cell. The path is relative, so the
# file has to be in the working directory. Its name suggests 100-dimensional
# vectors, which would match the embedding_dim default of text_to_embedding
# (TODO confirm if a different GloVe file is swapped in).
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')

def text_to_embedding(text, embeddings_index, embedding_dim=100):
    """Represent a text as the mean of its known word vectors.

    Parameters
    ----------
    text : str
        Text to embed; it is lower-cased and tokenized with word_tokenize.
        Non-string input (such as NaN for a missing CSV cell) is treated as
        having no words.
    embeddings_index : dict
        Maps tokens to their vectors.
    embedding_dim : int, optional
        Length of the zero vector returned when no token is known
        (default 100); should equal the length of the stored vectors.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all recognized tokens, or a zero
        vector of length `embedding_dim` when there are none.
    """
    if not isinstance(text, str):
        return np.zeros(embedding_dim)

    tokens = word_tokenize(text.lower())
    vectors = [embeddings_index[token] for token in tokens if token in embeddings_index]
    if not vectors:
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)

# --- Features: one averaged GloVe vector per invention ------------------------
text_columns = ['Invention', 'Description', 'Background']

df = pd.read_csv('cleaned_inventions_dataset1.csv')
# A field that cleaning reduced to an empty string is written as a blank cell
# and read back as NaN. Drop such rows (and rows without a label) up front:
# NaN would propagate through the string concatenation below and end up in the
# features or the target. Resetting the index keeps the remaining rows aligned
# with embedding_df in the column-wise concat further down.
df = df.dropna(subset=text_columns + ['Category']).reset_index(drop=True)

df['text'] = df['Invention'] + " " + df['Description'] + " " + df['Background']
df['embedding'] = df['text'].apply(lambda x: text_to_embedding(x, glove_embeddings))

# Expand each embedding vector into one numeric column per dimension.
embedding_df = pd.DataFrame(df['embedding'].tolist())

df = pd.concat([df.drop(columns=['Invention', 'Description', 'Background', 'text', 'embedding']), embedding_df], axis=1)
X = df.drop(columns=['Category'])
y = df['Category']

# --- Model: random forest on an 80/20 split, fixed seeds for repeatability ----
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# --- Evaluation on the held-out 20% -------------------------------------------
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))