In [16]:
import pandas as pd
import numpy as np
from lxml import html

# Load a 500-row sample of the arXiv dataset.
loaded_df = pd.read_csv('Full_arXiv_updated_data.csv', nrows=500)

# Flag rows whose 'pdf_text' starts with 'error' (presumably a failed PDF
# extraction -- confirm against the scraper). na=False makes rows with a
# missing 'pdf_text' count as "not an error" instead of putting NaN into the
# boolean mask, which would make the indexing below raise.
error_mask = loaded_df['pdf_text'].str.startswith('error', na=False)
dropped_rows = loaded_df[error_mask]
print(f"Number of dropped rows: {len(dropped_rows)}")

# Actually remove the flagged rows (previously they were only counted), and
# discard the 'Unnamed: N' columns left over from a saved index.
# The index is reset so positional and label-based access stay aligned.
loaded_df = loaded_df.loc[
    ~error_mask, ~loaded_df.columns.str.contains('^Unnamed')
].reset_index(drop=True)

Number of dropped rows: 4


In [17]:
# Load the saved category taxonomy page as raw bytes (not decoded text)
# and hand those bytes straight to lxml for parsing.
with open('category_taxonomy', mode='rb') as taxonomy_file:
    html_content = taxonomy_file.read()
tree = html.fromstring(html_content)

# Each taxonomy entry lives in a <div> whose class attribute contains
# "columns divided"; collect all of them for extraction below.
subjects_divs = tree.xpath('//div[contains(@class, "columns divided")]')

# Build one record per taxonomy <div>:
#   'Subject Code' - first whitespace-separated token of the <h4> text
#   'Category'     - <h4><span> text with surrounding parentheses stripped
#   'Description'  - first <p> text inside a "column" <div>
# Any field that cannot be extracted is recorded as the placeholder 'N/A'.
data = []
for div in subjects_divs:
    subject_code = div.xpath('.//h4/text()')
    category = div.xpath('.//h4/span/text()')
    description = div.xpath('.//div[contains(@class, "column")]/p/text()')

    # The first <h4> text node can be empty or whitespace-only, in which case
    # split() yields no tokens; treat that as missing rather than raising
    # IndexError on [0].
    code_tokens = subject_code[0].split() if subject_code else []

    data.append({
        'Subject Code': code_tokens[0] if code_tokens else 'N/A',
        'Category': category[0].strip('()') if category else 'N/A',
        'Description': description[0] if description else 'N/A',
    })

# Explicit columns keep the frame well-formed even if no <div> matched.
category_df = pd.DataFrame(
    data, columns=['Subject Code', 'Category', 'Description']
)

# True for every row in which at least one field is the 'N/A' placeholder.
mask = category_df.eq('N/A').any(axis=1)

# Keep only complete rows and renumber them from 0.
category_df = category_df.loc[~mask].reset_index(drop=True)

# Hand-written entries for top-level subject codes, added on top of the
# scraped taxonomy rows. Descriptions are intentionally left empty.
# NOTE(review): 'hep' and 'nucl' do not look like complete arXiv archive
# codes (archives are e.g. 'hep-th', 'nucl-ex'), so these two entries may
# never match a 'subject' value -- confirm against the data.
new_categories = [
    {'Subject Code': 'cond-mat', 'Category': 'Condensed Matter', 'Description': ''},  # spelling fixed (was 'Condenced')
    {'Subject Code': 'astro-ph', 'Category': 'Astrophysics', 'Description': ''},
    {'Subject Code': 'eess', 'Category': 'Electrical Engineering and Systems Science', 'Description': ''},
    {'Subject Code': 'math', 'Category': 'Mathematics', 'Description': ''},
    {'Subject Code': 'q-bio', 'Category': 'Quantitative Biology', 'Description': ''},
    {'Subject Code': 'q-fin', 'Category': 'Quantitative Finance', 'Description': ''},
    {'Subject Code': 'stat', 'Category': 'Statistics', 'Description': ''},
    {'Subject Code': 'econ', 'Category': 'Economics', 'Description': ''},
    {'Subject Code': 'hep', 'Category': 'High Energy Physics', 'Description': ''},
    {'Subject Code': 'nucl', 'Category': 'Nuclear Experiment', 'Description': ''},
    {'Subject Code': 'physics', 'Category': 'Physics', 'Description': ''},
    {'Subject Code': 'nlin', 'Category': 'Nonlinear Sciences', 'Description': ''},
    {'Subject Code': 'cs', 'Category': 'Computer Science', 'Description': ''},
]
# Create a DataFrame for the manual categories
new_categories_df = pd.DataFrame(new_categories)

# Append the manual categories after the scraped ones
category_df = pd.concat([category_df, new_categories_df], ignore_index=True)

# If a Subject Code appears more than once, keep='last' makes the manual
# entry (appended last) win over the scraped one. Rows are then renumbered.
category_df = (
    category_df
    .drop_duplicates(subset=['Subject Code'], keep='last')
    .reset_index(drop=True)
)

# Lookup table: subject code -> category name
category_mapping = dict(
    zip(category_df['Subject Code'], category_df['Category'])
)

# Label each paper by looking up its 'subject' in the table; subjects with
# no entry in the table end up as NaN in the new 'Category' column.
loaded_df['Category'] = loaded_df['subject'].map(category_mapping)

In [23]:
# Preview the first five rows, including the new 'Category' column
loaded_df.head(n=5)

Unnamed: 0,id,title,summary,published,authors,subject,pdf_link,pdf_text,Category
0,http://arxiv.org/abs/cond-mat/0102536v1,Impact of Electron-Electron Cusp on Configurat...,The effect of the electron-electron cusp on th...,2001-02-28T20:12:09Z,"['David Prendergast', 'M. Nolan', 'Claudia Fil...",cond-mat.str-el,http://arxiv.org/pdf/cond-mat/0102536v1,['arXiv:cond-mat/0102536v1 [cond-mat.str-el] ...,Strongly Correlated Electrons
1,http://arxiv.org/abs/cond-mat/0511278v1,A simple and efficient approach to the optimiz...,We present a simple and efficient method to op...,2005-11-11T09:48:02Z,"['Anthony Scemama', 'Claudia Filippi']",cond-mat.other,http://arxiv.org/pdf/cond-mat/0511278v1,['arXiv:cond-mat/0511278v1 [cond-mat.other] ...,Other Condensed Matter
2,http://arxiv.org/abs/cond-mat/0403509v1,Optimized Jastrow-Slater wave functions for gr...,A quantum Monte Carlo method is presented for ...,2004-03-19T13:51:29Z,"['Friedemann Schautz', 'Claudia Filippi']",cond-mat.other,http://arxiv.org/pdf/cond-mat/0403509v1,['arXiv:cond-mat/0403509v1 [cond-mat.other] ...,Other Condensed Matter
3,http://arxiv.org/abs/2002.03622v1,Excited-state calculations with quantum Monte ...,Quantum Monte Carlo methods are first-principl...,2020-02-10T09:57:17Z,"['Jonas Feldt', 'Claudia Filippi']",physics.chem-ph,http://arxiv.org/pdf/2002.03622v1,['Excited-state calculations with quantum\nMon...,Chemical Physics
4,http://arxiv.org/abs/cond-mat/9808114v1,Quantum Monte Carlo calculation of Compton pro...,Recent high resolution Compton scattering expe...,1998-08-11T19:06:45Z,"['Claudia Filippi', 'David M. Ceperley']",cond-mat,http://arxiv.org/pdf/cond-mat/9808114v1,['arXiv:cond-mat/9808114v1 11 Aug 1998Submitt...,Condenced Matter
