In [None]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
import re
import os
from pdfminer.high_level import extract_text
import time
from pdfminer.psparser import PSSyntaxError
import fitz

In [None]:
# Read the arXiv computer-science paper listing from the local CSV export
cs_papers = pd.read_csv(filepath_or_buffer="data/cs_papers_api.csv")

In [None]:
# arXiv computer-science category codes paired with their full titles.
# The pairs keep their original order, so the resulting dict has the same
# insertion order as before.
category_mapping = dict([
    ('cs.CV', 'Computer Vision and Pattern Recognition'),
    ('cs.NE', 'Neural and Evolutionary Computing'),
    ('cs.MA', 'Multiagent Systems'),
    ('cs.RO', 'Robotics'),
    ('cs.CL', 'Computation and Language'),
    ('cs.LG', 'Machine Learning'),
    ('cs.AI', 'Artificial Intelligence'),
    ('cs.CR', 'Cryptography and Security'),
    ('cs.HC', 'Human-Computer Interaction'),
    ('cs.IR', 'Information Retrieval'),
    ('cs.GT', 'Computer Science and Game Theory'),
    ('cs.SE', 'Software Engineering'),
    ('cs.CY', 'Computers and Society'),
    ('cs.ET', 'Emerging Technologies'),
    ('cs.NI', 'Networking and Internet Architecture'),
    ('cs.MM', 'Multimedia'),
    ('cs.SI', 'Social and Information Networks'),
    ('cs.CC', 'Computational Complexity'),
    ('cs.DB', 'Databases'),
    ('cs.IT', 'Information Theory'),
    ('cs.PL', 'Programming Languages'),
    ('cs.DS', 'Data Structures and Algorithms'),
    ('cs.SD', 'Sound'),
    ('cs.LO', 'Logic in Computer Science'),
    ('cs.DL', 'Digital Libraries'),
    ('cs.DC', 'Distributed, Parallel, and Cluster Computing'),
    ('cs.OH', 'Other Computer Science'),
    ('cs.CE', 'Computational Engineering, Finance, and Science'),
    ('cs.AR', 'Hardware Architecture'),
    ('cs.FL', 'Formal Languages and Automata Theory'),
    ('cs.GR', 'Graphics'),
    ('cs.MS', 'Mathematical Software'),
    ('cs.CG', 'Computational Geometry'),
    ('cs.SC', 'Symbolic Computation'),
    ('cs.PF', 'Performance'),
    ('cs.OS', 'Operating Systems'),
    ('cs.DM', 'Discrete Mathematics'),
    ('cs.NA', 'Numerical Analysis'),
    ('cs.SY', 'Systems and Control'),
    ('cs.GL', 'General Literature'),
])

In [None]:
# Translate each paper's primary category code (e.g. 'cs.CV') into its full
# title via category_mapping and store it in a new column.
cs_papers['primary_category_full'] = cs_papers.loc[:, 'primary_category'].map(category_mapping)


In [None]:
# Display the frame to inspect it, including the primary_category_full column
cs_papers

Unnamed: 0,paper_id,title,abstract,year,primary_category,categories,primary_category_full
0,2301.02657v1,TarViS: A Unified Approach for Target-based Vi...,The general domain of video segmentation is cu...,2023-01-06 18:59:52+00:00,cs.CV,cs.CV cs.AI cs.LG,Computer Vision and Pattern Recognition
1,2301.02642v1,Triple-stream Deep Metric Learning of Great Ap...,We propose the first metric learning system fo...,2023-01-06 18:36:04+00:00,cs.CV,cs.CV cs.AI cs.LG,Computer Vision and Pattern Recognition
2,2301.02610v1,Feedback-Gated Rectified Linear Units,Feedback connections play a prominent role in ...,2023-01-06 17:14:11+00:00,cs.NE,cs.NE cs.AI,Neural and Evolutionary Computing
3,2301.02593v1,Multi-Agent Reinforcement Learning for Fast-Ti...,To integrate high amounts of renewable energy ...,2023-01-06 16:41:51+00:00,cs.MA,cs.MA cs.AI cs.LG cs.SY,Multiagent Systems
4,2301.02561v1,Multi-Vehicle Trajectory Prediction at Interse...,Traditional approaches to prediction of future...,2023-01-06 15:13:23+00:00,cs.RO,cs.RO cs.AI,Robotics
...,...,...,...,...,...,...,...
200089,1610.07090v1,STEPS: Predicting place attributes via spatio-...,"In recent years, a vast amount of research has...",2016-10-22 19:41:44+00:00,cs.SI,cs.SI,Social and Information Networks
200090,1610.07772v1,Visual Themes and Sentiment on Social Networks...,Online Social Networks explode with activity w...,2016-10-25 07:56:43+00:00,cs.SI,cs.SI,Social and Information Networks
200091,1610.08098v2,The Effect of Pokémon Go on The Pulse of the C...,"Pok\'emon Go, a location-based game that uses ...",2017-09-18 16:50:12+00:00,cs.SI,cs.SI,Social and Information Networks
200092,1610.08686v1,Polarized User and Topic Tracking in Twitter,Digital traces of conversations in micro-blogg...,2016-10-27 10:03:31+00:00,cs.SI,cs.SI,Social and Information Networks


In [None]:
def fetch_metadata(paper_id, retries=5, delay=10):
    """Fetch the arXiv API Atom feed for a single paper.

    Args:
        paper_id: arXiv identifier placed in the ``id_list`` query parameter
            (e.g. ``"2301.02657v1"``).
        retries: Maximum number of request attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The response body as text on HTTP 200. ``None`` when the server
        answers with a status other than 200/500, or when every attempt
        failed (connection error, timeout, or HTTP 500).

    NOTE(review): logged runs of this notebook show HTTP 400 for old-style
    identifiers such as ``0702141v1``; these presumably need their archive
    prefix (e.g. ``cs/0702141v1``) -- confirm against the arXiv API docs.
    """
    base_url = f"http://export.arxiv.org/api/query?id_list={paper_id}"
    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(base_url, timeout=20)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        else:
            if response.status_code == 200:
                return response.text
            if response.status_code != 500:
                # Any other status is treated as permanent: do not retry.
                print(f"Failed to fetch metadata for paper_id {paper_id}, Status code: {response.status_code}")
                return None
            suffix = "" if is_last_attempt else ", retrying..."
            print(f"HTTP 500 error for paper_id {paper_id}{suffix}")

        # Only wait when another attempt will actually follow.
        if not is_last_attempt:
            time.sleep(delay)

    # Reached for exhausted network errors AND exhausted HTTP 500 responses, so
    # the caller always gets an explicit None together with a log line.
    print(f"Failed to fetch metadata for paper_id {paper_id} after {retries} attempts.")
    return None

In [None]:
def parse_pdf_url(response_text):
    """Extract the PDF link from an arXiv API Atom response.

    Args:
        response_text: XML text of the Atom feed.

    Returns:
        The ``href`` of the first ``<link title="pdf">`` element found inside
        an ``<entry>``, or ``None`` when the feed has no such link or the text
        is not well-formed XML.
    """
    atom_ns = '{http://www.w3.org/2005/Atom}'
    try:
        root = ET.fromstring(response_text)
    except ET.ParseError as e:
        # A truncated or non-XML body must not abort a long download run.
        print(f"Could not parse metadata response: {e}")
        return None

    for entry in root.findall(atom_ns + 'entry'):
        for link in entry.findall(atom_ns + 'link'):
            if link.attrib.get('title') == 'pdf':
                href = link.attrib.get('href')
                if href:
                    return href
    return None

In [None]:
def clean_text(text):
    """Strip bracketed asides, markup and URLs from text and normalise spacing.

    Removes, in order: ``[...]`` spans, ``(...)`` spans, ``<...>`` spans and
    ``http(s)://`` URLs. Whitespace is collapsed *after* the removals so that a
    deleted span does not leave a double space behind.

    Args:
        text: The string to clean. Non-string values (e.g. ``None`` or a NaN
            read from a CSV) are treated as empty text.

    Returns:
        The cleaned string with single spaces and no leading/trailing space;
        ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'\[[^]]*\]', '', text)  # Remove text in square brackets
    text = re.sub(r'\([^)]*\)', '', text)  # Remove text in parentheses
    text = re.sub(r'<[^>]*>', '', text)  # Remove HTML tags
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace with a single space
    return text.strip()

In [None]:
def remove_abstract_from_full_text(full_text, abstract):
    """Remove the abstract from a paper's (already cleaned) full text.

    Two strategies are tried:

    1. Exact removal: the abstract is cleaned with ``clean_text`` and every
       case-insensitive occurrence of it is deleted from ``full_text``.
    2. Heuristic fallback (when nothing was removed): cut from the first
       standalone word "abstract" up to the first section marker found after
       it. Markers are tried in priority order, not by position in the text;
       if none is found, everything after "abstract" is dropped.

    Args:
        full_text: Cleaned full text of the paper.
        abstract: The paper's abstract. Non-string or blank values skip the
            exact-removal step instead of raising.

    Returns:
        ``full_text`` without the abstract, or ``full_text`` unchanged when
        neither strategy finds anything to remove.
    """
    # An empty pattern would match everywhere and remove nothing, and a
    # non-string abstract cannot be cleaned, so go straight to the fallback.
    if isinstance(abstract, str) and abstract.strip():
        clean_abstract = clean_text(abstract)
        if clean_abstract:
            full_text_cleaned = re.sub(re.escape(clean_abstract), '', full_text, flags=re.IGNORECASE)
            if full_text_cleaned != full_text:
                return full_text_cleaned

    abstract_start = re.search(r'\babstract\b', full_text, re.IGNORECASE)
    if not abstract_start:
        return full_text

    # Priority-ordered section markers. "introduction" also covers numbered
    # headings such as "1 Introduction". "keywords" has no leading space: with
    # the surrounding \b a leading space could never match after punctuation
    # (e.g. "... results. Keywords: ...").
    keywords = ("introduction", "background", "related work", "methodology", "methods",
                "results", "discussion", "keywords")
    remainder = full_text[abstract_start.end():]

    abstract_end = len(full_text)
    for keyword in keywords:
        match = re.search(r'\b' + re.escape(keyword) + r'\b', remainder, re.IGNORECASE)
        if match:
            abstract_end = abstract_start.end() + match.start()
            break

    return full_text[:abstract_start.start()] + full_text[abstract_end:]

In [None]:

def fetch_and_clean_full_text(paper_id, abstract, category_dir, retries=5, delay=10):
    """Download a paper's PDF, extract its text, clean it and save both files.

    The PDF is written to ``<category_dir>/pdfs/<paper_id>.pdf`` and the
    cleaned text (abstract removed) to ``<category_dir>/texts/<paper_id>.txt``.
    Both sub-directories must already exist.

    Args:
        paper_id: arXiv identifier of the paper.
        abstract: The paper's abstract, removed from the extracted text.
        category_dir: Directory holding the ``pdfs`` and ``texts`` folders.
        retries: Maximum number of attempts for the metadata and PDF requests.
        delay: Seconds to wait between two attempts.

    Returns:
        The cleaned full text, or ``None`` when the metadata or PDF could not
        be fetched, the PDF could not be parsed, or no text was extracted.
    """
    metadata = fetch_metadata(paper_id, retries, delay)
    if not metadata:
        return None

    pdf_url = parse_pdf_url(metadata)
    if not pdf_url:
        print(f"No PDF link found for paper_id {paper_id}")
        return None

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1
        try:
            response = requests.get(pdf_url, timeout=20)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} to fetch PDF failed: {e}")
        else:
            if response.status_code == 200:
                pdf_path = os.path.join(category_dir, f"pdfs/{paper_id}.pdf")
                with open(pdf_path, 'wb') as f:
                    f.write(response.content)

                try:
                    full_text = extract_text(pdf_path)
                except PSSyntaxError as e:
                    print(f"PSSyntaxError for paper_id {paper_id}: {e}")
                    return None

                full_text_cleaned = remove_abstract_from_full_text(clean_text(full_text), abstract)
                if not full_text_cleaned:
                    # Downloading the same PDF again cannot change the result,
                    # so stop here instead of looping through the remaining
                    # attempts.
                    print(f"No text extracted for paper_id {paper_id}")
                    return None

                txt_path = os.path.join(category_dir, f"texts/{paper_id}.txt")
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(full_text_cleaned)
                return full_text_cleaned

            if response.status_code != 500:
                # Any status other than 200/500 is treated as permanent.
                print(f"Failed to fetch PDF for paper_id {paper_id}, Status code: {response.status_code}")
                return None

            suffix = "" if is_last_attempt else ", retrying..."
            print(f"HTTP 500 error for paper_id {paper_id}{suffix}")

        # Only wait when another attempt will actually follow.
        if not is_last_attempt:
            time.sleep(delay)

    # Reached for exhausted network errors and exhausted HTTP 500 responses.
    print(f"Failed to fetch PDF for paper_id {paper_id} after {retries} attempts.")
    return None


In [None]:
def fetch_full_texts_by_category(df, category, chunk_size=50, start_chunk=1):
    """Download and clean the full texts of one category, saving them in chunks.

    Rows of ``df`` whose ``primary_category_full`` equals ``category`` are
    processed ``chunk_size`` at a time. Each chunk is written to
    ``full/<category>/chunks/cleaned_texts_chunk_<n>.csv`` with two extra
    columns, ``cleaned_full_text`` and ``cleaned_abstract``. PDFs and text
    files go to the sibling ``pdfs`` and ``texts`` folders.

    Args:
        df: DataFrame with at least the columns ``primary_category_full``,
            ``paper_id`` and ``abstract``.
        category: Full category title to process.
        chunk_size: Number of papers per chunk file.
        start_chunk: 1-based number of the first chunk to process. Earlier
            chunks are skipped, which allows resuming an interrupted run
            (e.g. ``start_chunk=17`` after chunk 16 was saved). File names
            always carry the chunk's true number, so a resumed run neither
            mislabels nor re-downloads earlier rows.

    Raises:
        ValueError: If ``chunk_size`` or ``start_chunk`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if start_chunk < 1:
        raise ValueError("start_chunk must be at least 1")

    filtered_df = df[df['primary_category_full'] == category]
    num_chunks = (len(filtered_df) + chunk_size - 1) // chunk_size

    category_dir = os.path.join("full", category)
    for subdir in ("pdfs", "texts", "chunks"):
        os.makedirs(os.path.join(category_dir, subdir), exist_ok=True)

    for chunk_number in range(start_chunk, num_chunks + 1):
        first_row = (chunk_number - 1) * chunk_size
        # Explicit copy so the new columns are added to an independent frame.
        chunk_df = filtered_df.iloc[first_row:first_row + chunk_size].copy()

        full_texts = [
            fetch_and_clean_full_text(paper_id, abstract, category_dir)
            for paper_id, abstract in zip(chunk_df['paper_id'], chunk_df['abstract'])
        ]

        chunk_df['cleaned_full_text'] = full_texts
        chunk_df['cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)

        output_csv = os.path.join(category_dir, f"chunks/cleaned_texts_chunk_{chunk_number}.csv")
        chunk_df.to_csv(output_csv, index=False, escapechar='\\')

        print(f"Saved chunk {chunk_number} to {output_csv}")

In [None]:
# Category titles handled by the first download run
categories = [
    'Artificial Intelligence',
    'Computer Vision and Pattern Recognition',
]

In [None]:
# Download, clean and save the full texts of every category in `categories`,
# one category at a time, logging when each one starts and finishes.
for category in categories:
    print(f"Processing category: {category}")
    fetch_full_texts_by_category(
        df=cs_papers,
        category=category,
        chunk_size=50,
    )
    print(f"Finished processing category: {category}")

Processing category: Artificial Intelligence
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Failed to fetch PDF for paper_id 2212.13631v4, Status code: 500
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 1 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_1.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 2 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_2.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2212.08966v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282EA22B190>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 3 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_3.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2212.02064v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282EB1F27A0>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Failed to fetch PDF for paper_id 2211.16242v2, Status code: 404
Saved chunk 4 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set 

Saved chunk 5 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_5.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 6 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_6.csv


Ignoring (part of) ToUnicode map because the PDF data does not conform to the format. This could result in (cid) values in the output. The start and end byte have different lengths.
Ignoring (part of) ToUnicode map because the PDF data does not conform to the format. This could result in (cid) values in the output. The start and end byte have different lengths.
Ignoring (part of) ToUnicode map because the PDF data does not conform to the format. This could result in (cid) values in the output. The start and end byte have different lengths.


Failed to fetch PDF for paper_id 2211.03888v2, Status code: 500


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 7 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_7.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2211.01496v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282814A1120>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 8 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_8.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 9 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_9.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 10 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_10.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 11 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_11.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 2 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 3 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 4 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 5 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Failed to fetch PDF for paper_id 2209.14292v3 after 5 attempts.
Failed to fetch PDF for paper_id 2209.13002v2, Status code: 404


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 12 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_12.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 13 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_13.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 14 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_14.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 15 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_15.csv
Failed to fetch PDF for paper_id 2208.14037v3, Status code: 404
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 16 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_16.csv


PSSyntaxError: Invalid dictionary construct: [/'Producer', b'pdfTeX-1.40.21', /'Title', b'KRACL: Contrastive Learning with Graph Context Modeling for Sparse Knowledge Graph Completion', /'Author', /b'Zhaouxan', /b'Tan,', /b'Zilong', /b'Chen,', /b'Shangbin', /b'Feng,', /b'Qingyue', /b'Zhang,', /b'Qinghua', /b'Zheng,', /b'Jundong', /b'Li,', /b'Minnan', /b'Luo', /'TemplateVersion', b'2023.1', /'Author', b'', /'Title', b'', /'Subject', b'', /'Creator', b'LaTeX with hyperref', /'Keywords', b'', /'CreationDate', b'D:20220817003034Z', /'ModDate', b'D:20220817003034Z', /'Trapped', /'False', /'PTEX.Fullbanner', b'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2']

In [None]:
# Category titles handled by the second download run
categories2 = [
    'General Literature',
    'Operating Systems',
    'Programming Languages',
]

In [None]:
# Download, clean and save the full texts of every category in `categories2`,
# one category at a time, logging when each one starts and finishes.
for category in categories2:
    print(f"Processing category: {category}")
    fetch_full_texts_by_category(
        df=cs_papers,
        category=category,
        chunk_size=50,
    )
    print(f"Finished processing category: {category}")

Processing category: General Literature


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 1 to full\General Literature\chunks/cleaned_texts_chunk_1.csv
Failed to fetch PDF for paper_id 1012.4170v2, Status code: 404


The PDF <_io.BufferedReader name='full\\General Literature\\pdfs/0808.3717v1.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Failed to fetch metadata for paper_id 0702141v1, Status code: 400
Failed to fetch metadata for paper_id 0610127v1, Status code: 400
Failed to fetch metadata for paper_id 0608062v2, Status code: 400
Failed to fetch metadata for paper_id 0607022v1, Status code: 400
Failed to fetch metadata for paper_id 0602070v1, Status code: 400
Failed to fetch metadata for paper_id 0412090v1, Status code: 400
Failed to fetch metadata for paper_id 0411009v2, Status code: 400
Failed to fetch metadata for paper_id 0410075v1, Status code: 400
Failed to fetch metadata for paper_id 0404033v1, Status code: 400
Failed to fetch metadata for paper_id 0404026v1, Status code: 400
Failed to fetch metadata for paper_id 0402037v2, Status code: 400
Failed to fetch metadata for paper_id 0306132v1, Status code: 400
Failed to fetch metadata for paper_id 0210001v1, Status code: 400
Failed to fetch metadata for paper_id 0110018v2, Status code: 400
Failed to fetch metadata for paper_id 0106022v1, Status code: 400
Failed to 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Failed to fetch metadata for paper_id 9811005v1, Status code: 400
Failed to fetch metadata for paper_id 9809010v1, Status code: 400
Failed to fetch metadata for paper_id 9301114v1, Status code: 400
Saved chunk 3 to full\General Literature\chunks/cleaned_texts_chunk_3.csv
Finished processing category: General Literature
Processing category: Operating Systems


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Failed to fetch metadata for paper_id 0701021v2, Status code: 400
Failed to fetch metadata for paper_id 9903014v1, Status code: 400


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 1 to full\Operating Systems\chunks/cleaned_texts_chunk_1.csv
Failed to fetch PDF for paper_id 2104.05306v3, Status code: 404


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 2 to full\Operating Systems\chunks/cleaned_texts_chunk_2.csv
Failed to fetch PDF for paper_id 1909.11644v2, Status code: 404


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 3 to full\Operating Systems\chunks/cleaned_texts_chunk_3.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 4 to full\Operating Systems\chunks/cleaned_texts_chunk_4.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=1705.06932v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282816BECE0>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_full_text'] = full_texts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk_df.loc[:, 'cleaned_abstract'] = chunk_df['abstract'].apply(clean_text)


Saved chunk 5 to full\Operating Systems\chunks/cleaned_texts_chunk_5.csv


AssertionError: ('Unhandled', 14)

In [None]:
# Full category titles to be processed by the loop in the next cell.
categories3 = [
    'Computer Vision and Pattern Recognition',
    'Data Structures and Algorithms',
]

In [None]:
# Run the full-text fetch once per entry of `categories3`, always reading from
# the complete `cs_papers` frame. The two prints bracket each call so the
# start and end of every category are visible in the cell output.
# NOTE(review): chunk_size=50 is presumably the number of papers per saved
# chunk file — confirm against fetch_full_texts_by_category.
for category in categories3 :
    print(f"Processing category: {category}")
    fetch_full_texts_by_category(cs_papers, category, chunk_size=50)
    print(f"Finished processing category: {category}")

In [None]:
# Fetch full texts for the 'Machine Learning' category from `cs_papers`.
# NOTE(review): the recorded output of this cell ends in a pdfminer
# PSSyntaxError, so the run appears to have stopped partway through —
# confirm which chunks were actually written before relying on them.
fetch_full_texts_by_category(cs_papers, 'Machine Learning', chunk_size=50)

Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 1 to data2\Machine Learning\chunks/cleaned_texts_chunk_1.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2301.00717v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000028282540B20>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2212.12015v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282827ECA90>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Saved chunk 2 to data2\Machine Learning\chunks/cleaned_texts_chunk_2.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.
Attempt 1 to fetch PDF failed: HTTPConnectionPool(

PSSyntaxError: Invalid dictionary construct: [/'Producer', b'pdfTeX-1.40.21', /'Title', /b'Scaling', /b'Marginalized', /b'Importance', /b'Sampling', /b'to', /b'High-Dimensional', /b'State-Spaces', /b'via', /b'State', /b'Abstraction', /'TemplateVersion', b'2023.1', /'Creator', b'TeX', /'CreationDate', b'D:20221216012230Z', /'ModDate', b'D:20221216012230Z', /'Trapped', /'False', /'PTEX.Fullbanner', b'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2']

In [None]:
# Full category titles iterated by the processing loop further down.
categories = [
    'Artificial Intelligence',
    'Computer Vision and Pattern Recognition',
]

In [None]:
# Rows of `cs_papers` whose mapped primary category is Artificial Intelligence.
cs_ai = cs_papers.loc[cs_papers['primary_category_full'] == 'Artificial Intelligence']

In [None]:
# Keep every row of `cs_ai` after the first AI_ROWS_TO_SKIP rows.
#
# `iloc[AI_ROWS_TO_SKIP:]` replaces `tail(len - 800)`: a negative count passed
# to `DataFrame.tail` means "all rows except the first |n|", so the old form
# would silently return the wrong rows if `cs_ai` ever held fewer than 800
# rows. The slice yields an empty frame in that case instead.
AI_ROWS_TO_SKIP = 800

cs_ai_shape = cs_ai.shape
# Number of rows kept; clamped at zero so it never goes negative.
ai_index = max(cs_ai_shape[0] - AI_ROWS_TO_SKIP, 0)
cs_ai_tail = cs_ai.iloc[AI_ROWS_TO_SKIP:]
cs_ai_tail

Unnamed: 0,paper_id,title,abstract,year,primary_category,categories,primary_category_full
5541,2208.09568v1,Probabilities of Causation with Nonbinary Trea...,This paper deals with the problem of estimatin...,2022-08-19 23:54:47+00:00,cs.AI,cs.AI,Artificial Intelligence
5543,2208.09558v1,Personalized Decision Making -- A Conceptual I...,Personalized decision making targets the behav...,2022-08-19 22:21:29+00:00,cs.AI,cs.AI cs.LO,Artificial Intelligence
5544,2208.09554v1,Evaluating Diverse Knowledge Sources for Onlin...,Online autonomous agents are able to draw on a...,2022-08-19 21:53:15+00:00,cs.AI,cs.AI,Artificial Intelligence
5554,2208.09344v1,Positive dependence in qualitative probabilist...,Qualitative probabilistic networks (QPNs) comb...,2022-08-19 13:53:04+00:00,cs.AI,cs.AI,Artificial Intelligence
5560,2208.09292v3,UnCommonSense: Informative Negative Knowledge ...,Commonsense knowledge about everyday concepts ...,2022-09-05 07:02:33+00:00,cs.AI,cs.AI cs.IR,Artificial Intelligence
...,...,...,...,...,...,...,...
198423,1003.0659v2,Particle Filtering on the Audio Localization M...,We present a novel particle filtering algorith...,2010-03-02 21:40:35+00:00,cs.AI,cs.AI cs.SD,Artificial Intelligence
198438,1301.2306v1,A Mixed Graphical Model for Rhythmic Parsing,A method is presented for the rhythmic parsing...,2013-01-10 16:26:12+00:00,cs.AI,cs.AI cs.SD,Artificial Intelligence
198454,1404.2313v1,Outer-Product Hidden Markov Model and Polyphon...,We present a polyphonic MIDI score-following a...,2014-04-08 21:48:13+00:00,cs.AI,cs.AI cs.SD,Artificial Intelligence
198455,1404.2314v2,A Stochastic Temporal Model of Polyphonic MIDI...,We study indeterminacies in realization of orn...,2016-08-03 00:44:16+00:00,cs.AI,cs.AI cs.SD,Artificial Intelligence


In [None]:
# Run the full-text fetch once per entry of `categories`, reading from
# `cs_ai_tail` rather than the complete `cs_papers` frame.
# NOTE(review): `cs_ai_tail` is derived from `cs_ai`, which holds only
# 'Artificial Intelligence' rows, while `categories` also lists
# 'Computer Vision and Pattern Recognition'. If fetch_full_texts_by_category
# filters its input by category, the second iteration will find no rows —
# confirm whether that category should be read from `cs_papers` instead.
for category in categories:
    print(f"Processing category: {category}")
    fetch_full_texts_by_category(cs_ai_tail, category, chunk_size=50)
    print(f"Finished processing category: {category}")

Processing category: Artificial Intelligence
Saved chunk 1 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_17.csv
MuPDF error: syntax error: unknown keyword: 'pagesize'

MuPDF error: syntax error: unknown keyword: 'width'

MuPDF error: syntax error: unknown keyword: '614.295pt'

MuPDF error: syntax error: unknown keyword: 'height'

MuPDF error: syntax error: unknown keyword: '794.96999pt'

MuPDF error: syntax error: unknown keyword: 'pagesize'

MuPDF error: syntax error: unknown keyword: 'width'

MuPDF error: syntax error: unknown keyword: '614.295pt'

MuPDF error: syntax error: unknown keyword: 'height'

MuPDF error: syntax error: unknown keyword: '794.96999pt'

Saved chunk 2 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_18.csv
Saved chunk 3 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_19.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 4 to full\Artificial Intelligence\chunk

Failed to fetch metadata for paper_id 0207008v1, Status code: 400
Failed to fetch metadata for paper_id 0106004v1, Status code: 400
Failed to fetch metadata for paper_id 0003076v2, Status code: 400
Failed to fetch metadata for paper_id 0002016v3, Status code: 400
Failed to fetch metadata for paper_id 9909010v1, Status code: 400
Failed to fetch metadata for paper_id 9909009v1, Status code: 400
Failed to fetch metadata for paper_id 9810018v1, Status code: 400
Saved chunk 35 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_51.csv
Saved chunk 36 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_52.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=20)
Attempt 2 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out. (read timeout=20)
Saved chunk 37 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_53.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', por

Saved chunk 63 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_79.csv
MuPDF error: syntax error: could not parse color space (208 0 R)

Saved chunk 64 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_80.csv
Saved chunk 65 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_81.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.
Saved chunk 66 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_82.csv
Failed to fetch PDF for paper_id 2008.06464v2, Status code: 404
Failed to fetch PDF for paper_id 2003.07108v2, Status code: 404
Saved chunk 67 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_83.csv
Failed to fetch PDF for paper_id 2003.03917v3, Status code: 404
Saved chunk 68 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_84.csv
Saved chunk 69 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_85.csv
Saved chunk 70 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_86.csv


Failed to fetch metadata for paper_id 0607037v2, Status code: 400
Failed to fetch metadata for paper_id 0606070v1, Status code: 400
Failed to fetch metadata for paper_id 0509032v1, Status code: 400
Failed to fetch metadata for paper_id 0504101v2, Status code: 400
Failed to fetch metadata for paper_id 0503046v1, Status code: 400
Failed to fetch metadata for paper_id 0503043v3, Status code: 400
Failed to fetch metadata for paper_id 0502078v1, Status code: 400
Failed to fetch metadata for paper_id 0407034v1, Status code: 400
Failed to fetch metadata for paper_id 0307017v1, Status code: 400
Failed to fetch metadata for paper_id 0307013v1, Status code: 400
Failed to fetch metadata for paper_id 0306091v2, Status code: 400
Failed to fetch metadata for paper_id 0211031v1, Status code: 400
Failed to fetch metadata for paper_id 0210007v1, Status code: 400
Failed to fetch metadata for paper_id 0207097v2, Status code: 400
Failed to fetch metadata for paper_id 0207072v1, Status code: 400
Failed to 

Failed to fetch metadata for paper_id 9701102v1, Status code: 400
Failed to fetch metadata for paper_id 9703101v1, Status code: 400
Failed to fetch metadata for paper_id 9704101v1, Status code: 400
Failed to fetch metadata for paper_id 9705101v1, Status code: 400
Failed to fetch metadata for paper_id 9705102v1, Status code: 400
Failed to fetch metadata for paper_id 9706101v1, Status code: 400
Failed to fetch metadata for paper_id 9706102v1, Status code: 400
Failed to fetch metadata for paper_id 9707101v1, Status code: 400
Failed to fetch metadata for paper_id 9707102v1, Status code: 400
Failed to fetch metadata for paper_id 9707103v1, Status code: 400
Failed to fetch metadata for paper_id 9709101v1, Status code: 400
Failed to fetch metadata for paper_id 9709102v1, Status code: 400
Failed to fetch metadata for paper_id 9711102v1, Status code: 400
Failed to fetch metadata for paper_id 9711103v1, Status code: 400
Failed to fetch metadata for paper_id 9711104v1, Status code: 400
Failed to 

Failed to fetch metadata for paper_id 0212025v1, Status code: 400
Failed to fetch metadata for paper_id 0301006v1, Status code: 400
Failed to fetch metadata for paper_id 0301010v2, Status code: 400
Failed to fetch metadata for paper_id 0301023v1, Status code: 400
Failed to fetch metadata for paper_id 0302029v1, Status code: 400
Failed to fetch metadata for paper_id 0302036v2, Status code: 400
Failed to fetch metadata for paper_id 0302039v1, Status code: 400
Failed to fetch metadata for paper_id 0303006v1, Status code: 400
Failed to fetch metadata for paper_id 0303009v2, Status code: 400
Failed to fetch metadata for paper_id 0303018v1, Status code: 400
Failed to fetch metadata for paper_id 0305001v1, Status code: 400
Failed to fetch metadata for paper_id 0305019v1, Status code: 400
Failed to fetch metadata for paper_id 0305044v2, Status code: 400
Failed to fetch metadata for paper_id 0306124v1, Status code: 400
Failed to fetch metadata for paper_id 0306135v1, Status code: 400
Failed to 

Failed to fetch metadata for paper_id 0510062v1, Status code: 400
Failed to fetch metadata for paper_id 0510063v1, Status code: 400
Saved chunk 87 to full\Artificial Intelligence\chunks/cleaned_texts_chunk_103.csv
Failed to fetch metadata for paper_id 0510079v2, Status code: 400
Failed to fetch metadata for paper_id 0510083v1, Status code: 400
Failed to fetch metadata for paper_id 0510091v1, Status code: 400
Failed to fetch metadata for paper_id 0511004v1, Status code: 400
Failed to fetch metadata for paper_id 0511015v2, Status code: 400
Failed to fetch metadata for paper_id 0511091v1, Status code: 400
Failed to fetch metadata for paper_id 0512045v2, Status code: 400
Failed to fetch metadata for paper_id 0512047v2, Status code: 400
Failed to fetch metadata for paper_id 0512099v1, Status code: 400
Failed to fetch metadata for paper_id 0601001v2, Status code: 400
Failed to fetch metadata for paper_id 0601031v1, Status code: 400
Failed to fetch metadata for paper_id 0601052v1, Status code

In [None]:
def fetch_and_clean_full_text(paper_id, abstract, category_dir, retries=5, delay=10):
    """Download one paper's PDF, extract and clean its text, and save both to disk.

    The PDF URL is obtained from the paper's metadata (``fetch_metadata`` then
    ``parse_pdf_url``). The downloaded PDF is written to
    ``<category_dir>/pdfs/<paper_id>.pdf``; its text is extracted with PyMuPDF,
    passed through ``clean_text`` and ``remove_abstract_from_full_text``, and
    the result is written to ``<category_dir>/texts/<paper_id>.txt``.

    Args:
        paper_id: Identifier passed to ``fetch_metadata`` and used as the file stem.
        abstract: Abstract text passed to ``remove_abstract_from_full_text``.
        category_dir: Directory containing the ``pdfs`` and ``texts``
            sub-directories (created on demand if missing).
        retries: Maximum number of PDF download attempts; also forwarded to
            ``fetch_metadata``.
        delay: Seconds to sleep between download attempts; also forwarded to
            ``fetch_metadata``.

    Returns:
        The cleaned full text, or ``None`` when the metadata or PDF URL is
        unavailable, the download fails, text extraction raises, or nothing
        is left after cleaning.
    """
    metadata = fetch_metadata(paper_id, retries, delay)
    if not metadata:
        return None

    pdf_url = parse_pdf_url(metadata)
    if not pdf_url:
        return None

    pdf_path = os.path.join(category_dir, "pdfs", f"{paper_id}.pdf")
    txt_path = os.path.join(category_dir, "texts", f"{paper_id}.txt")

    for attempt in range(retries):
        is_last_attempt = attempt == retries - 1

        try:
            response = requests.get(pdf_url, timeout=20)
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} to fetch PDF failed: {e}")
            if is_last_attempt:
                print(f"Failed to fetch PDF for paper_id {paper_id} after {retries} attempts.")
                return None
            time.sleep(delay)
            continue

        if response.status_code == 500:
            # Server-side error: worth another attempt, but there is no point
            # sleeping when no attempts remain.
            if is_last_attempt:
                print(f"Failed to fetch PDF for paper_id {paper_id} after {retries} attempts.")
                return None
            print(f"HTTP 500 error for paper_id {paper_id}, retrying...")
            time.sleep(delay)
            continue

        if response.status_code != 200:
            # Any other status (e.g. 404) will not change on retry.
            print(f"Failed to fetch PDF for paper_id {paper_id}, Status code: {response.status_code}")
            return None

        os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
        with open(pdf_path, 'wb') as f:
            f.write(response.content)

        try:
            # Using PyMuPDF for text extraction; the context manager closes
            # the document so file handles do not accumulate over a long run.
            with fitz.open(pdf_path) as doc:
                full_text = "".join(page.get_text() for page in doc)

            full_text_cleaned = clean_text(full_text)
            full_text_cleaned = remove_abstract_from_full_text(full_text_cleaned, abstract)

            if full_text_cleaned:
                os.makedirs(os.path.dirname(txt_path), exist_ok=True)
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(full_text_cleaned)
                return full_text_cleaned
        except Exception as e:
            # Deliberate best-effort: a single unreadable PDF must not abort
            # the whole batch, so report it and move on.
            print(f"Error for paper_id {paper_id}: {e}")
            return None

        # The download succeeded but no text survived cleaning. Fetching the
        # same PDF again cannot change that, so stop instead of re-downloading.
        return None

    return None

In [None]:
# Fetch full texts for a single category. The title is bound once and reused
# so the two progress messages and the fetch call cannot drift apart; the
# printed text is unchanged.
category = 'Networking and Internet Architecture'
print(f"Processing category: {category}")
fetch_full_texts_by_category(cs_papers, category, chunk_size=50)
print(f"Finished processing category: {category}")

Processing category: Networking and Internet Architecture
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2212.03809v2 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282889B3670>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=2209.13532v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000002828F1696F0>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Saved chunk 1 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_17.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.
MuPDF error: syntax error: could not parse color sp

HTTP 500 error for paper_id 2109.11607v1, retrying...
HTTP 500 error for paper_id 2109.11607v1, retrying...
Saved chunk 35 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_51.csv
Saved chunk 36 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_52.csv
Saved chunk 37 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_53.csv
Saved chunk 38 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_54.csv
Failed to fetch PDF for paper_id 2108.13176v3, Status code: 404
Saved chunk 39 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_55.csv
MuPDF error: syntax error: cannot find ExtGState resource 'A1'

MuPDF error: syntax error: cannot find ExtGState resource 'A2'

MuPDF error: syntax error: cannot find ExtGState resource 'A2'

MuPDF error: syntax error: cannot find ExtGState resource 'A2'

MuPDF error: syntax error: cannot find ExtGState resource 'A1'

MuPDF error: syntax error: cannot find

MuPDF error: syntax error: unknown keyword: 'width'

MuPDF error: syntax error: unknown keyword: '614.295pt'

MuPDF error: syntax error: unknown keyword: 'height'

MuPDF error: syntax error: unknown keyword: '794.96999pt'

Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 48 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_64.csv
MuPDF error: syntax error: could not parse color space (212 0 R)

Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 49 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_65.csv
Saved chunk 50 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_66.csv
Attempt 1 to fetch PDF failed: HTTPConnectionPool(host='arxiv.org', port=80): Read timed out.
Failed to fetch PDF for paper_id 2102.01724v3, Status code: 404
Saved chunk 51 to full\Networking and Internet Architecture

Saved chunk 77 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_93.csv
Saved chunk 78 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_94.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 79 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_95.csv
Saved chunk 80 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_96.csv
Saved chunk 81 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_97.csv
Failed to fetch PDF for paper_id 1911.09034v3, Status code: 404
Saved chunk 82 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_98.csv
Saved chunk 83 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_99.csv
Saved chunk 84 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_100.csv
Failed to fetch PDF for paper_id 1910.06619v3, Status code: 404
MuPDF error: 

Failed to fetch PDF for paper_id 1811.05829v2, Status code: 404
Failed to fetch PDF for paper_id 1811.05386v2, Status code: 404
Saved chunk 109 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_125.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=1810.13164v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x00000282889B0250>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
MuPDF error: syntax error: could not parse color space (2234 0 R)

Saved chunk 110 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_126.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 111 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_127.csv
Saved chunk 112 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_128.csv
Attempt 1 to fet

Saved chunk 143 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_159.csv
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Read timed out. (read timeout=10)
Saved chunk 144 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_160.csv
Failed to fetch metadata for paper_id 0612108v1, Status code: 400
Failed to fetch metadata for paper_id 0608058v5, Status code: 400
Failed to fetch metadata for paper_id 0601101v1, Status code: 400
Failed to fetch metadata for paper_id 0506021v1, Status code: 400
Failed to fetch metadata for paper_id 0208023v1, Status code: 400
Failed to fetch metadata for paper_id 0106028v1, Status code: 400
Saved chunk 145 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_161.csv
HTTP 500 error for paper_id 1609.07629v1, retrying...
HTTP 500 error for paper_id 1609.07629v1, retrying...
HTTP 500 error for paper_id 1609.07629v1, retrying...
HTTP 500 error for paper_id 1609.07629v1, retryi

Failed to fetch metadata for paper_id 9809091v1, Status code: 400
Failed to fetch metadata for paper_id 9809092v1, Status code: 400
Failed to fetch metadata for paper_id 9809093v1, Status code: 400
Failed to fetch metadata for paper_id 9809094v1, Status code: 400
Failed to fetch metadata for paper_id 9809095v1, Status code: 400
Failed to fetch metadata for paper_id 9809096v1, Status code: 400
Failed to fetch metadata for paper_id 9809097v1, Status code: 400
Failed to fetch metadata for paper_id 9809098v1, Status code: 400
Failed to fetch metadata for paper_id 9809099v1, Status code: 400
Failed to fetch metadata for paper_id 9809100v1, Status code: 400
Failed to fetch metadata for paper_id 9809101v1, Status code: 400
Failed to fetch metadata for paper_id 9809102v1, Status code: 400
Failed to fetch metadata for paper_id 9810006v1, Status code: 400
Failed to fetch metadata for paper_id 9811027v1, Status code: 400
Failed to fetch metadata for paper_id 9811028v1, Status code: 400
Failed to 

Failed to fetch metadata for paper_id 0510052v2, Status code: 400
Failed to fetch metadata for paper_id 0510082v1, Status code: 400
Failed to fetch metadata for paper_id 0511031v2, Status code: 400
Failed to fetch metadata for paper_id 0511053v1, Status code: 400
Saved chunk 156 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_172.csv
Failed to fetch metadata for paper_id 0511059v2, Status code: 400
Failed to fetch metadata for paper_id 0511062v1, Status code: 400
Failed to fetch metadata for paper_id 0511080v1, Status code: 400
Failed to fetch metadata for paper_id 0511101v4, Status code: 400
Failed to fetch metadata for paper_id 0511102v1, Status code: 400
Failed to fetch metadata for paper_id 0512011v3, Status code: 400
Failed to fetch metadata for paper_id 0512092v1, Status code: 400
Failed to fetch metadata for paper_id 0512094v1, Status code: 400
Failed to fetch metadata for paper_id 0601015v1, Status code: 400
Failed to fetch metadata for paper_id 0601016v

Failed to fetch PDF for paper_id 1001.3483v2, Status code: 404
Failed to fetch PDF for paper_id 1002.1169v2, Status code: 404
Attempt 1 failed: HTTPConnectionPool(host='export.arxiv.org', port=80): Max retries exceeded with url: /api/query?id_list=1002.1186v1 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x000002828F169DB0>, 'Connection to export.arxiv.org timed out. (connect timeout=10)'))
Saved chunk 166 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_182.csv
Failed to fetch PDF for paper_id 1002.3989v2, Status code: 404
Failed to fetch PDF for paper_id 1002.4255v2, Status code: 404
Saved chunk 167 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_183.csv
Saved chunk 168 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_184.csv
Failed to fetch PDF for paper_id 1005.4018v2, Status code: 404
Saved chunk 169 to full\Networking and Internet Architecture\chunks/cleaned_texts_chunk_185.csv
Fai