In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os
import PyPDF2
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [16]:
# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    pdf_file_obj = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
    text = ""
    for page_num in range(len(pdf_reader.pages)):
        page_obj = pdf_reader.pages[page_num]
        text += page_obj.extract_text()
    pdf_file_obj.close()
    return text

In [4]:


def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters
    text = re.sub(r'[^a-z0-9 ]+', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove emails
    text = re.sub(r'\S*@\S*\s?', '', text)
    
    # Remove weblinks
    text = re.sub(r'http\S+', '', text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_text = [word for word in words if word not in stop_words]
    
    # Tokenize the text
    tokenized_text = ' '.join(filtered_text)
    
    return tokenized_text

In [6]:
pdf_dir = 'data'
pdf_files = [os.path.join(pdf_dir, f) for f in os.listdir(pdf_dir) if f.endswith('.pdf')]


In [22]:
# Extract and preprocess text from all PDFs
documents = []
for pdf_file in pdf_files:
    text = extract_text_from_pdf(pdf_file)
    documents.append(preprocess_text(text))

# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Convert the TF-IDF matrix to a DataFrame for easier inspection
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())


tfidf_df

Unnamed: 0,aadapter,ab,abbey,abby,abc,abcnetaunewshistoricaustralianorbitalrocketlaunchremotequeensland,abigail,abl,able,ableour,...,zhangluthey,zhaojie,zhixiang,zhu,zhuque,zhuyangzhuthey,zihan,zircon,zvezda,zvezdaservice
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018954,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018992,0.018992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.009238,0.009238,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00535,0.00535,0.00535,0.0107,0.0,0.0,0.00535,0.0107,0.0,0.0,...,0.00535,0.00535,0.00535,0.00535,0.048149,0.00535,0.00535,0.0,0.0107,0.00535


In [21]:
tfidf_df

Unnamed: 0,aadapter,ab,abbey,abby,abc,abcnetaunewshistoricaustralianorbitalrocketlaunchremotequeensland,abigail,abl,able,ableour,...,zhangluthey,zhaojie,zhixiang,zhu,zhuque,zhuyangzhuthey,zihan,zircon,zvezda,zvezdaservice
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018954,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018992,0.018992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.009238,0.009238,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00535,0.00535,0.00535,0.0107,0.0,0.0,0.00535,0.0107,0.0,0.0,...,0.00535,0.00535,0.00535,0.00535,0.048149,0.00535,0.00535,0.0,0.0107,0.00535


In [23]:
tfidf_df.to_csv("context.csv")