In [1]:
import os
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from PyPDF2 import PdfReader
import pathway as pw

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import time
import requests
import google.generativeai as genai
from google.api_core import retry
import warnings
warnings.filterwarnings("ignore")
import logging
# Quiet down chatty third-party loggers (pathway, aiohttp, requests) so the
# notebook output stays readable.
for _noisy_logger in ('pathway_engine', 'aiohttp.access', 'requests'):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
# The root logger is obtained by calling getLogger() with no name. On Python
# versions before 3.9, getLogger('root') returns a separate logger that merely
# happens to be called "root", so the level set on it has no global effect.
logging.getLogger().setLevel(logging.WARNING)
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger('pathway_engine.connectors.monitoring')
logger.setLevel(logging.WARNING)

import json
# Settings (including the Gemini API key) live in config.json next to the
# notebook so that no credential is written into the notebook itself.
with open("config.json") as f:
    config = json.loads(f.read())

GEMINI_API_KEY = config["GEMINI_API_KEY"]

### Importing Required Libraries and Initializing NLP Tools

This step involves the initialization of several important libraries and tools that will be used for text preprocessing. Here's a breakdown of each part of the code:

1. **Downloading NLTK Stopwords**:
   - `nltk.download('stopwords')`: This command downloads a list of common stopwords (e.g., "the", "is", "in", etc.) that are typically removed during text preprocessing to improve the quality of the text analysis.

2. **Defining Excluded Punctuation**:
   - `exclude = string.punctuation`: This sets a variable `exclude` that contains all the punctuation marks. This will be useful when we need to remove punctuation from the text during preprocessing.

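3. **Initializing the Lemmatization Tool**:
   - `lemmatizer = WordNetLemmatizer()`: This initializes the **WordNet Lemmatizer** from NLTK, which will be used for lemmatization. Lemmatization reduces a word to its base or dictionary form (e.g., "papers" becomes "paper"). Note that `lemmatize` treats every word as a noun unless a part-of-speech tag is supplied, so adjective and verb forms (e.g., "better", "running") are left unchanged by the call used in this notebook.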

#### Purpose:
- **Text Preprocessing**: These tools are essential for text preprocessing tasks, such as removing stopwords, punctuation, and reducing words to their base forms (lemmatization).
- **Efficiency in NLP Tasks**: By removing unnecessary elements (like stopwords and punctuation), we ensure that the text is cleaner and more suitable for machine learning or further NLP tasks, such as text classification.


In [2]:
# Corpora required by the preprocessing helpers below: the stopword list for
# remove_stopwords and WordNet for WordNetLemmatizer. Without the WordNet
# download the first lemmatize() call fails on a machine that has never
# fetched it.
nltk.download('stopwords')
nltk.download('wordnet')
# Characters stripped from the text during preprocessing.
exclude = string.punctuation
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malyadippal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1. `text_extractor_from_pdf(pdf_path)`

This function extracts the text from all the pages of a given PDF file.

- **Input:** Path to a PDF file.
- **Process:** It uses `PdfReader` from the `PyPDF2` library to read the content of the PDF. Then, it iterates over each page and extracts the text.
- **Output:** A string containing the combined text from all pages of the PDF.

---

### 2. `lemmatize_words(text)`

This function lemmatizes the words in the given text.

- **Input:** A string of text.
- **Process:** It splits the text into individual words and applies the `lemmatizer` to reduce each word to its base form (lemma).
- **Output:** A string of lemmatized words joined together.

---

### 3. `remove_stopwords(text)`

This function removes common stopwords from the input text.

- **Input:** A string of text.
- **Process:** The function iterates over the words in the text and removes those present in the English stopwords list from the `stopwords` library.
- **Output:** A string where the stopwords are removed, leaving only meaningful words.

---

### 4. `preprocessed_text(text)`

This function preprocesses the input text by applying several text cleaning and normalization techniques.

- **Input:** A string of text.
- **Process:**
  1. Converts the text to lowercase.
  2. Removes any HTML tags using a regular expression.
  3. Removes URLs using a regular expression.
  4. Strips punctuations and symbols using `str.translate`.
  5. Removes stopwords by calling `remove_stopwords`.
  6. Lemmatizes the words by calling `lemmatize_words`.
- **Output:** A cleaned and preprocessed version of the input text, ready for further analysis.


In [3]:
#-----------------------------------------------------------------------------------------------------
def text_extractor_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path` as one string.

    Pages are joined with a newline so that the last word of one page is not
    fused with the first word of the next. A page for which the reader yields
    no text contributes an empty string instead of raising.
    """
    reader = PdfReader(pdf_path)
    return "\n".join((page.extract_text() or "") for page in reader.pages)
#-----------------------------------------------------------------------------------------------------
def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text`.

    Tokens are passed one by one to the module-level `lemmatizer` and the
    results are joined back together with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)
#-----------------------------------------------------------------------------------------------------
def remove_stopwords(text):
    """Return `text` with English stopwords removed.

    The text is split on whitespace; every token found in NLTK's English
    stopword list is dropped and the remaining tokens are joined with single
    spaces.
    """
    # Fetch the list once per call and keep it in a set: looking it up again
    # for every token re-reads the corpus and scans a list each time.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in english_stopwords)
#-----------------------------------------------------------------------------------------------------
def preprocessed_text(text):
    """Normalise raw document text for vectorisation.

    Steps, in order: lowercase, strip HTML tags, strip URLs, delete every
    character listed in `exclude`, remove stopwords, lemmatize.
    """
    lowered = text.lower()
    # HTML tags are unlikely in these PDFs but are removed defensively.
    without_html = re.sub(r'<.*?>', '', lowered)
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', without_html)
    # A deletion-only translation table: no replacements, just removals.
    punctuation_table = str.maketrans('', '', exclude)
    without_punctuation = without_urls.translate(punctuation_table)
    return lemmatize_words(remove_stopwords(without_punctuation))
#-----------------------------------------------------------------------------------------------------

---

###                                                    `Task 1`

### 1. `non_publishable_folder` and `non_publishable_texts`

This block processes PDFs in the "Reference/Non-Publishable" folder and extracts the preprocessed text from each file.

- **Process:**
  1. Defines the folder path (`non_publishable_folder`).
  2. Loops through each file in the folder and checks if the file has a `.pdf` extension.
  3. For each PDF file, it extracts the text using `text_extractor_from_pdf` and preprocesses the text using `preprocessed_text`.
  4. Appends the processed text to the `non_publishable_texts` list.
- **Output:** A list of preprocessed texts from the non-publishable PDF files.

---

### 2. `publishable_folder` and `publishable_texts`

This block processes PDFs in multiple folders related to publishable documents (CVPR, EMNLP, KDD, NeurIPS, TMLR) and extracts the preprocessed text from each file.

- **Process:**
  1. Defines a list of folder paths (`publishable_folder`), each representing a specific publication venue.
  2. Loops through each folder, checking for `.pdf` files.
  3. For each PDF file, it extracts and preprocesses the text as done in the previous block.
  4. Appends the processed text to the `publishable_texts` list.
- **Output:** A list of preprocessed texts from the publishable PDF files across all the specified folders.


In [4]:
#-----------------------------------------------------------------------------------------------------
def load_preprocessed_pdfs(folder):
    """Return the preprocessed text of every `.pdf` file directly inside `folder`.

    Files are visited in sorted name order because `os.listdir` makes no
    ordering guarantee; a stable order keeps the seeded shuffles and splits
    later in the notebook reproducible across machines.
    """
    return [
        preprocessed_text(text_extractor_from_pdf(os.path.join(folder, file)))
        for file in sorted(os.listdir(folder))
        if file.endswith('.pdf')
    ]
#-----------------------------------------------------------------------------------------------------
non_publishable_folder='Reference/Non-Publishable/'
non_publishable_texts=load_preprocessed_pdfs(non_publishable_folder)
#-----------------------------------------------------------------------------------------------------
# One sub-folder per publication venue; all of them are labelled publishable.
publishable_folder=['Reference/Publishable/CVPR/','Reference/Publishable/EMNLP/','Reference/Publishable/KDD/','Reference/Publishable/NeurIPS/','Reference/Publishable/TMLR/']
publishable_texts=[]
for folder in publishable_folder:
    publishable_texts.extend(load_preprocessed_pdfs(folder))
#-----------------------------------------------------------------------------------------------------

### 3. Creating DataFrame from Texts

In this block, a `pandas` DataFrame is created from the preprocessed texts (both publishable and non-publishable) and their corresponding labels.

- **Process:**
  1. A dictionary `data` is created, where:
     - The `"Text"` key contains a combined list of `publishable_texts` and `non_publishable_texts`.
     - The `"Publishable"` key contains labels (1 for publishable, 0 for non-publishable) with the same length as the respective text lists.
  2. A DataFrame `df` is created using the `data` dictionary.
  3. The DataFrame is shuffled using `.sample(frac=1)` and reset with a new index (`reset_index(drop=True)`).
- **Output:** A shuffled DataFrame (`df`) with columns `"Text"` and `"Publishable"`, where each text is labeled as either publishable (1) or non-publishable (0).


In [5]:
# Label 1 marks publishable papers, label 0 non-publishable ones.
publishable_labels = [1 for _ in publishable_texts]
non_publishable_labels = [0 for _ in non_publishable_texts]
data = {
    "Text": [*publishable_texts, *non_publishable_texts],
    "Publishable": publishable_labels + non_publishable_labels,
}
# Shuffle with a fixed seed so the two classes are interleaved reproducibly.
df = pd.DataFrame(data).sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,Text,Publishable
0,detailed action identification baseball game r...,1
1,advanced technique contextually interpreting n...,1
2,denoising diffusion probabilistic model jonath...,1
3,advancement 3d food modeling review metafood c...,1
4,pytorch imperative style highperformance deep ...,1


### 4. Data Balancing and Splitting into Features and Labels

In this block, the DataFrame is balanced by downsampling the publishable texts to match the number of non-publishable texts, and then it is split into features and labels.

- **Process:**
  1. **Splitting the DataFrame:** 
     - `df_publishable`: Contains only publishable texts (`Publishable == 1`).
     - `df_non_publishable`: Contains only non-publishable texts (`Publishable == 0`).
  2. **Downsampling:**
     - `df_publishable_downsampled`: The publishable texts are downsampled to match the number of non-publishable texts using `.sample()`.
  3. **Concatenating and Shuffling:**
     - `df_balanced`: The downsampled publishable texts and non-publishable texts are concatenated using `pd.concat()`. The result is shuffled using `.sample(frac=1)` and reset with a new index (`reset_index(drop=True)`).
  4. **Feature and Label Split:**
     - `X`: The feature matrix, containing only the `"Text"` column.
     - `y`: The label vector, which contains the `"Publishable"` column converted to a NumPy array of integers.

- **Output:** 
  - `df_balanced`: A balanced DataFrame of publishable and non-publishable texts.
  - `X`: The features (texts).
  - `y`: The corresponding labels (1 for publishable, 0 for non-publishable).


In [6]:
# Separate the two classes.
df_publishable = df.loc[df['Publishable'] == 1]
df_non_publishable = df.loc[df['Publishable'] == 0]
#------------------------------------------------------------------------------------
# Downsample the publishable class to the size of the non-publishable class.
n_non_publishable = df_non_publishable.shape[0]
df_publishable_downsampled = df_publishable.sample(n=n_non_publishable, random_state=42)
#------------------------------------------------------------------------------------
# Recombine and reshuffle so the classes are not grouped together.
df_balanced = (
    pd.concat([df_publishable_downsampled, df_non_publishable])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
#------------------------------------------------------------------------------------
# X stays a one-column DataFrame (the text); y is the label vector.
X = df_balanced[['Text']]
y = df_balanced['Publishable'].to_numpy(dtype=np.int32)

In [7]:
df_balanced

Unnamed: 0,Text,Publishable
0,aidriven personalization online education plat...,0
1,detecting medication usage parkinson’s disease...,1
2,synergistic convergence photosynthetic pathway...,0
3,importance written explanation aggregating cro...,1
4,analyzing realtime group coordination augmente...,0
5,detailed action identification baseball game r...,1
6,transdimensional property graphite relation ch...,0
7,proceeding 2023 conference empirical method na...,1
8,generalization relu network via restricted iso...,1
9,deciphering enigmatic property metal critical ...,0


In [8]:
X

Unnamed: 0,Text
0,aidriven personalization online education plat...
1,detecting medication usage parkinson’s disease...
2,synergistic convergence photosynthetic pathway...
3,importance written explanation aggregating cro...
4,analyzing realtime group coordination augmente...
5,detailed action identification baseball game r...
6,transdimensional property graphite relation ch...
7,proceeding 2023 conference empirical method na...
8,generalization relu network via restricted iso...
9,deciphering enigmatic property metal critical ...


In [9]:
y

array([0, 1, 0, 1, 0, 1, 0, 1, 1, 0], dtype=int32)

### 5. Splitting Texts for Model Accuracy Check

In this block, we extract a subset of the **publishable texts** that were **not** included in the downsampled data. This subset will later be used to check the accuracy of the model after training.

- **Process:**
  - `X_temp`: A DataFrame containing publishable texts that were **excluded** from the downsampling process. This is achieved by:
    1. Selecting rows where the `"Publishable"` column is `1` (i.e., publishable texts).
    2. Using the `~df['Text'].isin(df_publishable_downsampled['Text'])` condition to filter out texts that are already present in the downsampled publishable texts (`df_publishable_downsampled`).
  
- **Purpose:**
  - `X_temp` will be used to evaluate the **accuracy of the model** once it is trained, ensuring that the test data remains unseen during the training phase.

- **Output:**
  - `X_temp`: A DataFrame of the publishable texts that are not part of the training dataset, reserved for model evaluation.


In [10]:
# Publishable rows whose text was not drawn into the balanced sample; they are
# kept aside as an extra held-out check.
in_balanced_sample = df['Text'].isin(df_publishable_downsampled['Text'])
X_temp = df.loc[(df['Publishable'] == 1) & ~in_balanced_sample]
X_temp

Unnamed: 0,Text,Publishable
1,advanced technique contextually interpreting n...,1
2,denoising diffusion probabilistic model jonath...,1
3,advancement 3d food modeling review metafood c...,1
4,pytorch imperative style highperformance deep ...,1
7,examining convergence denoising diffusion prob...,1
10,proceeding 2023 conference empirical method na...,1
11,safe predictor inputoutput specification enfor...,1
14,addressing minmax challenge nonconvexnonconcav...,1
18,addressing popularity bias popularityconscious...,1


### 6. Splitting Data into Training and Testing Sets

In this step, the data is split into training and testing sets using the `train_test_split` function from scikit-learn.

- **Process:**
  - `X_train`, `X_test`: These variables represent the features (text data) for training and testing, respectively.
  - `y_train`, `y_test`: These variables represent the target labels (publishable or non-publishable) for training and testing.
  - The split is performed with a fixed `random_state=1` to ensure reproducibility.

- **Purpose:**
  - The data is randomly divided so that the model can be trained on one subset (`X_train`, `y_train`) and tested on an unseen subset (`X_test`, `y_test`).
  
- **Output:**
  - The dataset is split into two sets for training and evaluation, allowing the model to learn from the training set and be evaluated on the test set.


In [11]:
# Hold out part of the balanced data for testing; no test_size is passed, so
# scikit-learn's default split proportion applies. The seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

### 7. Text Feature Extraction: Bag of Words (BOW) and TF-IDF

In this step, two common text feature extraction techniques—**Bag of Words (BOW)** and **Term Frequency-Inverse Document Frequency (TF-IDF)**—are applied to the text data for training and testing.

#### 7.1 Bag of Words (BOW)
- **CountVectorizer** is used to convert the text data into a matrix of token counts.
  - `X_train_bow`: The training text data is transformed using `fit_transform` into an array of token counts (BOW representation).
  - `X_test_bow`: The test text data is transformed using the `transform` method (no fitting here, only transforming based on the training data).
  
  **BOW** treats the text as an unordered collection of words and represents it as a sparse matrix, where each word is represented by a count.

#### 7.2 TF-IDF (Term Frequency-Inverse Document Frequency)
- **TfidfVectorizer** is used to convert the text data into a matrix that reflects the importance of words.
  - `X_train_tfidf`: The training text data is transformed using `fit_transform` into an array where each word is weighted by its term frequency and inverse document frequency.
  - `X_test_tfidf`: The test text data is transformed using `transform` based on the training data's learned vocabulary and IDF values.
  
  **TF-IDF** assigns higher importance to words that appear frequently in a specific document but less often across all documents.

#### Purpose:
- These transformations convert raw text data into numerical format, making it suitable for machine learning algorithms.
- The BOW and TF-IDF models provide different perspectives on the text's content, with BOW focusing on word counts and TF-IDF adjusting for common terms across documents.

#### Output:
- `X_train_bow`, `X_test_bow`: Represent the training and testing data using BOW.
- `X_train_tfidf`, `X_test_tfidf`: Represent the training and testing data using TF-IDF.


In [12]:
#-----------------------------------------------------------------------------------------------------
# Bag of Words: vocabulary is learned from the training texts only, then
# reused unchanged for the test texts.
cv = CountVectorizer()
train_counts = cv.fit_transform(X_train['Text'])
test_counts = cv.transform(X_test['Text'])
X_train_bow = train_counts.toarray()
X_test_bow = test_counts.toarray()
#-----------------------------------------------------------------------------------------------------
# TF-IDF: same fit-on-train / transform-on-test pattern.
tfidf = TfidfVectorizer()
train_weights = tfidf.fit_transform(X_train['Text'])
test_weights = tfidf.transform(X_test['Text'])
X_train_tfidf = train_weights.toarray()
X_test_tfidf = test_weights.toarray()
#-----------------------------------------------------------------------------------------------------

### 8. Model Evaluation: Gaussian Naive Bayes (GaussianNB) with BOW and TF-IDF

In this section, we apply **Gaussian Naive Bayes (GaussianNB)** using both **Bag of Words (BOW)** and **TF-IDF** feature representations and evaluate the model performance.

#### 8.1 Gaussian Naive Bayes (BOW)
- **Model Training**: A **Gaussian Naive Bayes (GaussianNB)** model is created and trained using the BOW features (`X_train_bow`) and the corresponding target labels (`y_train`).
- **Prediction**: The model makes predictions on the test set (`X_test_bow`), and the accuracy score is calculated.
  - `accuracy_score`: Measures the percentage of correct predictions.
- **F1 Score**: The **F1 Score** is calculated using the `f1_score` function with weighted average, which considers both precision and recall.
  - `f1_score`: Provides a more balanced metric, especially useful for imbalanced datasets.
- **Cross-Validation**: **5-fold cross-validation** is applied to assess the model's performance across different splits of the training data.
  - `cross_val_score`: Provides the average accuracy score from the cross-validation process.

#### 8.2 Gaussian Naive Bayes (TF-IDF)
- Similar steps are repeated for the **TF-IDF** features (`X_train_tfidf`), with the same evaluation metrics applied:
  - **Accuracy**: Assesses the model's performance in terms of correct predictions.
  - **F1 Score**: Evaluates the harmonic mean of precision and recall.
  - **Cross-Validation Accuracy**: Measures how well the model generalizes by evaluating it on different training-validation splits.


#### Results:
- **Accuracy Score using BOW:** 1.0
- **F1 Score:** 1.0
- **Cross-Validation Accuracy using BOW:** 0.8

- **Accuracy Score using TF-IDF:** 1.0
- **F1 Score:** 1.0
- **Cross-Validation Accuracy using TF-IDF:** 0.6

#### Possible Reasons for the Results:
- **Accuracy Score and F1 Score of 1.0:** The perfect accuracy and F1 scores suggest that the model is performing exceptionally well on the test data. However, this might indicate **overfitting**, where the model has memorized the training data and performs well on data similar to it, but might not generalize well on unseen data.

- **Cross-Validation Accuracy of 0.6–0.8:** The gap between the perfect test scores and the lower cross-validation scores (0.8 with BOW, 0.6 with TF-IDF) suggests that while the model performs well on certain data splits, it struggles on others. This indicates that the model is likely **overfitting** to the training data, and its performance is less consistent across different subsets of the data.

In summary, the perfect scores on the test data (accuracy and F1 score) do not by themselves show that the model generalizes well, and the cross-validation scores of 0.6–0.8 confirm that its performance is not as reliable across all data splits.



In [13]:
#-----------------------------------------------------------------------------------------------------
# GaussianNB using BOW
gnb=GaussianNB()
gnb.fit(X_train_bow,y_train)
y_pred=gnb.predict(X_test_bow)
print("Accuracy Score using bow:",accuracy_score(y_test,y_pred))
f1=f1_score(y_test, y_pred, average='weighted')
print("F1 Score:",f1)
# 5-fold cross-validation on the training portion only.
scores = cross_val_score(gnb, X_train_bow, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy using bow:", scores.mean())
#-----------------------------------------------------------------------------------------------------
# GaussianNB using TF-IDF
gnbtf=GaussianNB()
gnbtf.fit(X_train_tfidf,y_train)
y_pred_tf=gnbtf.predict(X_test_tfidf)
print("\nAccuracy Score using tfidf:",accuracy_score(y_test,y_pred_tf))
# Score the TF-IDF predictions here, not the BOW predictions from above.
f1=f1_score(y_test, y_pred_tf, average='weighted')
print("F1 Score:",f1)
# Cross-validate the TF-IDF estimator itself so this section is self-consistent.
scores = cross_val_score(gnbtf, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy using tfidf:", scores.mean())
#-----------------------------------------------------------------------------------------------------

Accuracy Score using bow: 1.0
F1 Score: 1.0
Cross-Validation Accuracy using bow: 0.8

Accuracy Score using tfidf: 1.0
F1 Score: 1.0
Cross-Validation Accuracy using tfidf: 0.6


### Analyzing Top Features Influencing Predictions

In this section, we analyze the **top features** that significantly influence the model's predictions. This step helps us understand the model's decision-making process.

#### Purpose of Viewing Top Features:
- **Interpretability**: By identifying the top features, we can interpret which words or terms the **Gaussian Naive Bayes (GNB)** model relies on to classify text as **publishable** or **non-publishable**.
- **Feature Importance**: These features are the most important ones in distinguishing between the classes, and understanding them helps us grasp which factors contribute to the model's predictions.

This step is important for **model interpretability**, allowing us to see which words are most indicative of whether a document is publishable or not. It provides valuable insights into the model's behavior and the underlying patterns it is using to make predictions.


In [14]:
print("Top features influencing predictions:")
feature_names = np.array(cv.get_feature_names_out())
# NOTE: despite the variable name, GaussianNB.theta_ holds the per-class mean
# of each feature, not log-probabilities.
log_probs = gnb.theta_
# Rank features by how much larger their class-1 mean is than their class-0
# mean and keep the ten largest gaps.
class_mean_gap = log_probs[1] - log_probs[0]
top_features = np.argsort(class_mean_gap)[-10:]
print(feature_names[top_features])

Top features influencing predictions:
['question' 'feature' 'parameterefficient' 'network' 'parameter' 'large'
 'tuning' 'method' 'language' 'model']


### 5th Step Continuation: Model Validation on Remaining Publishable Texts

In this step, we evaluate the model's performance on the texts that were originally classified as "Publishable" but were not included in the downsampled dataset used for training. This is done to check how well the model generalizes to unseen data.

The code transforms the text in `X_temp` into a bag-of-words representation (`X_temp_bow`), applies the trained Gaussian Naive Bayes (GNB) model to predict the labels, and then compares the predictions (`y_temp`) with the actual labels in `X_temp['Publishable']` using accuracy as the metric.

This step ensures that the model's performance is not biased by the training data and provides an estimate of how well it can predict on new, unseen "Publishable" texts.

#### Possible Reason for High Accuracy:
If the accuracy score comes out as 1, it might indicate that the model is overfitting to the training data, or that the validation set (`X_temp`) contains texts very similar to the ones in the training set. This can lead to the model performing exceptionally well on the validation data, which may not reflect its true performance on entirely unseen data. Additionally, if the "Publishable" texts are very distinct or the dataset is small, the model might be able to easily classify the texts with high accuracy. Note also that `X_temp` contains only "Publishable" texts, so this score measures how often publishable papers are recognised (recall for that class); it says nothing about how non-publishable papers are classified.







In [15]:
# Vectorise the held-out publishable texts with the already-fitted BOW
# vocabulary and score the BOW GaussianNB model on them.
X_temp_bow = cv.transform(X_temp['Text']).toarray()
y_temp = gnb.predict(X_temp_bow)
held_out_labels = X_temp['Publishable'].to_numpy(dtype=np.int32)
accuracy_score(held_out_labels, y_temp)

1.0

### 9. Model Evaluation Using Logistic Regression (BOW and TF-IDF)

In this step, **Logistic Regression (LR)** is applied to the dataset using both **Bag of Words (BOW)** and **TF-IDF** vectorization techniques. Similar to the previous step, the model is evaluated on the test data (`X_test_bow` and `X_test_tfidf`) using accuracy, F1 score, and cross-validation.

The code trains the **Logistic Regression** model on the BOW and TF-IDF representations of the training data and then evaluates its performance.

#### Results:
- **Accuracy Score using BOW:** 1.0
- **F1 Score:** 1.0
- **Cross-Validation Accuracy using BOW:** 0.9

- **Accuracy Score using TF-IDF:** 1.0
- **F1 Score:** 1.0
- **Cross-Validation Accuracy using TF-IDF:** 0.8

#### Possible Reasons for the Results:
- **Accuracy Score and F1 Score of 1.0:** Similar to the previous models, the perfect accuracy and F1 scores indicate that the **Logistic Regression** model performs exceptionally well on the test data. This could again point to **overfitting**, where the model fits the training data too well and fails to generalize effectively to new, unseen data.

- **Cross-Validation Accuracy:** While the accuracy on the test data is perfect, the cross-validation scores are lower, with BOW scoring 0.9 and TF-IDF scoring 0.8. These lower scores suggest that while the model performs well in specific splits of the data, it might not generalize as reliably across all data subsets, again hinting at potential **overfitting**.

Overall, the results highlight that while the **Logistic Regression** model might appear to perform well on the test set, there are signs of overfitting due to the discrepancy between the high test accuracy and the lower cross-validation accuracy.


In [16]:
#-----------------------------------------------------------------------------------------------------
# Logistic Regression using bow
clf = LogisticRegression(random_state=42)
clf.fit(X_train_bow, y_train)
y_pred_lg=clf.predict(X_test_bow)
print("Accuracy Score using bow:",accuracy_score(y_test,y_pred_lg))
f1=f1_score(y_test, y_pred_lg, average='weighted')
print("F1 Score:",f1)
cv_scores = cross_val_score(clf, X_train_bow, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy using bow:", cv_scores.mean())
#-----------------------------------------------------------------------------------------------------
# Logistic Regression using tfidf
clftf = LogisticRegression(random_state=42)
clftf.fit(X_train_tfidf, y_train)
# Predict with the TF-IDF model: `clf` was fitted on raw counts, so feeding it
# TF-IDF weights would evaluate the wrong model on mismatched features.
y_pred_lgtf=clftf.predict(X_test_tfidf)
print("\nAccuracy Score using tfidf:",accuracy_score(y_test,y_pred_lgtf))
f1=f1_score(y_test, y_pred_lgtf, average='weighted')
print("F1 Score:",f1)
cv_scores = cross_val_score(clftf, X_train_tfidf, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy using tfidf:", cv_scores.mean())
#-----------------------------------------------------------------------------------------------------

Accuracy Score using bow: 1.0
F1 Score: 1.0
Cross-Validation Accuracy using bow: 0.9

Accuracy Score using tfidf: 1.0
F1 Score: 1.0
Cross-Validation Accuracy using tfidf: 0.8


### 10. Model Evaluation Using Support Vector Machine (BOW)

In this step, the **Support Vector Machine (SVM)** with a **linear kernel** is applied to the dataset using the **Bag of Words (BOW)** vectorization technique. The model is trained and evaluated on the test data (`X_test_bow`) using accuracy, F1 score, and cross-validation.

The code trains the **SVM** model on the BOW representation of the training data and evaluates its performance.

#### Results:
- **Accuracy Score using BOW:** 1.0
- **F1 Score:** 1.0
- **Cross-Validation Accuracy using BOW:** 0.9

#### Possible Reasons for the Results:
- **Accuracy Score and F1 Score of 1.0:** The perfect accuracy and F1 scores suggest that the **SVM** model performs exceptionally well on the test data. As with the previous models, this could indicate **overfitting**, where the model fits the training data too well but may not generalize well to unseen data.

- **Cross-Validation Accuracy:** The cross-validation accuracy of 0.9 is lower than the perfect accuracy on the test set, which again suggests that while the model performs well on specific test splits, it might struggle to generalize to different subsets of the data. This discrepancy points to **overfitting** as a potential issue.

In summary, although the **SVM model** performs perfectly on the test data, the lower cross-validation accuracy suggests that it may be overfitting, highlighting the need for further model tuning to improve generalization.


In [17]:
#-----------------------------------------------------------------------------------------------------
# Support Vector Machine (linear kernel) using bow
svm=SVC(kernel='linear', random_state=42)
svm.fit(X_train_bow, y_train)
y_pred_svm=svm.predict(X_test_bow)
print("Accuracy Score using bow:",accuracy_score(y_test,y_pred_svm))
f1=f1_score(y_test, y_pred_svm, average='weighted')
print("F1 Score:",f1)
# Cross-validate the SVM itself; passing `clf` here would report the logistic
# regression's score under the SVM heading.
cv_scores = cross_val_score(svm, X_train_bow, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy using bow:", cv_scores.mean())
#-----------------------------------------------------------------------------------------------------

Accuracy Score using bow: 1.0
F1 Score: 1.0
Cross-Validation Accuracy using bow: 0.9


### 11. Final Model - Voting Classifier

In this step, a **Voting Classifier** is created using three different models: **GaussianNB**, **Logistic Regression**, and **Support Vector Machine (SVM)**. The models are combined using **hard voting**, where the final prediction is based on the majority vote from the individual models.

The purpose of using the **Voting Classifier** is to combine the strengths of different models to create a more robust classifier. By doing so, the model may be better at generalizing patterns and distinguishing between **publishable** and **non-publishable** papers.

#### Results:
- **Accuracy Score:** 1.0
- **Cross-Validation Accuracy using BOW:** 0.9
- **F1 Score:** 1.0

#### Observations and Conclusion:
- **Accuracy and F1 Scores:** The Voting Classifier provides perfect accuracy and F1 scores on the test set (X_test) and also on the temporary validation set (X_temp). This indicates that the model is able to learn to correctly distinguish between publishable and non-publishable papers.
  
- **Cross-Validation Accuracy:** The cross-validation accuracy of 0.9 suggests that the model generalizes well to different data subsets, further supporting the idea that the model is not overfitting.

- **Overfitting Assessment:** Despite the high accuracy and F1 scores, the model's performance on different data splits (via cross-validation) suggests that it is not overfitting. Moreover, the content and style of the papers (publishable vs non-publishable) exhibit clear differences that the machine learning models (GaussianNB, Logistic Regression, SVM) can easily capture, even with a smaller dataset. Given this distinction in content and style, the model likely learns to distinguish patterns effectively without overfitting.

In conclusion, after reviewing the results and the data, it can be reasonably assumed that the model is not overfitting and has learned the correct patterns to differentiate between **publishable** and **non-publishable** papers. The high performance, combined with the use of a Voting Classifier, makes this approach more robust and reliable.


In [18]:
# Hard-voting ensemble built from the three Bag-of-Words classifiers defined above.
base_estimators = [('gnb', gnb), ('lr', clf), ('svm', svm)]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')
voting_clf.fit(X_train_bow, y_train)

# Hold-out accuracy on the test split.
y_pred_vc = voting_clf.predict(X_test_bow)
test_accuracy = accuracy_score(y_test, y_pred_vc)
print("Accuracy Score:", test_accuracy)

# Weighted F1 on the same predictions, then 5-fold CV accuracy on the training split.
f1 = f1_score(y_test, y_pred_vc, average='weighted')
cv_scores = cross_val_score(
    voting_clf,
    X_train_bow,
    y_train,
    cv=5,
    scoring='accuracy',
)
print("Cross-Validation Accuracy using bow:", cv_scores.mean())
print("F1 Score:", f1)

Accuracy Score: 1.0
Cross-Validation Accuracy using bow: 0.9
F1 Score: 1.0


### Checking Predictions for a Specific PDF

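This code is used to observe the predictions made by different models (GaussianNB, Logistic Regression, SVM, and Voting Classifier) for a given PDF document. The process involves extracting the text from the PDF, cleaning it with `preprocessed_text`, vectorizing it with the Bag-of-Words and TF-IDF vectorizers, and printing the prediction of each Bag-of-Words model.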

#### Purpose:
This code helps to:
- Examine how individual models and the ensemble Voting Classifier predict for a specific input.
- Compare the performance and consistency of different models on the same document.

#### Note:
This step is optional and can be used solely to observe model predictions on a given PDF. If such comparisons are not required, this step can be ignored.


In [19]:
# Extract and clean the sample paper, then vectorize it with the `cv` and `tfidf` vectorizers.
text = text_extractor_from_pdf('Samples/Sample10.pdf')
temp = preprocessed_text(text)
temp_bow = cv.transform([temp]).toarray()
temp_tfidf = tfidf.transform([temp]).toarray()

# Bag-of-Words predictions from each individual model and from the ensemble.
y_gn_bow = gnb.predict(temp_bow)
y_lf_bow = clf.predict(temp_bow)
y_svm_bow = svm.predict(temp_bow)
y_voting_clf_bow = voting_clf.predict(temp_bow)

# TF-IDF predictions are computed as well but are not printed below.
y_gn_tf = gnbtf.predict(temp_tfidf)
y_lf_tf = clftf.predict(temp_tfidf)

# One line per model, separated by the same blank-line spacer as before.
bow_report = [
    ('Prediction due to Bag of Words(GaussianNB): ', y_gn_bow),
    ('Prediction due to Bag of Words(Logistic Regression): ', y_lf_bow),
    ('Prediction due to Bag of Words(SVM): ', y_svm_bow),
    ('Prediction due to Bag of Words(Voting Classifier): ', y_voting_clf_bow),
]
for position, (label, prediction) in enumerate(bow_report):
    if position:
        print('\n')
    print(label, prediction)

Prediction due to Bag of Words(GaussianNB):  [1]


Prediction due to Bag of Words(Logistic Regression):  [1]


Prediction due to Bag of Words(SVM):  [1]


Prediction due to Bag of Words(Voting Classifier):  [1]


---

### `Why Not Use Deep Learning Models?`

We didn’t use any deep learning-based models to classify the papers because the dataset is very small. Deep learning models require a large amount of data to train effectively and generalize well. With such a small dataset, it’s well-known that deep learning models would fail to classify the papers correctly. That’s why we chose simpler machine learning models like GaussianNB, Logistic Regression, and SVM, which perform much better for smaller datasets like this.


---

###                                                    `Task 2`

### Step 1: Setting Up Pathway for Google Drive Integration

This code sets up Pathway to monitor and process files in a Google Drive folder, serving as the foundation for our project. Here's the breakdown:

---

#### Process Overview

1. **Google Drive Integration**:
   - We use the `pw.io.gdrive.read` function to connect Pathway to a specific Google Drive folder.
   - Parameters explained:
     - `object_id`: The unique ID of the Google Drive folder (explained below).
     - `service_user_credentials_file`: The JSON credentials file for the service account that provides secure access to the folder.
     - `mode="streaming"`: Ensures real-time monitoring of the folder for new or updated files.
     - `with_metadata`: Captures additional file details like names and timestamps.
     - `refresh_interval=10`: Sets a refresh interval of 10 seconds for checking folder updates.

2. **Creating the `object_id`**:
   - We created a **Google Drive folder** specifically for this project.
   - A **service account** was set up, with the associated JSON credentials file downloaded.
   - The service account was granted access to this folder.
   - The `object_id` is the unique folder identifier, extracted from the folder's URL.
   - Folder URL: https://drive.google.com/drive/folders/1POboCZVq6bzgVaL-b_7milEq9FEEff2p?usp=sharing

3. **Setting Up Data Sources**:
   - The `data_sources` list includes a table created using `pw.io.gdrive.read`. This table continuously streams and processes the folder's contents.

4. **Running Pathway in a Thread**:
   - Pathway is executed in a separate thread using `threading.Thread` and the `pw.run()` function. This ensures asynchronous real-time processing without interrupting other operations.

---

#### Why This Step Is Important
- **Real-Time Processing**: The folder is monitored for changes, ensuring that newly uploaded files are promptly processed.
- **Secure Access**: By using a service account, we ensure controlled and secure interaction with the Google Drive folder.
- **Asynchronous Execution**: Running Pathway in a separate thread makes it possible to handle other tasks simultaneously.

---

#### Practical Application
This setup allows us to upload research files to a dedicated Google Drive folder, which Pathway can monitor and process in real-time. This approach is particularly useful for managing evolving datasets or shared resources.

--- 


In [20]:
import pathway as pw
import threading

# Function to run Pathway in a separate thread
def run_pathway():
    """Run the Pathway engine via `pw.run()`; used as the target of the background thread below."""
    pw.run()

# Register the Google Drive folder as a streaming Pathway input table (polled every 10 seconds).
# NOTE(review): the folder ID and the credentials path are hard-coded here; consider loading
# them from config.json the same way GEMINI_API_KEY is loaded — confirm with the authors.
table = pw.io.gdrive.read(
    object_id="1POboCZVq6bzgVaL-b_7milEq9FEEff2p",
    service_user_credentials_file="credentials.json",
    mode="streaming",
    with_metadata=True,
    refresh_interval=10,
)

# Sources handed to the VectorStoreServer in the next step (currently only the Drive table).
data_sources = [table]
# data_sources.append(table)

# Start Pathway in a separate daemon thread so the notebook cell returns immediately.
# NOTE(review): this starts pw.run() before the vector store servers are defined, and the later
# cells also call run_server(threaded=True) — verify that this extra run is actually required.
pathway_thread = threading.Thread(target=run_pathway, daemon=True)
pathway_thread.start()


[?1049h[H[?25l[H                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
                                                                                
        

### Step 2: Setting Up the Pathway VectorStore Server for Embedding and Querying

This step focuses on configuring the **Pathway VectorStore Server** to process and embed data from the monitored Google Drive folder. Here's how it works:

---

#### Process Overview

1. **Importing Required Modules**:
   - Pathway's `llm` extension is used to handle embedding, splitting, parsing, and vector storage functionalities.

2. **Defining Components**:
   - **`TokenCountSplitter`**: Splits the text into smaller chunks based on token count. This is useful for handling large documents without exceeding token limits.
   - **`GeminiEmbedder`**: Uses the Gemini API to generate embeddings for text data. This API requires an `api_key` for secure access.
   - **`ParseUnstructured`**: Parses each raw file in `single` mode and then applies the `preprocessed_text` function as a post-processor, so that the text is cleaned and standardized before embedding.

3. **Initializing the VectorStore Server**:
   - **Data Sources**: Includes all the files monitored by Pathway in the Google Drive folder (from Step 1).
   - **Parser**: Processes raw input files into structured, preprocessed text.
   - **Embedder**: Generates numerical embeddings for the text, which are stored in the vector database for similarity-based operations.
   - **Splitter**: Divides the text into manageable chunks, enhancing the embedding and retrieval processes.

4. **Running the VectorStore Server**:
   - **Configuration**: The server is configured to run locally (`127.0.0.1`) on a specified port (`8765`).
   - **Threaded Mode**: Ensures non-blocking execution, allowing multiple processes to interact with the server simultaneously.
   - **Cache Disabled**: For this instance, caching is turned off to avoid stale data issues during processing.

5. **Workaround for Colab**:
   - A `time.sleep(30)` command is added to keep the cell running for 30 seconds, as threads in Colab require active cells for visibility. 

---

#### Why This Step Is Important
- **Efficient Data Handling**: The use of text splitting and preprocessing ensures smooth handling of large or unstructured files.
- **Embedding for Similarity Queries**: By embedding text into numerical vectors, the server allows semantic searches and similarity-based operations.
- **Real-Time Vector Database**: Enables the system to store and retrieve vector embeddings in real time, crucial for applications like search engines, recommendation systems, and classification tasks.

---

#### Practical Application
This setup enables us to process and store embeddings for research papers or other documents uploaded to Google Drive. The embeddings can later be used for querying similar documents, clustering, or other machine learning tasks.

---


In [21]:
from pathway.xpacks.llm.embedders import GeminiEmbedder
from pathway.xpacks.llm.parsers import ParseUnstructured
from pathway.xpacks.llm.splitters import TokenCountSplitter
from pathway.xpacks.llm.vector_store import VectorStoreClient, VectorStoreServer

PATHWAY_PORT = 8765

# Document-processing components: parse -> clean (preprocessed_text) -> split -> embed.
parser = ParseUnstructured(mode='single', post_processors=[preprocessed_text])
text_splitter = TokenCountSplitter()
embedder = GeminiEmbedder(api_key=GEMINI_API_KEY)

# Vector store over the Google Drive sources, served locally in a background thread.
vector_server = VectorStoreServer(
    *data_sources,
    parser=parser,
    embedder=embedder,
    splitter=text_splitter,
)
vector_server.run_server(
    host="127.0.0.1",
    port=PATHWAY_PORT,
    threaded=True,
    with_cache=False,
)

# Workaround for Colab - messages from threads are not visible unless a cell is running
time.sleep(30)

(Press CTRL+C to quit)


In [22]:
# table.schema.column_names()[1]

### Step 3: Connecting to the VectorStore Server

In this step, we establish a connection to the **Pathway VectorStore Server** set up in Step 2.

---

#### Process Overview

1. **`VectorStoreClient` Initialization**:
   - The `VectorStoreClient` connects to the running **VectorStore Server** instance. 
   - The connection is established by specifying:
     - **Host**: `127.0.0.1` (local server).
     - **Port**: The port used by the server, defined as `PATHWAY_PORT` (set to 8765 in Step 2).

2. **Purpose of the Client**:
   - This client acts as an interface to communicate with the server, enabling operations such as similarity queries (`query`), listing the indexed input files (`get_input_files`), and retrieving index statistics (`get_vectorstore_statistics`).

---

#### Why This Step Is Important
- **Server Communication**: The client is essential for interacting with the vector database created by the server.
- **Data Retrieval and Querying**: Using this connection, we can perform semantic searches, retrieve similar documents, or integrate embeddings into downstream tasks.

---

#### Practical Application
With this client, we can:
- Query the vector database to find similar documents based on embeddings.
- Use the embeddings stored in the database for tasks like classification, clustering, or recommendation.

---


In [23]:
# Client for the Drive-backed vector store listening on PATHWAY_PORT.
client = VectorStoreClient(host="127.0.0.1", port=PATHWAY_PORT, timeout=30)

### Step 4: Loading Conference Papers into Vector Store and Running the Server

In this step, we load the conference papers into a vector store and run a server to handle vector-based queries. Here's how the process works:

1. **Conference Folders Setup**:
   - A dictionary `conference_folders` is defined, mapping each conference name to its respective folder path. Each folder contains PDF papers related to a specific conference (e.g., CVPR, EMNLP, KDD, NeurIPS, TMLR).

2. **Reading PDF Files**:
   - The code iterates through the conference folder paths and uses the `pw.io.fs.read` function to read the PDF files.
   - The files are read using a binary format, ensuring that metadata is included, and the files are ingested in static mode, meaning they are processed once and not repeatedly loaded.
   - The resulting table for each conference is appended to the `reference_sources` list.

3. **Creating Vector Store Server**:
   - The `VectorStoreServer` is initialized with the loaded conference papers and necessary configurations such as a parser, embedder, and text splitter.
   - The server is then run on `127.0.0.1` at port `8000`, enabling vector-based queries for the conference papers.

4. **Running the Server**:
   - The server is started in a threaded mode, allowing for concurrent handling of multiple requests.

This setup ensures that the conference papers are ready to be queried based on vector similarities, enabling efficient classification and retrieval.

In [24]:
# Folder holding the accepted reference papers of each target conference.
conference_folders = {
    "CVPR": r"Reference/Publishable/CVPR",
    "EMNLP": r"Reference/Publishable/EMNLP",
    "KDD": r"Reference/Publishable/KDD",
    "NeurIPS": r"Reference/Publishable/NeurIPS",
    "TMLR": r"Reference/Publishable/TMLR"
}

# A missing folder would silently produce an empty input, so surface it here
# (best-effort warning only; the remaining folders are still indexed).
missing_folders = [path for path in conference_folders.values() if not os.path.isdir(path)]
if missing_folders:
    print("Warning: reference folders not found:", ", ".join(missing_folders))

# One static (read-once) binary table per conference folder. Built with a
# comprehension so the notebook-global `table` from Step 1 (the Google Drive
# input) is no longer overwritten by a loop variable.
reference_sources = [
    pw.io.fs.read(
        path=folder_path + "/*.pdf",  # Glob pattern to match all PDF files
        format="binary",
        with_metadata=True,
        mode="static",  # Static mode to ingest data once
    )
    for folder_path in conference_folders.values()
]

# Create a vector store server for conferences.
# NOTE: this rebinds `vector_server`, which previously referred to the Drive
# server from Step 2; the name is kept for compatibility with the original cell.
vector_server = VectorStoreServer(
    *reference_sources,
    parser=parser,
    embedder=embedder,
    splitter=text_splitter,
)
vector_server.run_server(host="127.0.0.1", port=8000, threaded=True)


<Thread(VectorStoreServer, started 14465888256)>

In [25]:
# reference_sources

In [26]:
# for tv in reference_sources:
#     print(tv.schema.column_names())

In [27]:
# content_table=[]
# for table in reference_sources:
#     content_table.append(table.select(pdf_content=table.data))


In [28]:
 # pw.debug.compute_and_print(content_table[0], include_id=False)

### Step 5: Paper Classification using Gemini and VectorStore

In this step, we implement a system to classify research papers into conferences based on their content using the **Gemini** model and a preloaded vector store. Here's how the process works:

1. **PDF Extraction**:
   - The `classify_rationale` function checks the given path and opens the PDF with PyPDF2's `PdfReader`.
   - It extracts the text of every page and joins the pages into a single string.
   - The combined text is then passed to `classify_paper` together with the list of conference names.

2. **Vector Store Setup**:
   - We configure a `VectorStoreClient` to connect to a local vector store running on `127.0.0.1:8000`.
   - The papers from each conference folder are processed and added to the vector store, where they will be stored as embeddings for future similarity-based searches.

3. **Paper Classification**:
   - When a new paper is provided for classification, its content is extracted with PyPDF2's `PdfReader` inside `classify_rationale`.
   - The extracted text is sent to the vector store to perform a similarity search. The most relevant paper from the preloaded papers is identified based on the similarity score.
   - The system uses the **Gemini** model to classify the paper. A prompt is constructed with the paper content and relevant passage from the search, asking Gemini to classify the paper into one of the predefined conferences.

4. **Using Gemini API**:
   - The `genai` library is used to interact with the Gemini model. It is configured with an API key (`GEMINI_API_KEY`), which is required for the model to generate the response.
   - The generated response contains the classification (i.e., the most relevant conference) and a rationale explaining why the paper belongs to that particular conference.

5. **Final Output**:
   - The system returns the classification and rationale for the paper, providing a clear understanding of why the paper was assigned to the specific conference.

---

### Important Notes:
- Set `GEMINI_API_KEY` in `config.json` to your actual Gemini API key; the notebook loads the key from that file at startup.
- The vector store should be running and accessible at `127.0.0.1:8000` for successful integration with the classification system.


In [29]:
# from pathway.stdlib.indexing import BruteForceKnnFactory, HybridIndexFactory
# from pathway.stdlib.indexing.bm25 import TantivyBM25Factory

# hybrid_index = HybridIndexFactory(
#     [
#         TantivyBM25Factory(),  # BM25-based keyword search
#         BruteForceKnnFactory(embedder=embedder)  # Vector-based semantic search
#     ]
# )
# for docs in publishable_texts:
#     hybrid_index.build_index(docs)

In [30]:
# dir(hybrid_index)

In [31]:
import os
import google.generativeai as genai
from google.api_core import retry

# Client for the reference-paper vector store on port 8000 (longer timeout for large queries).
resource_client = VectorStoreClient(host="127.0.0.1", port=8000, timeout=120)

def _split_classification_rationale(response_text):
    """Split a model reply into ``(classification, rationale)``.

    The reply is cut at the first ``Rationale:`` marker. Markdown bold markers
    (``**``) and any ``Classification:`` label are removed from the label part.
    When the marker is absent the whole reply is treated as the label.
    """
    if "Rationale:" in response_text:
        label_part, rationale_part = response_text.split("Rationale:", maxsplit=1)
        rationale = rationale_part.replace("**", "").strip()
    else:
        label_part = response_text
        rationale = "Rationale not provided."

    classification = label_part.replace("**", "").replace("Classification:", "").strip()
    return classification, rationale


def classify_paper(paper_text,conferences, genai_model):
    """Classify a paper into one of ``conferences`` and explain the choice.

    Args:
        paper_text: Full text of the paper to classify.
        conferences: Iterable of conference names offered to the model.
        genai_model: Object exposing ``generate_content(prompt)`` whose result
            has a ``parts`` sequence of items with a ``text`` attribute.

    Returns:
        A ``(classification, rationale)`` tuple of strings. When the vector
        store returns no match, or the model reply contains no parts, the
        tuple is ``("Could not classify the paper.", "")``.
    """
    # Perform similarity search against the reference-paper vector store.
    # NOTE(review): the query is sent as a one-element list, exactly as before —
    # confirm that the server accepts a list here rather than a plain string.
    result = resource_client.query(query=[paper_text], k=1)

    if not result:
        return "Could not classify the paper.", ""

    # The closest match's text may be a single string or a collection of chunks.
    # Joining a plain string with " " would insert a space between every
    # character, so only join when it is not already a string.
    closest_text = result[0].get("text") or ""
    if isinstance(closest_text, str):
        passage = closest_text
    else:
        passage = " ".join(str(chunk) for chunk in closest_text)
    passage_oneline = passage.replace("\n", " ")

    # Generate classification and rationale
    # NOTE(review): the prompt labels the content "(trimmed)" but the full text is sent.
    prompt = (
        f"Classify the following paper into one of these conferences: {', '.join(conferences)}.\n"
        f"Paper Content (trimmed): {paper_text}...\n"
        f"Relevant Passage: {passage_oneline}\n"
        f"Provide the closest match classification from the listed conferences. The classification must not be 'None of the above'\n"
        f"Additionally, provide a separate rationale for the classification (not more than 100 words). Start your rationale with 'Rationale:'."
    )

    reply_parts = genai_model.generate_content(prompt).parts
    if not reply_parts:
        # An empty reply would otherwise raise IndexError on parts[0].
        return "Could not classify the paper.", ""

    return _split_classification_rationale(reply_parts[0].text)


# Authenticate the Gemini SDK, then build the shared model handle used for classification.
genai.configure(api_key=GEMINI_API_KEY)
genai_model = genai.GenerativeModel(model_name="gemini-1.5-pro")

def classify_rationale(paper_path):
    """Extract the text of a PDF paper and classify it into a conference.

    Args:
        paper_path: Path to a ``.pdf`` file on disk.

    Returns:
        The ``(classification, rationale)`` tuple produced by ``classify_paper``
        for the conferences listed in ``conference_folders``.

    Raises:
        FileNotFoundError: If ``paper_path`` is not an existing file.
        ValueError: If ``paper_path`` does not have a ``.pdf`` extension.
    """
    # Previously an invalid path only printed a message and then fell through
    # to open()/PdfReader; fail explicitly before touching the file instead.
    if not os.path.isfile(paper_path):
        raise FileNotFoundError(f"Invalid file. Paper not found: {paper_path}")
    if not paper_path.lower().endswith(".pdf"):
        raise ValueError(f"Invalid file. Expected a .pdf file, got: {paper_path}")

    with open(paper_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Guard against pages for which no text could be extracted.
        paper_text = " ".join((page.extract_text() or "") for page in pdf_reader.pages)

    # The file is no longer needed once the text has been extracted.
    return classify_paper(paper_text, list(conference_folders.keys()), genai_model)

In [32]:
# resource_client.get_vectorstore_statistics()

In [33]:
client.get_vectorstore_statistics()

ERROR:pathway_engine.persistence.state:Rotation id is unparsable from the key 2-0-0 2
ERROR:pathway_engine.persistence.state:Rotation id is unparsable from the key 2-0-0 2


(Press CTRL+C to quit)


{'file_count': 1, 'last_modified': None, 'last_indexed': 1737186575}

### Step 6: Managing the Results List

In this part of the code, we initialize an empty DataFrame (`df2`) to store the results later. We define a folder path (`./GetDocuments`) where the downloaded PDF files will be stored. The `os.makedirs()` function ensures that this folder is created if it doesn't already exist. 

We also define the `prefix` variable, which holds the base URL needed to construct the download link for each file from Google Drive. The empty list `results` will store the classification information of the papers and is used globally across the process. 

#### Purpose:
- **Create necessary directories**: The `./GetDocuments` folder is created to store the PDFs downloaded from Google Drive.
- **Prepare the results container**: The `results` list is initialized here so that it can be used across multiple code snippets to accumulate the classification and metadata of the papers.
- **Define download URL prefix**: The `prefix` helps in forming the complete URL for downloading files from Google Drive.

This is a setup phase, and it does not perform any processing yet, but ensures that we are ready to handle the download and classification process in the next steps.


In [34]:
import gdown,sys

# Empty placeholder; replaced by the results table once papers have been classified.
df2 = pd.DataFrame()

# Base of the Google Drive direct-download URL; the file id is appended to it.
# (The query string previously read "uc?/export=download", with a stray "/".)
prefix='https://drive.google.com/uc?export=download&id='

# Local folder that receives the PDFs downloaded from Google Drive.
os.makedirs('./GetDocuments', exist_ok=True)

# Accumulates (paper id, publishable flag, conference, rationale) tuples across runs.
results=[]

### Step 7: Downloading and Classifying New Files

In this part of the code, we retrieve the list of files (PDFs) from the vector store using the `client.get_input_files()` method. If there are any documents in the vector store, the code proceeds to download the PDFs from Google Drive (https://drive.google.com/drive/folders/1POboCZVq6bzgVaL-b_7milEq9FEEff2p?usp=sharing) using the `gdown.download()` method.

For each document:
1. **File Download**: The PDF files are downloaded to the `./GetDocuments` folder using the file URL and `gdown`.
2. **Text Extraction and Preprocessing**: The content of each downloaded PDF is extracted and preprocessed with the `preprocessed_text()` function.
3. **Bag of Words Transformation**: The preprocessed text is transformed using the BOW (Bag of Words) method.
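4. **Prediction**: The Voting Classifier predicts whether the paper is publishable based on the BOW transformation. For papers predicted publishable, the conference classification and rationale are obtained from `classify_rationale`, which works on the text extracted from the downloaded PDF; non-publishable papers get `'na'` for both fields.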
5. **Result Collection**: All results (publishable status, classification, rationale) are added to the `results` list, which is then used to generate a DataFrame (`df2`).

#### Purpose:
- **Download PDFs**: PDFs stored in the vector store are downloaded to the local machine, ensuring we work with the latest documents.
- **Text Extraction and Transformation**: We extract and preprocess the text from the PDFs, which is crucial for accurate prediction and classification.
- **Prediction**: The model (based on BOW) is used to determine whether a paper is publishable, followed by a classification using the LLM model, providing a rationale for the classification.
- **Storing Results**: The results of each file's classification are stored in the `results` list, which is then formatted into a DataFrame (`df2`) for easier access.

**Note:** This cell must be run manually after new files have been stored in the vector store. It does not run automatically when files are added; re-run it to classify newly uploaded documents and regenerate the results DataFrame (`df2`).

In [35]:
# Files currently indexed by the Drive-backed vector store.
document_info = client.get_input_files()
if document_info:
    # Skip papers that already have a row in `results`, not papers that merely
    # exist on disk. A file left in ./GetDocuments by an earlier session or an
    # interrupted run was previously skipped and therefore never classified.
    classified_ids = {row[0] for row in results}
    for doc in document_info:
        file_name = doc['name']
        paper_id = file_name.split('.')[0]
        if paper_id in classified_ids:
            print(f"{file_name} already classified. Skipping.")
            continue

        output_path = f'./GetDocuments/{file_name}'  # Set the desired output path
        if not os.path.exists(output_path):
            file_id = doc['url'].split('/')[-2]
            print(f"Downloading {file_name}...")
            gdown.download(prefix + file_id, output_path, quiet=False)
            if not os.path.exists(output_path):
                # Do not try to parse a file that never arrived.
                print(f"Download failed for {file_name}. Skipping.")
                continue
        else:
            print(f"{file_name} already exists. Skipping download.")

        # Publishability check with the Bag-of-Words voting classifier.
        processed_text = preprocessed_text(text_extractor_from_pdf(output_path))
        processed_text_bow = cv.transform([processed_text]).toarray()
        pt_voting_clf_bow = voting_clf.predict(processed_text_bow)
        is_publishable = pt_voting_clf_bow[0]

        if is_publishable:
            # Only publishable papers are sent to the LLM for a conference label.
            classification, rationale = classify_rationale(output_path)
            time.sleep(2)  # short pause between LLM calls (presumably rate limiting)
            results.append((paper_id, is_publishable, classification, rationale))
        else:
            results.append((paper_id, is_publishable, 'na', 'na'))
        classified_ids.add(paper_id)

    df2 = pd.DataFrame(results, columns=['Paper ID', 'Publishable', 'Conference', 'Rationale'])
    # Sort by the numeric part of the id. float instead of int so that an id
    # without digits becomes NaN (sorted last) instead of raising.
    df2['Paper ID Number'] = df2['Paper ID'].str.extract(r'(\d+)').astype(float)
    df2 = df2.sort_values(by='Paper ID Number').drop(columns=['Paper ID Number'])
    df2 = df2.reset_index(drop=True)
else:
    print('No files present in drive!!')

Downloading R008.pdf...


Downloading...
From: https://drive.google.com/uc?/export=download&id=1uJpRdAHtRmohbfX0UP1lWdagjxPzHiT3
To: /Users/malyadippal/Desktop/Code_Conquerors_KDSH_ROUND2/Task/GetDocuments/R008.pdf
100%|██████████████████████████████████████| 70.9k/70.9k [00:00<00:00, 1.00MB/s]


In [36]:
df2

Unnamed: 0,Paper ID,Publishable,Conference,Rationale
0,R008,1,EMNLP,The paper focuses on Noun-Noun compound interp...


In [37]:
# df2.to_csv("results.csv", index=False)

In [38]:
# Classify one local paper and show the label followed by the model's rationale.
classification, rationale = classify_rationale("papers/P008.pdf")
report_lines = (("Classification:", classification), ("Rationale:", rationale))
for position, (heading, value) in enumerate(report_lines):
    if position:
        print("\n")
    print(heading, value)

Classification: EMNLP


Rationale: The paper focuses on Chain-of-Thought (CoT) prompting in large language models, a topic central to natural language processing. It analyzes CoT's impact on reasoning tasks, sample complexity, and generalization, all within the scope of NLP and thus aligning with EMNLP's focus.  The mention of related work in question answering and commonsense reasoning further strengthens this classification.
