In [1]:
!pip install nltk scikit-learn spacy pdfplumber transformers




In [2]:
import nltk
nltk.download('stopwords')

import spacy.cli
spacy.cli.download("en_core_web_sm")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aanandprabhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import nltk
import spacy
import sklearn

print("All set! Your libraries are working fine.")
print("Thanks!")


All set! Your libraries are working fine.
Thanks!


In [4]:
import pandas as pd

df = pd.read_csv("NLP_Abstract_Dataset (Discipline).csv")
df.head()

Unnamed: 0,ID,Discipline,Abstract
0,1,CS,"Large Language Models (LLMs), such as ChatGPT ..."
1,2,CS,Despite the success of deep learning in close-...
2,3,CS,Data analysis plays an indispensable role for ...
3,4,CS,The goal of user experience design in industry...
4,5,CS,Elliptic curve cryptosystems are considered an...


In [5]:
import re
from nltk.corpus import stopwords

#Load the english stopwords
stop_words = set(stopwords.words('english'))

#Pre-processing function
def preprocess(text):
    text = text.lower() #convert to lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove non-letter characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    tokens = [word for word in text.split() if word not in stop_words]  # remove stopwords
    return " ".join(tokens)


# Apply preprocessing
df['cleaned_abstract'] = df['Abstract'].apply(preprocess)

# View sample result
df[['Abstract', 'cleaned_abstract']].head()

Unnamed: 0,Abstract,cleaned_abstract
0,"Large Language Models (LLMs), such as ChatGPT ...",large language models llms chatgpt bard revolu...
1,Despite the success of deep learning in close-...,despite success deep learning close set object...
2,Data analysis plays an indispensable role for ...,data analysis plays indispensable role underst...
3,The goal of user experience design in industry...,goal user experience design industry improve c...
4,Elliptic curve cryptosystems are considered an...,elliptic curve cryptosystems considered effici...


In [7]:
from sklearn.preprocessing import LabelEncoder

#Create encoder
le = LabelEncoder()

#Fit and transform the Discipline column
df['label'] = le.fit_transform(df['Discipline'])

#Shows the label mappings 
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:", label_mapping)

#View the new column
df[['Discipline', 'label']]

Label Mapping: {'CS': 0, 'IS': 1, 'IT': 2}


Unnamed: 0,Discipline,label
0,CS,0
1,CS,0
2,CS,0
3,CS,0
4,CS,0
5,IS,1
6,IS,1
7,IS,1
8,IS,1
9,IS,1


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Create the vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',   # remove common stop words like "the", "is", etc.
    lowercase=True          # ensures all words are lowercase
)

#Fit the vectorizer on your cleaned abstracts and transform them into vectors
X = vectorizer.fit_transform(df['cleaned_abstract'])

#Check the result
print("TF-IDF Matrix Shape:", X.shape)  # Rows = abstracts, Columns = unique words
print("Example TF-IDF vector for first abstract:\n", X.toarray()[0])

TF-IDF Matrix Shape: (15, 847)
Example TF-IDF vector for first abstract:
 [0.         0.06231104 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.07392395 0.         0.         0.         0.
 0.         0.         0.         0.06231104 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.06231104 0.18693312 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.06231104 0.06231104 0.         0.         0.
 0.         0.06231104 0.         0.         0.         0.06231104
 0.         0.         0.        

### Model Comparison
We compare two common classification algorithms:
- Logistic Regression (baseline model)
- Multinomial Naive Bayes (suitable for smaller datasets and text data)

Both are trained on the same TF-IDF feature matrix and evaluated on a common test set.

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#Define features and labels
y = df['label']  #encoded discipline labels

#Splits data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

#Train a Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

#Make predictions
y_pred = model.predict(X_test)

#Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Classification Report:
              precision    recall  f1-score   support

          CS       1.00      1.00      1.00         1
          IS       1.00      1.00      1.00         1
          IT       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [12]:
from sklearn.naive_bayes import MultinomialNB

#Train Naive Bayes on the same TF-IDF vectors
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

#Predict and evaluate
y_pred_nb = nb_model.predict(X_test)

#Print results
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb, target_names=le.classes_))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

          CS       0.00      0.00      0.00         1
          IS       1.00      1.00      1.00         1
          IT       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 🔍 Model Comparison: Logistic Regression vs Naive Bayes

To evaluate different classification strategies for predicting the research discipline (CS, IS, IT), we implemented and compared two widely used text classification models:

### 1. **Logistic Regression**
- Handles overlapping features better
- Performs well with TF-IDF vectors
- Requires more data but generalizes well

### 2. **Multinomial Naive Bayes**
- Simple and fast
- Often preferred for small datasets and count-based features
- Assumes word independence, which may limit performance with TF-IDF

Both models were trained on the same TF-IDF feature matrix and evaluated using a common test set.

The results are shown below using `classification_report`, comparing precision, recall, and F1-score across classes.

In [13]:
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Naive Bayes Results:")
print(classification_report(y_test, y_pred_nb, target_names=le.classes_))

Logistic Regression Results:
              precision    recall  f1-score   support

          CS       1.00      1.00      1.00         1
          IS       1.00      1.00      1.00         1
          IT       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Naive Bayes Results:
              precision    recall  f1-score   support

          CS       0.00      0.00      0.00         1
          IS       1.00      1.00      1.00         1
          IT       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### ✅ Key Takeaway: Initial Model Comparison on Prototype Dataset

To validate the machine learning pipeline, both Logistic Regression and Multinomial Naive Bayes were implemented and evaluated using a small prototype dataset of 15 research abstracts.

The goal at this stage was not to optimize performance, but to test end-to-end functionality — from text preprocessing to model evaluation — using TF-IDF feature extraction and `classification_report`.

Despite the dataset’s limited size:
- **Logistic Regression** correctly classified all test samples across all classes
- **Naive Bayes** failed to predict one class (CS), resulting in lower overall accuracy and F1-score

This preliminary comparison suggests that Logistic Regression is better suited to the TF-IDF-based feature space in this context. However, these results are **not statistically significant** due to the extremely small dataset and are intended only to demonstrate that the pipeline works.

Further evaluation will be conducted once more data is added.