In [1]:
!pip install nltk scikit-learn spacy pdfplumber transformers




In [2]:
import nltk
import spacy.cli
import spacy.util

# Fetch the NLTK stopword corpus used by the preprocessing step.
nltk.download('stopwords')

# spacy.cli.download runs a pip download and asks for a kernel restart each
# time it is called, so only invoke it when the model package is not already
# installed. This keeps "Restart & Run All" cheap and idempotent.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aanandprabhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Smoke test: each import below raises ImportError if the library is missing.
import nltk
import spacy
import sklearn

# Reaching this point means all three imports succeeded.
for status_line in ("All set! Your libraries are working fine.", "Thanks!"):
    print(status_line)


All set! Your libraries are working fine.
Thanks!


In [4]:
import pandas as pd

# Read the labelled abstracts from the CSV that sits next to the notebook,
# then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="NLP_Abstract_Dataset (Discipline).csv")
df.head(n=5)

Unnamed: 0,ID,Discipline,Abstract
0,1,CS,"Large Language Models (LLMs), such as ChatGPT ..."
1,2,CS,Despite the success of deep learning in close-...
2,3,CS,Data analysis plays an indispensable role for ...
3,4,CS,The goal of user experience design in industry...
4,5,CS,Elliptic curve cryptosystems are considered an...


In [5]:
import re
from nltk.corpus import stopwords

# English stopwords from NLTK, held in a set for fast membership tests
stop_words = {*stopwords.words('english')}

#Pre-processing function
def preprocess(text, stop_word_set=None):
    """Normalise one abstract into a space-separated string of content words.

    Steps: lowercase, replace every character that is not a-z or whitespace
    with a space, split on whitespace, drop stopwords, re-join with single
    spaces.

    Parameters
    ----------
    text : str
        Raw abstract. Any non-string value (e.g. the float NaN pandas uses
        for a missing CSV cell, or None) is treated as an empty document.
    stop_word_set : collection of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words`` set.

    Returns
    -------
    str
        Cleaned text; ``""`` when nothing survives or the input is missing.
    """
    if stop_word_set is None:
        stop_word_set = stop_words  # notebook-level NLTK English stopwords

    # Missing abstracts arrive as NaN/None; calling .lower() on them would
    # raise AttributeError and abort the whole DataFrame.apply().
    if not isinstance(text, str):
        return ""

    text = text.lower()  # convert to lowercase
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove non-letter characters
    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate squeeze is needed.
    tokens = [word for word in text.split() if word not in stop_word_set]
    return " ".join(tokens)


# Clean every abstract element-wise and store the result in a new column
df['cleaned_abstract'] = df['Abstract'].map(preprocess)

# Show raw and cleaned text side by side for the first five rows
df.loc[:, ['Abstract', 'cleaned_abstract']].head(n=5)

Unnamed: 0,Abstract,cleaned_abstract
0,"Large Language Models (LLMs), such as ChatGPT ...",large language models llms chatgpt bard revolu...
1,Despite the success of deep learning in close-...,despite success deep learning close set object...
2,Data analysis plays an indispensable role for ...,data analysis plays indispensable role underst...
3,The goal of user experience design in industry...,goal user experience design industry improve c...
4,Elliptic curve cryptosystems are considered an...,elliptic curve cryptosystems considered effici...


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encoder that turns each discipline name into an integer class id
le = LabelEncoder()

# Learn the set of classes from the Discipline column, then encode that column
le.fit(df['Discipline'])
df['label'] = le.transform(df['Discipline'])

# Pair every learned class name with the id the encoder assigns to it
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(le.classes_, le.transform(le.classes_))
}
print("Label Mapping:", label_mapping)

# Display the original discipline next to its encoded label
df[['Discipline', 'label']]

Label Mapping: {'CS': 0, 'IS': 1, 'IT': 2}


Unnamed: 0,Discipline,label
0,CS,0
1,CS,0
2,CS,0
3,CS,0
4,CS,0
5,IS,1
6,IS,1
7,IS,1
8,IS,1
9,IS,1


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',   # also apply scikit-learn's built-in English stop-word list
    lowercase=True          # ensures all words are lowercase
)

# Fit the vectorizer on the cleaned abstracts and transform them into a
# sparse document-term matrix (rows = abstracts, columns = vocabulary terms)
X = vectorizer.fit_transform(df['cleaned_abstract'])

# Check the result
print("TF-IDF Matrix Shape:", X.shape)  # Rows = abstracts, Columns = unique words
# Densify only the first row: X.toarray() would materialise the whole matrix
# in memory just to show one vector. ravel() flattens the 1 x n slice to 1-D.
print("Example TF-IDF vector for first abstract:\n", X[0].toarray().ravel())

TF-IDF Matrix Shape: (15, 847)
Example TF-IDF vector for first abstract:
 [0.         0.06231104 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.07392395 0.         0.         0.         0.
 0.         0.         0.         0.06231104 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.06231104 0.18693312 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.06231104 0.06231104 0.         0.         0.
 0.         0.06231104 0.         0.         0.         0.06231104
 0.         0.         0.        