In [3]:
# STEP 1: Load and Preprocess the Dataset

import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords


nltk.download('stopwords')

# Load dataset
df = pd.read_csv("NLP_Abstract_Dataset (Discipline)(105).csv")

# Basic cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Apply cleaning
df['Clean_Abstract'] = df['Abstract'].apply(clean_text)

# Encode labels
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Discipline'])

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df['Clean_Abstract'], df['Label'], test_size=0.2, random_state=42, stratify=df['Label']
)

# Quick check
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))
print("Label mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))

Training samples: 84
Test samples: 21
Label mapping: {'CS': 0, 'IS': 1, 'IT': 2}


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aanandprabhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# STEP 2: TF-IDF Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Initialize vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=2000)  

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Save the vectorizer for future use
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Sanity check
print("TF-IDF Vectorization complete.")
print("Training shape:", X_train_tfidf.shape)
print("Test shape:", X_test_tfidf.shape)

TF-IDF Vectorization complete.
Training shape: (84, 2000)
Test shape: (21, 2000)


In [5]:
# STEP 3: Train & Evaluate Models

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_tfidf, y_train)
y_pred_logreg = logreg.predict(X_test_tfidf)

# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("\nLogistic Regression Report:\n", classification_report(y_test, y_pred_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))

print("\n===============================\n")

print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("\nNaive Bayes Report:\n", classification_report(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))

Logistic Regression Accuracy: 0.9047619047619048

Logistic Regression Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.86      0.86      0.86         7
           2       0.86      0.86      0.86         7

    accuracy                           0.90        21
   macro avg       0.90      0.90      0.90        21
weighted avg       0.90      0.90      0.90        21

Confusion Matrix:
 [[7 0 0]
 [0 6 1]
 [0 1 6]]


Naive Bayes Accuracy: 0.9047619047619048

Naive Bayes Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       0.86      0.86      0.86         7
           2       0.86      0.86      0.86         7

    accuracy                           0.90        21
   macro avg       0.90      0.90      0.90        21
weighted avg       0.90      0.90      0.90        21

Confusion Matrix:
 [[7 0 0]
 [0 6 1]
 [0 1 6]]


### Model Selection Rationale

Both Logistic Regression and Naive Bayes achieved 90% accuracy on the test set. However, Logistic Regression was chosen as the final model because:
- It showed perfect prediction for CS abstracts (class 0)
- Only 2 total misclassifications (IS ↔ IT)
- It offers better interpretability, making it more suitable for extension into subclassification tasks later
- Logistic Regression also performed better in earlier prototype testing

In [6]:
# STEP 4: Save the final model
import joblib

# Save Logistic Regression model to disk
joblib.dump(logreg, "discipline_classifier_logreg.pkl")

print("Final Logistic Regression model saved as discipline_classifier_logreg.pkl")

Final Logistic Regression model saved as discipline_classifier_logreg.pkl
