In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!pip install nltk scikit-learn pandas



# Initialization

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import zipfile
import os
import re
import string
from scipy.sparse import csr_matrix

In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
zip_file_path = '/content/drive/MyDrive/SEM7_SLP/sentence+classification.zip'

with zipfile.ZipFile(zip_file_path, 'r') as z:
    z.extractall('/content/extracted_data')

extracted_files = os.listdir('/content/extracted_data')
print("Extracted files:", extracted_files)

Extracted files: ['SentenceCorpus', '__MACOSX']


In [6]:
labeled_articles_dir = '/content/extracted_data/SentenceCorpus/labeled_articles'

In [7]:
data = []
for annot_file in os.listdir(labeled_articles_dir):
    file_path = os.path.join(labeled_articles_dir, annot_file)
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        sentences = file.readlines()
        domain = annot_file.split('_')[0]
        for sentence in sentences:
            data.append([sentence.strip(), domain])

In [8]:
df = pd.DataFrame(data, columns=['text', 'domain'])
df.head()

Unnamed: 0,text,domain
0,### abstract ###,jdm
1,"MISC\tsimilar to research on risky choice, the...",jdm
2,MISC\tthe well-known allais paradox contradict...,jdm
3,AIMX\twe describe a violation of the law of di...,jdm
4,### introduction ###,jdm


#  Preprocessing

In [9]:
#initialize NLTK tools
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(text, remove_stopwords=True, use_stemming=False, use_lemmatization=False):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    if remove_stopwords:
        tokens = [word for word in tokens if word not in stopwords.words('english')]
    if use_stemming:
        tokens = [stemmer.stem(word) for word in tokens]
    elif use_lemmatization:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

In [11]:
df['preprocessed_text'] = df['text'].apply(lambda x: preprocess_text(x, remove_stopwords=True))

In [12]:
df['preprocessed_text_stemmed'] = df['text'].apply(lambda x: preprocess_text(x, remove_stopwords=True, use_stemming=True))
df['preprocessed_text_lemmatized'] = df['text'].apply(lambda x: preprocess_text(x, remove_stopwords=True, use_lemmatization=True))

In [13]:
df.head()

Unnamed: 0,text,domain,preprocessed_text,preprocessed_text_stemmed,preprocessed_text_lemmatized
0,### abstract ###,jdm,abstract,abstract,abstract
1,"MISC\tsimilar to research on risky choice, the...",jdm,misc similar research risky choice traditional...,misc similar research riski choic tradit analy...,misc similar research risky choice traditional...
2,MISC\tthe well-known allais paradox contradict...,jdm,misc wellknown allais paradox contradicts fund...,misc wellknown allai paradox contradict fundam...,misc wellknown allais paradox contradicts fund...
3,AIMX\twe describe a violation of the law of di...,jdm,aimx describe violation law diminishing margin...,aimx describ violat law diminish margin util w...,aimx describe violation law diminishing margin...
4,### introduction ###,jdm,introduction,introduct,introduction


# Text Classification

In [14]:
import numpy as np
from collections import defaultdict
from math import log

In [15]:
class NgramFeatureExtractor:
    def __init__(self, n=2):
        self.n = n
        self.vocabulary = {}

    def generate_ngrams(self, tokens):
        ngrams = zip(*[tokens[i:] for i in range(self.n)])
        return [" ".join(ngram) for ngram in ngrams]

    def fit(self, documents):
        all_ngrams = []
        for doc in documents:
            tokens = doc.split() if isinstance(doc, str) else doc

            unigrams = tokens #must be converted into a list for concatenation
            bigrams = self.generate_ngrams(doc) if self.n >= 2 else []
            trigrams = self.generate_ngrams(doc) if self.n >= 3 else []
            all_ngrams.extend(unigrams + bigrams + trigrams) #.extend() adds items to end of current list

        self.vocabulary = {ngram: idx for idx, ngram in enumerate(set(all_ngrams))}

    def transform(self, documents):
        feature_matrix = np.zeros((len(documents), len(self.vocabulary)))
        for i, doc in enumerate(documents):
            tokens = doc.split() if isinstance(doc, str) else doc

            unigrams = tokens
            bigrams = self.generate_ngrams(doc) if self.n >= 2 else []
            trigrams = self.generate_ngrams(doc) if self.n >= 3 else []
            ngrams = unigrams + bigrams + trigrams
            for ngram in ngrams:
                if ngram in self.vocabulary:
                    feature_matrix[i, self.vocabulary[ngram]] += 1
        return feature_matrix

In [21]:
class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.feature_probs = defaultdict(lambda: defaultdict(lambda: 0))
        self.classes = []
        self.vocabulary_size = 0

    def fit(self, X, y):
        num_docs = len(y)
        self.classes = np.unique(y)
        class_counts = defaultdict(lambda: 0)
        feature_counts = defaultdict(lambda: defaultdict(lambda: 0))

        self.vocabulary_size = X.shape[1]

        for i, label in enumerate(y):
            class_counts[label] += 1
            row = X[i]
            for j in range(len(row)):
                if row[j] > 0:
                    feature_counts[label][j] += row[j]

        self.class_probs = {cls: count / num_docs for cls, count in class_counts.items()}
        for cls in self.classes:
            total_features = sum(feature_counts[cls].values())
            for feature in range(self.vocabulary_size):
                self.feature_probs[cls][feature] = (feature_counts[cls][feature] + 1) / (total_features + self.vocabulary_size)

    def predict(self, X):
        predictions = []
        for i in range(X.shape[0]):
            row = X[i]
            log_probs = {}
            for cls in self.classes:
                log_prob = log(self.class_probs[cls])
                for j in range(len(row)):
                    if row[j] > 0:
                        log_prob += log(self.feature_probs[cls].get(j, 1 / (self.vocabulary_size + 1))) * row[j]
                log_probs[cls] = log_prob
            predictions.append(max(log_probs, key=log_probs.get))
        return predictions

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_text_stemmed'], df['domain'], test_size=0.3, random_state=42)

In [23]:
ngram_extractor = NgramFeatureExtractor(n=3)
ngram_extractor.fit(X_train)

In [24]:
X_train_ngram = ngram_extractor.transform(X_train)
X_test_ngram = ngram_extractor.transform(X_test)

In [25]:
nb = NaiveBayesClassifier()
nb.fit(X_train_ngram, y_train)
y_pred = nb.predict(X_test_ngram)

In [28]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9464646464646465
Classification Report:
               precision    recall  f1-score   support

       arxiv       0.94      0.94      0.94       324
         jdm       0.99      0.92      0.95       305
        plos       0.92      0.97      0.95       361

    accuracy                           0.95       990
   macro avg       0.95      0.95      0.95       990
weighted avg       0.95      0.95      0.95       990

