In [1]:
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import operator
import nltk
import math
from scipy.stats import norm
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
def load_ordinal_data(filename, ordering):
    """Load a tab-separated ordinal dataset and binarize its labels per threshold.

    Each non-blank line of ``filename`` is expected to hold at least three
    tab-separated columns: an id (ignored), a label, and the text.

    Args:
        filename: Path of the UTF-8 encoded, tab-separated file to read.
        ordering: Sequence of label strings, lowest to highest.

    Returns:
        A tuple ``(X, Y, orig_Y)`` where
        * ``X`` is the list of texts (third column, trailing newline kept),
        * ``Y`` is a list of ``len(ordering)`` lists; ``Y[i][j]`` is 1 when the
          label of example ``j`` sits strictly above position ``i`` in
          ``ordering`` and 0 otherwise (so the last list is always all zeros),
        * ``orig_Y`` is the list of stripped label strings.

    Raises:
        ValueError: If a label is not present in ``ordering``.
        IndexError: If a non-blank line has fewer than three columns.
    """
    X = []
    orig_Y = []
    Y = [[] for _ in ordering]

    # First-occurrence position of every label, matching list.index semantics
    # while avoiding a linear scan for every row.
    rank_of = {}
    for position, value in enumerate(ordering):
        rank_of.setdefault(value, position)

    with open(filename, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines (typically a trailing newline at end of file) carry
            # no example; skip them instead of failing on the missing columns.
            if not line.strip():
                continue

            cols = line.split("\t")
            label = cols[1].strip()
            text = cols[2]

            index = rank_of.get(label)
            if index is None:
                raise ValueError(
                    "%s, line %d: label %r is not in ordering %r"
                    % (filename, line_number, label, list(ordering))
                )

            X.append(text)
            for i, column in enumerate(Y):
                column.append(1 if index > i else 0)
            orig_Y.append(label)

    return X, Y, orig_Y



#### Default BoW Ordinal Classifier

In [3]:
class OrdinalClassifier:
    """Ordinal classifier built from one binary logistic regression per threshold.

    For ``K`` ordinal values, ``K - 1`` binary models are fitted; model ``i`` is
    trained on ``trainY[i]``. With labels produced by ``load_ordinal_data`` in
    this file, ``trainY[i][j]`` is 1 when example ``j`` ranks above position
    ``i``, so model ``i`` estimates P(label > value_i).

    Args:
        ordinal_values: Label strings ordered from lowest to highest.
        feature_method: Callable mapping one text to a ``{feature: value}`` dict.
        trainX, devX, testX: Lists of raw texts for each split.
        trainY, devY, testY: Per-threshold binary label lists for each split.
        orig_trainY, orig_devY, orig_testY: Original label strings per split.
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # Features seen in fewer training documents than this are dropped.
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The training split must be processed first: it builds the vocabulary
        # that the dev and test splits are projected onto.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of feature dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize ``X_data`` and return it as a sparse document-by-feature matrix.

        Args:
            X_data: Iterable of raw texts.
            training: When True, (re)build ``feature_vocab`` from this data,
                keeping features present in at least ``min_feature_count``
                documents.

        Returns:
            A ``scipy.sparse.dok_matrix`` of shape ``(len(X_data), len(vocab))``;
            features outside the vocabulary are ignored.
        """
        data = self.featurize(X_data)

        if training:
            # Each feature dict has unique keys, so one increment per key per
            # document yields a document count.
            feature_doc_count = Counter()
            for feats in data:
                for feat in feats:
                    feature_doc_count[feat] += 1

            fid = 0
            for feat, doc_count in feature_doc_count.items():
                if doc_count >= self.min_feature_count:
                    self.feature_vocab[feat] = fid
                    fid += 1

        X = sparse.dok_matrix((len(data), len(self.feature_vocab)))
        for idx, feats in enumerate(data):
            for feat, value in feats.items():
                column = self.feature_vocab.get(feat)
                if column is not None:
                    X[idx, column] = value

        return X

    def train(self):
        """Fit one logistic regression per threshold, choosing ``C`` on the dev split.

        For every threshold the candidate with the highest dev accuracy is kept
        in ``log_regs``; ties keep the earlier (smaller) ``C``.
        """
        for idx in range(len(self.ordinal_values) - 1):
            best_dev_accuracy = None
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Always accept the first candidate so a model is stored even
                # when every candidate scores 0.0 on the dev split; otherwise
                # test() would fail on a None entry.
                if best_model is None or development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model

    def test(self):
        """Return the accuracy of the combined ordinal prediction on the test split.

        The per-threshold probabilities ``p_i`` (column 1 of ``predict_proba``)
        are turned into one score per ordinal value: ``1 - p_0`` for the lowest,
        ``p_{k-1} - p_k`` for the middle ones, and ``p_{K-2}`` for the highest.
        The arg-max score is the prediction.
        """
        # Shape (K - 1, n_examples): row i holds model i's positive-class probability.
        exceed = np.array([model.predict_proba(self.testX)[:, 1] for model in self.log_regs])

        class_scores = np.zeros((exceed.shape[1], len(self.ordinal_values)))
        class_scores[:, 0] = 1 - exceed[0]
        class_scores[:, 1:-1] = (exceed[:-1] - exceed[1:]).T
        class_scores[:, -1] = exceed[-1]

        predictions = np.argmax(class_scores, axis=1)

        cor = 0
        for data_point, prediction in enumerate(predictions):
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor += 1

        return cor / len(predictions)

#### Modified TF-IDF Ordinal Classifier

In [4]:
class OrdinalClassifierTFIDF:
    """Ordinal classifier over TF-IDF weighted features.

    Works like a stack of ``K - 1`` binary logistic regressions (one per
    threshold between the ``K`` ordinal values), but every feature value
    returned by ``feature_method`` is treated as a term frequency and multiplied
    by an inverse document frequency learned from the training split.

    Args:
        ordinal_values: Label strings ordered from lowest to highest.
        feature_method: Callable mapping one text to a ``{feature: tf}`` dict.
        trainX, devX, testX: Lists of raw texts for each split.
        trainY, devY, testY: Per-threshold binary label lists for each split.
        orig_trainY, orig_devY, orig_testY: Original label strings per split.
    """

    def __init__(self, ordinal_values, feature_method, trainX, trainY, devX, devY, testX, testY, orig_trainY, orig_devY, orig_testY):
        self.ordinal_values=ordinal_values
        self.feature_vocab = {}
        self.feature_method = feature_method
        # Features seen in fewer training documents than this are dropped.
        self.min_feature_count=2
        self.log_regs = [None]* (len(self.ordinal_values)-1)

        # --- Added for TF-IDF ---
        self.doc_freq = Counter()   # feature -> number of training documents containing it
        self.idf_values = {}        # feature -> inverse document frequency
        self.N_train = 0            # number of training documents

        self.trainY=trainY
        self.devY=devY
        self.testY=testY

        self.orig_trainY=orig_trainY
        self.orig_devY=orig_devY
        self.orig_testY=orig_testY

        # The training split must be processed first: it builds the vocabulary
        # and IDF table that the dev and test splits are projected onto.
        self.trainX = self.process(trainX, training=True)
        self.devX = self.process(devX, training=False)
        self.testX = self.process(testX, training=False)

    def featurize(self, data):
        """Apply ``feature_method`` to every text and return the list of feature dicts."""
        return [self.feature_method(text) for text in data]

    def process(self, X_data, training = False):
        """Featurize ``X_data`` and return a sparse TF-IDF document-by-feature matrix.

        Args:
            X_data: Iterable of raw texts.
            training: When True, compute document frequencies, the vocabulary
                (features in at least ``min_feature_count`` documents) and the
                IDF table from this data.

        Returns:
            A ``scipy.sparse.dok_matrix`` of shape ``(len(X_data), len(vocab))``
            whose entries are ``tf * idf``; out-of-vocabulary features are ignored.
        """
        data = self.featurize(X_data)

        if training:
            self.N_train = len(data) # Store total number of training documents
            fid = 0
            # --- Calculate Document Frequency (DF) ---
            # Iterate the dict directly (keys are already unique) rather than a
            # set of its keys: set order depends on string hash randomisation,
            # which made feature ids differ from one run to the next.
            for feats in data:
                for feat in feats:
                    self.doc_freq[feat] += 1

            # --- Filter features based on DF and build vocab ---
            for feat, doc_count in self.doc_freq.items():
                if doc_count >= self.min_feature_count and feat not in self.feature_vocab:
                    self.feature_vocab[feat] = fid
                    fid += 1

            # --- Calculate IDF ---
            # idf = log(N / (df + 1)). Note this is zero for a feature present
            # in N - 1 documents and negative for one present in all N.
            for feat in self.feature_vocab:
                self.idf_values[feat] = math.log(self.N_train / (self.doc_freq[feat] + 1))

        X = sparse.dok_matrix((len(data), len(self.feature_vocab)))
        for idx, feats in enumerate(data):
            for feat, tf in feats.items():
                column = self.feature_vocab.get(feat)
                if column is not None:
                    # --- Apply TF-IDF weight ---
                    X[idx, column] = tf * self.idf_values.get(feat, 0)

        return X

    def train(self):
        """Fit one logistic regression per threshold, choosing ``C`` on the dev split.

        For every threshold the candidate with the highest dev accuracy is kept
        in ``log_regs``; ties keep the earlier (smaller) ``C``.
        """
        for idx in range(len(self.ordinal_values) - 1):
            best_dev_accuracy = None
            best_model = None
            for C in [0.1, 1, 10, 100]:
                log_reg = linear_model.LogisticRegression(C = C, max_iter=1000)
                log_reg.fit(self.trainX, self.trainY[idx])
                development_accuracy = log_reg.score(self.devX, self.devY[idx])
                # Always accept the first candidate so a model is stored even
                # when every candidate scores 0.0 on the dev split; otherwise
                # test() would fail on a None entry.
                if best_model is None or development_accuracy > best_dev_accuracy:
                    best_dev_accuracy = development_accuracy
                    best_model = log_reg

            self.log_regs[idx] = best_model

    def test(self):
        """Return the accuracy of the combined ordinal prediction on the test split.

        The per-threshold probabilities ``p_i`` (column 1 of ``predict_proba``)
        are turned into one score per ordinal value: ``1 - p_0`` for the lowest,
        ``p_{k-1} - p_k`` for the middle ones, and ``p_{K-2}`` for the highest.
        The arg-max score is the prediction.
        """
        # Shape (K - 1, n_examples): row i holds model i's positive-class probability.
        exceed = np.array([model.predict_proba(self.testX)[:, 1] for model in self.log_regs])

        class_scores = np.zeros((exceed.shape[1], len(self.ordinal_values)))
        class_scores[:, 0] = 1 - exceed[0]
        class_scores[:, 1:-1] = (exceed[:-1] - exceed[1:]).T
        class_scores[:, -1] = exceed[-1]

        predictions = np.argmax(class_scores, axis=1)

        cor = 0
        for data_point, prediction in enumerate(predictions):
            if prediction == self.ordinal_values.index(self.orig_testY[data_point]):
                cor += 1

        return cor / len(predictions)


In [5]:
def binary_bow_featurize(text):
    """Return a binary bag-of-words dict for ``text``.

    The text is tokenized with ``nltk.word_tokenize``; every token is
    lower-cased and mapped to 1, so repeated words collapse to a single entry.
    """
    return {token.lower(): 1 for token in nltk.word_tokenize(text)}

In [6]:
from collections import Counter

def tfidf_featurize(text):
    """Return the relative term frequency of every token in ``text``.

    The lower-cased text is tokenized with ``nltk.word_tokenize``; each distinct
    token maps to its count divided by the total number of tokens. Despite the
    name, only the TF part is computed here; no IDF weighting is applied in
    this function.
    """
    tokens = nltk.word_tokenize(text.lower())
    occurrences = Counter(tokens)
    token_total = sum(occurrences.values())
    return {token: count / token_total for token, count in occurrences.items()}


In [7]:
def confidence_intervals(accuracy, n, significance_level):
    """Return the normal-approximation interval ``(lower, upper)`` around ``accuracy``.

    Args:
        accuracy: Observed proportion of correct predictions.
        n: Number of evaluated examples.
        significance_level: Used as the two-sided coverage of the interval,
            e.g. 0.95 puts ``(1 - 0.95) / 2`` of the mass in each tail.

    Returns:
        ``(accuracy - z * se, accuracy + z * se)`` with
        ``se = sqrt(accuracy * (1 - accuracy) / n)``. The bounds are not
        clipped to [0, 1].
    """
    tail_mass = (1 - significance_level) / 2
    z_score = -1 * norm.ppf(tail_mass)
    std_err = math.sqrt((accuracy * (1 - accuracy)) / n)
    margin = std_err * z_score
    return accuracy - margin, accuracy + margin

In [8]:
def run(trainingFile, devFile, testFile, ordinal_values, featurizer): # added featurizer
    """Train and evaluate an ordinal classifier, then print test accuracy with 95% CIs.

    Args:
        trainingFile, devFile, testFile: Paths of the tab-separated split files.
        ordinal_values: Label strings ordered from lowest to highest.
        featurizer: Callable turning a text into a feature dict. Passing
            ``tfidf_featurize`` selects ``OrdinalClassifierTFIDF``; anything
            else selects ``OrdinalClassifier``.
    """
    # Load the three splits in train, dev, test order.
    loaded = [
        load_ordinal_data(path, ordinal_values)
        for path in (trainingFile, devFile, testFile)
    ]
    (trainX, trainY, orig_trainY), (devX, devY, orig_devY), (testX, testY, orig_testY) = loaded

    # --- Select Classifier based on featurizer ---
    classifier_type = OrdinalClassifierTFIDF if featurizer == tfidf_featurize else OrdinalClassifier
    classifier = classifier_type(
        ordinal_values,
        featurizer,
        trainX, trainY,
        devX, devY,
        testX, testY,
        orig_trainY, orig_devY, orig_testY
    )

    classifier.train()
    accuracy = classifier.test()

    # Every per-threshold label list has one entry per test example.
    test_size = len(testY[0])
    lower, upper = confidence_intervals(accuracy, test_size, 0.95)
    print("Test accuracy for best dev model: %.3f, 95%% CIs: [%.3f %.3f]" % (accuracy, lower, upper))



In [9]:
# Tab-separated train/dev/test splits, relative to the working directory.
trainingFile = "splits/train.txt"
devFile = "splits/dev.txt"
testFile = "splits/test.txt"

# Rating labels in ascending order, "0" through "10".
ordinal_values = [str(rating) for rating in range(11)]

# Baseline: binary bag-of-words features.
print("Running with Bag of Words:")
run(trainingFile, devFile, testFile, ordinal_values, featurizer=binary_bow_featurize)

# Term-frequency features; run() pairs this featurizer with the TF-IDF classifier.
print("\nRunning with TF:")
run(trainingFile, devFile, testFile, ordinal_values, featurizer=tfidf_featurize)


Running with Bag of Words:
Test accuracy for best dev model: 0.240, 95% CIs: [0.156 0.324]

Running with TF:
Test accuracy for best dev model: 0.280, 95% CIs: [0.192 0.368]


Default Bag of Words Accuracy: `0.240, 95% CIs: [0.156 0.324]`

TF-IDF Accuracy: `0.280, 95% CIs: [0.192 0.368]`