In [2]:
def Verify(expression: bool, message: str):
    """Raise ``Exception(message)`` when ``expression`` is falsy.

    Args:
        expression: Condition that is expected to hold.
        message: Text carried by the exception when the condition fails.

    Returns:
        None when ``expression`` is truthy.

    Raises:
        Exception: with ``message`` when ``expression`` is falsy.
    """
    if expression:
        return
    raise Exception(message)

<h1>Task 1.1: IMDB Data loading</h1>

In [3]:
import numpy as np

class IMDBSample:
    """Container for one row of a feature file: its row index, a dense feature vector and a label."""

    def __init__(self, rowIndex, numberOfFeatures=89527):
        """Create an all-zero sample.

        Args:
            rowIndex: Identifier stored unchanged on the instance.
            numberOfFeatures: Length of the dense feature vector (defaults to 89527).
        """
        self.rowIndex = rowIndex
        # One float slot per feature, all starting at zero.
        self.features = np.zeros(shape=numberOfFeatures, dtype=np.float64)
        # -1 is the placeholder until a real label is assigned.
        self.label = -1

class IMDBDataLoader:
    """Read a bag-of-words feature file and a vocabulary file into memory.

    Every non-blank line of the feature file is expected to look like
    ``<rating> <wordIndex>:<count> <wordIndex>:<count> ...``. Parsing runs in
    the constructor, so creating an instance reads both files completely.
    """

    def __init__(self, vocabFilepath, featFilepath):
        """Store the two paths and parse them immediately.

        Args:
            vocabFilepath: Path of the vocabulary file (one entry per line).
            featFilepath: Path of the feature file.

        Raises:
            Exception: via ``Verify`` when a line of the feature file fails validation.
        """
        self.samples = []  # one IMDBSample per non-blank feature line
        self.words = []    # vocabulary lines exactly as read (line endings kept)
        self.vocabFilepath = vocabFilepath
        self.featFilepath = featFilepath
        self.ParseIntoVectors()

    def ParseIntoVectors(self):
        """Fill ``self.samples`` from the feature file and ``self.words`` from the vocabulary file.

        Raises:
            Exception: via ``Verify`` when a rating is neutral (5 or 6), a word
                index is negative or not smaller than the feature-vector length,
                or a count is negative.
        """
        with open(self.featFilepath, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=0):
                parts = line.split()
                if not parts:
                    # Blank line (for example a trailing empty line): nothing to
                    # parse. Previously this crashed with IndexError on parts[0].
                    continue

                rating = int(parts[0])
                # The former check (`> 6 or < 6`) only rejected a rating of 6.
                # Per the dataset README, labelled reviews are rated <= 4 or >= 7,
                # so 5 is equally unexpected.
                Verify(rating <= 4 or rating >= 7, "Error: Rating value unexpected in IMDB dataloader.")

                # The file line number is kept as the row index, so it still
                # points at the source line even when blank lines were skipped.
                currentSample = IMDBSample(line_number)
                currentSample.label = rating
                featureCount = currentSample.features.size

                for part in parts[1:]:
                    wordIndex, frequency = map(int, part.split(':'))
                    # A negative index passes the upper-bound check but would
                    # silently write to the END of the vector.
                    Verify(wordIndex >= 0, "Word index is negative in IMDB dataloader.")
                    Verify(wordIndex < featureCount, "Word index larger than number of features in IMDB dataloader")
                    Verify(frequency >= 0, "Word Frequency is smaller than expected in IMDB dataloader.")
                    currentSample.features[wordIndex] = frequency
                self.samples.append(currentSample)

        with open(self.vocabFilepath, 'r', encoding='utf-8') as file:
            # Lines are stored verbatim, including their line ending; consumers
            # strip them when needed.
            self.words.extend(file)

    def GetData(self):
        """Stack all parsed samples into dense arrays.

        Returns:
            A ``(y, X)`` tuple, labels FIRST: ``y`` has shape ``(n_samples,)``
            and ``X`` has shape ``(n_samples, n_features)``. With no samples the
            shapes are ``(0,)`` and ``(0, 0)``.
        """
        numberOfSamples = len(self.samples)
        numberOfFeatures = self.samples[0].features.size if self.samples else 0

        X = np.zeros((numberOfSamples, numberOfFeatures))
        y = np.zeros(numberOfSamples)

        for i, sample in enumerate(self.samples):
            X[i, :] = sample.features
            y[i] = sample.label
        return y, X

    def GetWords(self):
        """Return the list of vocabulary lines read from the vocabulary file."""
        return self.words



<h3>Data loading</h3>

In [4]:
# Input locations, relative to the notebook's working directory.
vocabFilepath = '../aclImdb/imdb.vocab'
trainfeatFilepath, testfeatFilepath = (
    '../aclImdb/train/labeledBow.feat',
    '../aclImdb/test/labeledBow.feat',
)

# The loader parses inside its constructor, so both files are read right here.
dataloaderTrain = IMDBDataLoader(vocabFilepath=vocabFilepath, featFilepath=trainfeatFilepath)
dataloaderTest = IMDBDataLoader(vocabFilepath=vocabFilepath, featFilepath=testfeatFilepath)
print("Dataloading complete")

# GetData returns the labels first and the feature matrix second.
(y_train, X_train), (y_test, X_test) = dataloaderTrain.GetData(), dataloaderTest.GetData()

# Both loaders were given the same vocabulary file; the training loader's copy is used.
words = dataloaderTrain.GetWords()


Dataloading complete


<h3>Data filtering</h3>

In [11]:
# Keep only the features that occur in strictly more than 1% and strictly fewer
# than 50% of the training rows.
onePercentThreshold = int(y_train.size*0.01)
fiftyPercentThreshold = int(y_train.size*0.5)
featureFrequencies = np.sum(X_train > 0, axis=0)  # per feature: number of training rows with a non-zero count

featuresToKeep = (featureFrequencies > onePercentThreshold) & (featureFrequencies < fiftyPercentThreshold)
featureFrequenciesFiltered = featureFrequencies[featuresToKeep]
XFiltered_train = X_train[:, featuresToKeep]
XFiltered_test = X_test[:, featuresToKeep]
wordsFiltered = [word for word, keep in zip(words, featuresToKeep) if keep]

# OLS (minimum sum of squared errors) solution.
# The previous form, inv(X.T @ X) @ X.T @ y, explicitly inverted the Gram matrix,
# which is numerically fragile, and evaluated left to right so that a full
# (features x rows) intermediate was built before multiplying by y.
# np.linalg.lstsq minimises ||X w - y||^2 directly and also copes with a
# rank-deficient X. No intercept column is added, matching the original model.
weights = np.linalg.lstsq(XFiltered_train, y_train, rcond=None)[0]

# Rank features by absolute weight and keep the D_Selected largest.
D_Selected = 1000
feature_importances = np.abs(weights)  # np.abs already returns a new array
top_features_indices = np.argsort(feature_importances)[-D_Selected:]  # ascending by |weight|
XFiltered_train_selected = XFiltered_train[:, top_features_indices]
XFiltered_test_selected = XFiltered_test[:, top_features_indices]
featureFrequenciesFiltered_Selected = featureFrequenciesFiltered[top_features_indices]
wordsFiltered_selected = [wordsFiltered[i] for i in top_features_indices]



<h3>Displaying Top Words (Task 3.1)</h3>

In [21]:
numberOfWordsToDisplay = 10  # how many words to list at each end of the weight range

# Sort once in ascending weight order; both ends are slices of that ordering.
ascending_weight_order = np.argsort(weights)
positive_weights_indices = ascending_weight_order[-numberOfWordsToDisplay:]
negative_weights_indices = ascending_weight_order[:numberOfWordsToDisplay]


def _word_summary(index):
    """Return (stripped word, weight, frequency) for one filtered-feature index."""
    return (wordsFiltered[index].strip(), weights[index], featureFrequenciesFiltered[index])


words_with_largest_positive_weights = list(map(_word_summary, positive_weights_indices))
words_with_largest_negative_weights = list(map(_word_summary, negative_weights_indices))

print(f"Words with the {numberOfWordsToDisplay} largest positive weights:")
# The list is in ascending order, so walk it backwards to show the largest weight first.
for word, weight, frequency in words_with_largest_positive_weights[::-1]:
    print(f"Word: '{word}', Weight: {weight:.5f}, Frequency: {frequency}")

print("\n")

print(f"Words with the {numberOfWordsToDisplay} largest negative weights:")
# Already ordered from the most negative weight upwards.
for word, weight, frequency in words_with_largest_negative_weights:
    print(f"Word: '{word}', Weight: {weight:.5f}, Frequency: {frequency}")

Words with the 10 largest positive weights:
Word: 'recommended', Weight: 1.53338, Frequency: 484
Word: 'funniest', Weight: 1.27472, Frequency: 330
Word: 'superb', Weight: 1.24865, Frequency: 621
Word: 'wonderfully', Weight: 1.21076, Frequency: 311
Word: 'available', Weight: 1.03583, Frequency: 364
Word: 'excellent', Weight: 1.01886, Frequency: 1778
Word: 'loved', Weight: 0.98732, Frequency: 1232
Word: 'tears', Weight: 0.97909, Frequency: 303
Word: 'fascinating', Weight: 0.95847, Frequency: 362
Word: 'enjoyed', Weight: 0.94308, Frequency: 1141


Words with the 10 largest negative weights:
Word: 'numerous', Weight: -0.92684, Frequency: 256
Word: 'honestly', Weight: -0.83400, Frequency: 423
Word: 'plus', Weight: -0.59289, Frequency: 562
Word: 'allow', Weight: -0.57192, Frequency: 296
Word: 'review', Weight: -0.56384, Frequency: 751
Word: 'amount', Weight: -0.56081, Frequency: 463
Word: 'worst', Weight: -0.53833, Frequency: 2261
Word: 'prove', Weight: -0.52345, Frequency: 252
Word: 'seemin