In [1]:
# Environment setup: install NLTK through the shell and download the NLTK data
# packages requested below.
# NOTE(review): `%pip install nltk` is usually preferred in notebooks because it
# targets the running kernel's environment; consider pinning a version as well.
!python -m pip install nltk
import nltk
# "stopwords" backs the stopwords.words('english') call used during preprocessing.
nltk.download("stopwords")
# "wordnet" / "omw-1.4" are presumably the data needed by WordNetLemmatizer —
# TODO confirm against the installed NLTK version.
nltk.download("wordnet")
nltk.download("omw-1.4")

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to
[nltk_data]     /WAVE/users2/unix/avattuone/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /WAVE/users2/unix/avattuone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /WAVE/users2/unix/avattuone/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import string
import re
import numpy as np
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter
from nltk.util import ngrams

def remove_punctuation(s):
    """
    Return `s` with every character listed in `string.punctuation` deleted.

    All other characters (letters, digits, whitespace, non-ASCII symbols) are
    kept unchanged and in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return ''.join(ch for ch in s if ch not in punctuation_chars)

def add_bigrams_and_trigrams(tokens):
    """
    Extend a token sequence with its bigrams and trigrams.

    Each n-gram is the '_'-joined run of n consecutive tokens, e.g.
    ['a', 'b', 'c'] -> ['a', 'b', 'c', 'a_b', 'b_c', 'a_b_c'].

    :param tokens: iterable of string tokens (list, tuple, generator, ...)
    :return: a new list holding the unigrams, then the bigrams, then the
             trigrams, each group in document order
    """
    # Materialise once: a generator would otherwise be exhausted by the first
    # n-gram pass, and a tuple cannot be concatenated with the n-gram lists.
    tokens = list(tokens)
    bigrams = ['_'.join(pair) for pair in zip(tokens, tokens[1:])]
    trigrams = ['_'.join(triple) for triple in zip(tokens, tokens[1:], tokens[2:])]
    return tokens + bigrams + trigrams

# Lazily-built (stop-word set, stemmer, lemmatizer) shared by all preprocess_line calls.
_PREPROCESS_RESOURCES = None

def _get_preprocess_resources():
    """Create the stop-word set, stemmer and lemmatizer once and reuse them."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        _PREPROCESS_RESOURCES = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
    return _PREPROCESS_RESOURCES

def preprocess_line(line):
    """
    Turn one raw text line into a list of unigram, bigram and trigram features.

    Steps: strip punctuation, split on whitespace, drop English stop words
    (case-insensitively), lower-case, Porter-stem and then WordNet-lemmatize
    each remaining word, and finally append the '_'-joined bigrams and trigrams.

    :param line: raw document text
    :return: list of string features for the document
    """
    # The NLTK resources are identical for every line, so build them only once
    # instead of re-reading the stop-word corpus on each call.
    stop_words, stemmer, lemmatizer = _get_preprocess_resources()

    # NOTE(review): the stop-word test runs after punctuation stripping, so any
    # stop-word entry that contains an apostrophe can no longer match its
    # stripped form — verify whether that is intended.
    tokens = []
    for w in remove_punctuation(line).split():
        lowered = w.lower()
        if lowered in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(stemmer.stem(lowered)))

    return add_bigrams_and_trigrams(tokens)

def build_matrix(docs, idx, is_training):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of word/terms in the document.

    :param docs: list of documents; each document is a list of hashable terms
    :param idx: dict mapping term -> column id. It is updated in place whenever
        the vocabulary is grown. ``None`` is treated as a new empty vocabulary
        (which the caller then has no way to retrieve).
    :param is_training: when True, terms missing from ``idx`` are appended as
        new columns. When False, terms missing from ``idx`` are dropped, unless
        ``idx`` is empty, in which case the vocabulary is built from ``docs``.
    :return: csr_matrix of shape (len(docs), len(idx)), dtype double, holding
        the raw count of each term in each document
    """
    if idx is None:
        idx = {}
    # One flag drives both the vocabulary pass and the counting pass, so the
    # two can never disagree (previously training with a pre-filled idx counted
    # terms that had no column and failed on the lookup).
    grow_vocab = is_training or len(idx) == 0
    nrows = len(docs)

    # First pass: grow the vocabulary (when allowed) and count the non-zeros so
    # the CSR arrays can be allocated exactly once.
    nnz = 0
    for d in docs:
        if grow_vocab:
            for w in d:
                if w not in idx:
                    # assumes existing ids are 0..len(idx)-1, which is what
                    # this function itself produces
                    idx[w] = len(idx)
            nnz += len(set(d))
        else:
            nnz += sum(1 for w in set(d) if w in idx)
    ncols = len(idx)

    # set up memory
    ind = np.zeros(nnz, dtype=int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=int)

    # Second pass: write each document's (column, count) pairs into its row slice.
    n = 0  # non-zero counter
    for i, d in enumerate(docs):
        cnt = Counter(d) if grow_vocab else Counter(w for w in d if w in idx)
        end = n + len(cnt)
        ind[n:end] = np.fromiter((idx[k] for k in cnt), dtype=int, count=len(cnt))
        val[n:end] = np.fromiter(cnt.values(), dtype=np.double, count=len(cnt))
        ptr[i + 1] = end
        n = end

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    # Column ids were written in Counter order; put each row in canonical order.
    mat.sort_indices()

    return mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    :param mat: scipy CSR matrix; modified in place unless ``copy`` is True
    :param copy: if exactly ``True``, normalize and return a copy and leave
        ``mat`` untouched; otherwise normalize ``mat`` in place and return None
    :param kargs: accepted for call compatibility and ignored
    :return: the normalized copy when ``copy`` is True, otherwise None

    Rows whose squared norm is zero (empty rows, or rows holding only explicit
    zeros) are left unchanged.
    """
    if copy is True:
        mat = mat.copy()
    # Scaling integer data in place would truncate every value towards zero,
    # so integer/boolean matrices are switched to double first.
    if mat.data.dtype.kind in 'biu':
        mat.data = mat.data.astype(np.double)

    nrows = mat.shape[0]
    ptr = mat.indptr
    # View over the stored entries addressed by indptr; in-place operations on
    # it write through to mat.data.
    val = mat.data[:ptr[-1]]
    row_nnz = np.diff(ptr)

    # Squared L2 norm per row, summed in storage order like the original loop.
    row_ids = np.repeat(np.arange(nrows), row_nnz)
    sq_norms = np.bincount(row_ids, weights=val * val, minlength=nrows)

    # Scale factor 1/||row||; zero-norm rows keep a factor of 1 (no change).
    scale = np.ones(nrows, dtype=np.double)
    nonzero = sq_norms != 0.0
    scale[nonzero] = 1.0 / np.sqrt(sq_norms[nonzero])
    val *= np.repeat(scale, row_nnz)

    if copy is True:
        return mat

def process_data(fpath, idx=None, is_training=True):
    """
    Read a text file with one document per line and turn it into an
    L2-normalized sparse term-count matrix.

    :param fpath: path of the input file to read
    :param idx: optional dict mapping token -> column id. It is filled in place
                when the vocabulary is built, so pass the same dict to the
                training and test calls to share one column space. With the
                default ``None`` a private vocabulary is used and discarded.
    :param is_training: Boolean flag indicating if the data is for training.
                When True, the first character of each line is taken as its
                label and the text is taken from the third character on.
    :return: ``(matrix, labels)`` when ``is_training`` is True, otherwise just
             the matrix. The matrix is a csr_matrix of per-document token
             counts whose rows have been scaled to unit L2 norm.
    """
    # Processing: read the data from fpath, split labels and features (if
    # training data), run each line through preprocess_line, count token
    # frequencies per document into a csr_matrix, then L2-normalize the rows.

    # The default used to be forwarded as None and failed inside build_matrix.
    if idx is None:
        idx = {}

    labels = []
    docs = []

    with open(fpath, 'r') as file:
        # Stream the file line by line instead of loading it all at once.
        for line in file:
            if is_training:
                # assumes a one-character label followed by a one-character
                # separator — TODO confirm against the data file format
                labels.append(line[0])
                line = line[2:]
            docs.append(preprocess_line(line))

    data = build_matrix(docs, idx, is_training)
    csr_l2normalize(data)  # in place; returns None when copy is not requested

    return (data, labels) if is_training else data
    
# Example usage
# `idx` starts empty so the training call builds the vocabulary into it; the
# same dict is then passed to the test call so both matrices share its columns.
idx = {}
train_data, train_labels = process_data('train.dat', idx=idx, is_training=True)
test_data = process_data('test.dat', idx=idx, is_training=False)
print(f"Training data number of features: {train_data.shape[1]}")
print(f"Test data number of features: {test_data.shape[1]}")
# Sanity check: dot products between test and training rows require equal widths.
assert train_data.shape[1] == test_data.shape[1], "Train and test data do not have the same number of features"

Training data number of features: 3104732
Test data number of features: 3104732


In [3]:
def calc_proximities(train_data, x, k=5):
    """
    Rank training rows by their dot product with the sample `x` and keep the best `k`.

    :param train_data: The training data in csr_matrix format
    :param x: The sample to compare, as a sparse row with the same number of columns as `train_data`
    :param k: Maximum number of (row, proximity) pairs to return
    :return: list of at most `k` `(row_index, proximity)` pairs, taken from the stored entries of the product and ordered by decreasing proximity
    """
    scores = x.dot(train_data.T)
    # Pair each stored column index (a training row id) with its score as plain Python values.
    scored_rows = zip(scores.indices.tolist(), scores.data.tolist())
    ranked = sorted(scored_rows, key=lambda pair: pair[1], reverse=True)
    return ranked[:k]

Top 11 x_proximities out of 5 using proximity function: [(62678, 0.2533954906327426), (16712, 0.22782391365303153), (30610, 0.2231436834155872), (45692, 0.2136327344609569), (97924, 0.20689655172413782)]
Top 11 y_proximities out of 5 using proximity function: [(37958, 0.33456448556986007), (36323, 0.21609716071294321), (85565, 0.2140712823141403), (89430, 0.21200662374236884), (71431, 0.20567663600479233)]
Top 11 z_proximities out of 5 using proximity function: [(25407, 0.5225157910494113), (15788, 0.39478625833506414), (50542, 0.3522360916804568), (23060, 0.3428480345893351), (60195, 0.3346838149756099)]
Top 11 a_proximities out of 5 using proximity function: [(52162, 0.18156825980064084), (14894, 0.17777777777777784), (22178, 0.16169041669088854), (88116, 0.14611900125320162), (90448, 0.1454785934906617)]


In [6]:
def predict_labels(train_data, train_labels, test_data, k=5, out_path="predictions.dat"):
    """
    Classify every row of `test_data` by a proximity-weighted vote of its
    nearest training rows and write one predicted label per line.

    :param train_data: training matrix passed on to calc_proximities
    :param train_labels: sequence of labels, indexed by training row number
    :param test_data: matrix whose rows are the samples to classify
    :param k: number of neighbours requested from calc_proximities
    :param out_path: file that receives the predictions (overwritten)
    :raises ValueError: if a sample has no neighbours and `train_labels` is empty
    """
    fallback_label = None  # most frequent training label, computed on first use

    with open(out_path, "w") as file:
        for row in test_data:
            # Sum the proximity of each neighbour into its label's score.
            votes = {}
            for train_row, proximity in calc_proximities(train_data, row, k=k):
                label = train_labels[train_row]
                votes[label] = votes.get(label, 0.0) + proximity

            if votes:
                # On a tie, the label that was voted for first wins.
                predicted = max(votes, key=votes.get)
            else:
                # A sample sharing no term with any training row has no
                # neighbours; max() on the empty vote table used to raise here.
                if fallback_label is None:
                    if len(train_labels) == 0:
                        raise ValueError("cannot predict: train_labels is empty")
                    fallback_label = Counter(train_labels).most_common(1)[0][0]
                predicted = fallback_label

            file.write(f'{predicted}\n')

In [None]:
# Classify every test document; predict_labels writes its results to
# predictions.dat and returns nothing, so print a marker when it completes.
predict_labels(train_data, train_labels, test_data)
print("Finished")