### Sentiment Analysis using Novel Text Compression-based Algorithm
A text classifier using gzip and kNN-based algorithm from novel research that allegedly performs better than BERT
Source: https://aclanthology.org/2023.findings-acl.426.pdf

In [40]:
import gzip
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [32]:
# Takes in a train and test set and a k-value to determine the accuracy
def gzip_implementation(training_set, test_set, k):
    correct = 0
    # We have a training_set and test_set, each consisting of an array of (text, label) - Ex. ("I am happy!", "pos")
    for _, x1 in test_set.iterrows():
        Cx1 = len(gzip.compress(x1['post'].encode())) # length of compressed x1
    
        # iterates thru the entire training set and calculates the distance from x1
        distance_from_x1 = [] # list of distances from x1 to every text in training_set
        for _, x2 in training_set.iterrows():
            Cx2 = len(gzip.compress(x2['post']. encode())) # length of compressed x2
            x1x2 = " ".join([x1['post'], x2['post']]) # concatenate
            Cx1x2 = len(gzip.compress(x1x2. encode())) # length of compressed concatenation x1x2
            ncd = (Cx1x2 - min(Cx1,Cx2)) / max(Cx1, Cx2) # Normalized Compression Distance
            distance_from_x1.append(ncd)
        
        sorted_idx = np.argsort(np.array(distance_from_x1)) # sorts distances from low to high
        top_k_class = training_set.iloc[sorted_idx[:k]] # picks all distances within k
        predict_class = top_k_class['sentiment'].mode()

        # if the prediction matches test, it's a correct prediction
        if (predict_class == x1['sentiment']).all():
            correct += 1
        accuracy = correct/len(test_set)
        
    return accuracy

In [38]:
df = pd.read_csv('datasets/1k_data_emoji_tweets_senti_posneg.csv', usecols=['post', 'sentiment'])
train, test = train_test_split(df, test_size=0.2, random_state=1)

start = time.time()
accuracy = gzip_implementation(train, test, k=3)
end = time.time()
time_taken = end - start

print(f"Accuracy: {accuracy}")
print(f"Elapsed time: {time_taken}")

Accuracy: 0.755
Elapsed time: 33.4697630405426
