In [None]:
%load_ext autoreload

%autoreload 2

from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd

from pathlib import Path

from src.string_processing import StringProcessor
from src.context_counter import ContextCounter
# import nltk
# nltk.download('stopwords')
# processor = StringProcessor(
#         EnglishStemmer().stem,
#         stopwords.words("english")
# )

# counter = ContextCounter()

# directory (relative to the working directory) that holds the csv files
data_path = Path("data")


In [None]:


# The contents of "train_context_counts.csv" should look similar to the
# following (example rows are based on the IMDB Dataset):
# word,negative,positive,total
# movi,29416,22491,51907
# film,22487,25661,48148
# one,13555,14103,27658
raw_context_counts = pd.read_csv(data_path.joinpath("train_context_counts.csv"))
def ready_context_counts(context_counts):
    '''
    Calculate some values from the existing data in <context_counts>.

    <context_counts> must have the columns "word", "negative", "positive"
    and "total". The frame that was passed in is left untouched: the
    derived columns are added to a re-indexed copy, so the function can be
    called repeatedly on the same input.

    Returns a new DataFrame indexed by "word" with three extra columns:
        score    -- (positive - negative) / total
        pos_prob -- (total - negative) / total
        weight   -- log2(total / smallest total in the frame + 1)
    '''

    # set_index without inplace returns a new frame, so neither the caller's
    # frame nor a filtered slice of another frame is ever written to
    prepared = context_counts.set_index("word")
    total = prepared["total"]

    prepared["score"] = (prepared["positive"] - prepared["negative"]) / total
    # NOTE(review): this equals positive / total only if
    # total == positive + negative -- confirm against the data
    prepared["pos_prob"] = (total - prepared["negative"]) / total

    # the rarest word in the frame gets weight log2(2) == 1
    prepared["weight"] = np.log2(total / total.min() + 1)
    return prepared

# drop words that were seen at most 1000 times before deriving the
# score columns
context_counts = ready_context_counts(raw_context_counts.query("total > 1000"))

# only take values that have a good "confidence", i.e. an absolute score
# above 0.10
context_counts = context_counts.loc[context_counts["score"].abs().gt(0.10)]

# Read in the context and the processed words for some test data.
# Assumes the test data file starts with a header line that is followed by
# one example per line. Each example has the context first, then the words.
# "context" and "words" are separated by a comma, the words are separated
# by spaces.
test_data = []
with open(data_path / "test_data.csv", encoding="utf-8") as f:

    # skip header line
    next(f, None)

    # iterate over the file lazily instead of loading every line at once
    for row in f:
        row = row.strip()
        # tolerate blank lines (e.g. extra newlines at the end of the file)
        if not row:
            continue
        # split on the first comma only: everything after it belongs to
        # the words field
        context, words = row.split(",", 1)
        test_data.append({"context":context, "words":words.split()})

print(*test_data[:3], sep = "\n")

In [None]:
# show the keyword table that was prepared in the previous cell
context_counts

# Make predictions

In [None]:
def score_text(
        context_counts:pd.DataFrame,
        words:list,
        score_funcs:list
) -> list[float]:
    '''
    Score one text with every function in `score_funcs`.

    Only the rows of `context_counts` whose index label occurs in `words`
    are kept. Each scoring function is then called with that reduced
    frame, and the results are returned in the order of `score_funcs`.
    '''

    known_words = context_counts.filter(items=words, axis="index")

    results = []
    for func in score_funcs:
        results.append(func(known_words))
    return results

def weighted_score(context_counts:pd.DataFrame):
    '''
    Weighted average of the "score" column, using the "weight" column
    as the weights.
    '''

    weight_column = context_counts["weight"]
    weighted_total = (context_counts["score"]*weight_column).sum()

    return weighted_total/weight_column.sum()


def basic_score(context_counts:pd.DataFrame):
    '''
    Sum of the "score" column over all rows of <context_counts>.
    '''

    word_scores = context_counts["score"]
    return word_scores.sum()


def prob_score(context_counts:pd.DataFrame):
    '''
    Plain (unweighted) mean of the "pos_prob" column of <context_counts>.
    '''

    probabilities = context_counts["pos_prob"]
    return probabilities.mean()

def weighted_prob_score(context_counts:pd.DataFrame):
    '''
    Weighted average of the "pos_prob" column, using the "weight" column
    as the weights.
    '''

    weight_column = context_counts["weight"]
    weighted_total = (context_counts["pos_prob"]*weight_column).sum()

    return weighted_total/weight_column.sum()

def positive(context, score):
    '''
    Tell whether the sign of <score> agrees with <context>.

    Returns True when <context> is "negative" and <score> is below zero,
    or when <context> is "positive" and <score> is zero or above.
    Any other combination, including any other context value, gives False.
    '''

    if context == "negative":
        return bool(score < 0)
    if context == "positive":
        return bool(score >= 0)
    return False

def greater_than_half(context, score):
    '''
    Tell whether <score>, compared against the 0.5 threshold, agrees with
    <context>.

    Returns True when <context> is "positive" and <score> is at least 0.5,
    or when <context> is "negative" and <score> is below 0.5.
    Any other combination, including any other context value, gives False.
    '''

    if context == "positive":
        return bool(score >= 0.5)
    return bool(context == "negative" and score < 0.5)

def pn_accuracy(pn_rate):
    '''
    Calculate the accuracy of the model.

    <pn_rate> is indexed with True and False; both entries must offer a
    total() method. The result is the True total divided by the sum of
    the True and False totals.
    '''

    correct = pn_rate[True].total()
    incorrect = pn_rate[False].total()

    return correct/(correct+incorrect)

## Test some different scoring methods

In [None]:
# %%script skip_this_one

score_funcs = [prob_score, weighted_prob_score]
# these compare the score given by the model to the actual, true
# context of the test set data point
prediction_funcs = [greater_than_half, greater_than_half]
used_scores = len(score_funcs)

# Snapshot of the full keyword table. The loop below only slices this
# snapshot into a separate name, so the global `context_counts` is never
# rebound and stays complete even if this cell is interrupted half-way.
saved_context_counts = context_counts.copy()

# a step of at least 1 keeps `range` from raising when the table has
# fewer than 10 rows
words_step = max(1, len(saved_context_counts)//10)
# number of test examples that are scored in every round
max_test_examples = 2000

# test performance with different number of keywords from the training set
training_words_num = []
# one row per round: [number of words, accuracy of each scoring function]
pn_accuracies = []
for j in range(words_step, len(saved_context_counts), words_step):

    training_words_num.append(j)

    print(f"Testing predictions with {j} words")

    # take the first <j> rows of the keyword table
    # NOTE(review): these are the <j> most popular words only if the csv
    # is sorted by descending "total" -- confirm
    context_counts_subset = saved_context_counts.iloc[:j,:]

    pn_rates = [ContextCounter() for _ in range(used_scores)]

    # TODO: test_data could use some class definition for a standardised
    # interface
    for data_point in test_data[:max_test_examples]:

        scores = score_text(
            context_counts_subset,
            data_point["words"],
            score_funcs
        )
        data_point["score"] = scores

        # make predictions, calculate positive-negative -table
        context = data_point["context"]
        data_point["correct_prediction"] = [None]*used_scores
        for i, score in enumerate(scores):

            correct_prediction = prediction_funcs[i](context, score)

            data_point["correct_prediction"][i] = correct_prediction

            pn_rates[i].add(context, [correct_prediction])

    pn_accuracies.append(
        [j] + [pn_accuracy(pn_rate) for pn_rate in pn_rates]
    )

# build the result table once from the collected rows instead of growing
# a DataFrame row by row
pn_df = pd.DataFrame(
    pn_accuracies,
    columns=["training_words_num"] + [fu.__name__ for fu in score_funcs],
)

In [None]:

# Adjacent string literals keep the source indentation out of the title
# text, so neither title line starts with stray spaces.
title = (
    "Accuracy of different scoring functions as a function of the number\n"
    "of training words used in predicting"
)

# the trailing semicolon hides the repr of the returned Axes object
pn_df.plot(
    x="training_words_num",
    y=["prob_score", "weighted_prob_score"],
    title=title,
    xlabel="Number of words used to predict",
    ylabel="Accuracy"
);

In [None]:

# accuracy of prob_score over all test data, using the keyword table
# currently held in `context_counts`
pn_rate = ContextCounter()
for data_point in test_data:
    context = data_point["context"]
    # a single scoring function is passed, so exactly one score comes back
    (score,) = score_text(context_counts, data_point["words"], [prob_score])
    pn_rate.add(context, [greater_than_half(context, score)])

print(pn_accuracy(pn_rate))