# Coursemaker Trial Notebook

The 100 most used words in a language make up approximately 50% of the words in a corpus (this of course varies from language to language; e.g. it is lower in agglutinative languages). If you sort words by how often they are used and start learning words from this list (frequency sorted list, fsl), you can effectively increase your hit rate in the corpus.

I want to sort sentences in a similar manner. The first sentence of the proposed order shall ideally contain the first n words in the fsl. Any sentence can contain any number of words from previous sentences, and tries to incorporate any number of the next m most used words.

The 'vocabulary expansion per words studied' curve is ideally the same as the curve given above and it can theoretically (and practically) never exceed this curve. To achieve this, no mth sentence shall use the n+1th word in the sorted list if the first m sentences don't contain any of the first n words.

In [None]:
import numpy as np
import pandas as pd
import re
import string
import matplotlib.pyplot as plt
import wordcloud

In [None]:
def readfile(path):
    """Return the entire contents of the text file at ``path`` as one string.

    The file is opened in text mode with the default encoding. The context
    manager closes the handle even if reading raises, so no file descriptor
    is left open.
    """
    with open(path, 'r') as f:
        return f.read()

In [None]:
# normalize a string
def normalize(s, case_folding=True, stopword_removal=True, punctuation_removal=True, newline_removal=True, punctuation_whitelist=()):
    """Return a cleaned-up copy of ``s``, or ``None`` when ``s`` is empty.

    Steps, in order:
      1. lowercase the text (``case_folding``);
      2. replace short HTML entities such as ``&lt;`` with a space and drop tabs;
      3. replace every punctuation character that is not listed in
         ``punctuation_whitelist`` with a space (``punctuation_removal``);
      4. replace newlines with a space (``newline_removal``);
      5. remove the stop words listed one per line in ``stopwords.txt``,
         read from the current working directory (``stopword_removal``);
      6. collapse runs of spaces into a single space.

    Leading/trailing spaces are not stripped.
    """

    if not s:
        return None

    # set lowercase
    if case_folding:
        s = s.lower()

    # remove custom html characters and tabs
    s = re.sub(r"&[a-z]{1,3};", " ", s)
    s = s.replace('\t', '')

    # replace punctuation marks with a blank space
    if punctuation_removal:
        for character in list(string.punctuation) + ['”', '“']:
            if character not in punctuation_whitelist:
                s = s.replace(character, ' ')

    # remove newline characters ('\n')
    if newline_removal:
        s = s.replace('\n', ' ')

    # remove stop words given in stopwords.txt from the string
    if stopword_removal:
        with open('stopwords.txt') as b:
            # rstrip (rather than line[:-1]) keeps the last word intact when
            # the file does not end with a newline
            stop_words = [line.rstrip('\n') for line in b]

        for word in stop_words:
            if not word:
                # a blank line would turn into a pattern matching bare spaces
                continue

            # escape so a stop word containing regex metacharacters is
            # matched literally
            pattern = re.escape(word)
            s = re.sub(r" {} ".format(pattern), " ", s)
            s = re.sub(r"^{} ".format(pattern), " ", s)
            s = re.sub(r" {}$".format(pattern), " ", s)

    # shorten multiple blank spaces into one
    s = re.sub(r" +", " ", s)

    return s


## Parse the dataset 
I chose A Tale of Two Cities as the corpus for my experiments. I normalized the corpus and divided it into sentences. I extracted 7481 sentences, which contain 9940 unique words.

In [None]:
# Load the whole corpus file into memory as a single string.
corpus = readfile('data/twocities.corpus')

In [None]:
# Clean the text but keep the sentence terminators and apostrophes, which
# the sentence splitting further down relies on.
corpus = normalize(
    corpus,
    stopword_removal=False,
    punctuation_whitelist=['!', '.', '?', "'"],
)

# Drop the period of these two titles so it is not taken for a sentence end.
corpus = corpus.replace('mrs.', 'mrs').replace('mr.', 'mr')

In [None]:
# Everything before this phrase is skipped.
start      = 'the footsteps die out for'
startindex = corpus.index(start)

# Slice once, then derive both sentences and words from the same text.
# `startindex` is an offset into `corpus`; applying it to a re-normalised
# string (which is shorter, because punctuation and doubled spaces are gone)
# would cut at the wrong place.
body       = corpus[startindex:]

# split on sentence terminators (raw string: no invalid escape sequences)
sentences  = re.split(r'[.?!]', body)

# strip the remaining punctuation and keep only tokens that start with a letter
words      = normalize(body, stopword_removal=False).split(' ')
words      = [ word for word in words if re.match('[a-z]+', word) ]

## Preprocessing 
All words are counted and sorted according to their frequencies/counts.

In [None]:
# Distinct tokens of `words` (sorted) and the number of occurrences of each.
unique_words, counts_words = np.unique(words, return_counts=True)

In [None]:
# One [word, count] pair per distinct word.
frequencies = [[word, count] for word, count in zip(unique_words, counts_words)]

In [None]:
# Most frequent word first; the sort is stable, so ties keep their order.
frequencies.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Turn the list of [word, count] pairs into a 2-D array: column 0 holds the
# words, column 1 the counts. Later cells cast column 1 back to a numeric type
# with astype(...) before doing arithmetic on it.
frequencies = np.array(frequencies)

In [None]:
# Running total of the counts, in descending-frequency order.
freqs_cumulative = []
wcount = 0
for freq in frequencies[:, 1].astype(int):
    wcount += freq
    freqs_cumulative.append(wcount)

# `wcount` now holds the total number of counted tokens; dividing by it turns
# the running totals into the cumulative share of the corpus.
freqs_cumulative = np.array(freqs_cumulative) / wcount

In [None]:
# Plot the cumulative share of corpus tokens covered (y) against the number of
# most-frequent words known (x).
fig, axs = plt.subplots(1,1, figsize=(16,9))

axs.plot(freqs_cumulative)

## Let's take a look at the words used in our corpus

In [None]:
# Build a word cloud from the normalised corpus text and show it as an image.
wc = wordcloud.WordCloud(background_color='white', width=1000, height=500)
plt.figure(figsize=(19,9))
plt.imshow(wc.generate(corpus))

In [None]:
def sentences_to_feature(sentences, feature, frequencies):
    """Map every sentence to a list of per-word features.

    Parameters
    ----------
    sentences : iterable of str
        Sentences whose words are separated by single spaces.
    feature : {'orders', 'frequencies'}
        'orders' yields, for each known word, its row index in
        ``frequencies``; 'frequencies' yields that row's count divided by
        the module-level total ``wcount``.
    frequencies : 2-D array
        Column 0 holds the words, column 1 their counts.

    Returns
    -------
    list of list
        One inner list per sentence, in word order. Words that do not occur
        in column 0 of ``frequencies`` are skipped.

    Raises
    ------
    ValueError
        If ``feature`` is not one of the two supported names.
    """
    if feature not in ('orders', 'frequencies'):
        raise ValueError("unknown feature: {!r}".format(feature))

    # Word -> row index, built once. A dict lookup replaces the
    # `in list` + `list.index` pair that scanned the whole vocabulary for
    # every token. setdefault keeps the first row for a repeated word, which
    # is what list.index returned.
    rank_of = {}
    for rank, word in enumerate(list(frequencies[:,0])):
        rank_of.setdefault(word, rank)

    if feature == 'orders':
        return [
            [rank_of[word] for word in sentence.split(' ') if word in rank_of]
            for sentence in sentences
        ]

    # feature == 'frequencies': share of all corpus tokens for each word
    freqs = np.array(frequencies[:,1]).astype(float)/wcount
    return [
        [freqs[rank_of[word]] for word in sentence.split(' ') if word in rank_of]
        for sentence in sentences
    ]

In [None]:
# For every sentence: the frequency-list row index of each known word, and
# the matching relative frequency of each known word.
sentence_orders      = sentences_to_feature(list(sentences), 'orders', frequencies)
sentence_frequencies = sentences_to_feature(list(sentences), 'frequencies', frequencies)

# Notebook display: the indices of the first sentence next to its text.
sentence_orders[0], sentences[0]

In [None]:
#deprecated
def get_ideal_index(sentences, frequencies):
    """Return the index of the sentence that is fully covered soonest.

    Words are considered in the row order of ``frequencies`` (column 0). A
    sentence is "covered" once all of its known words have been reached, so
    the winner is the sentence whose last-reached word has the smallest row
    index. On a tie the earliest sentence wins. Words not present in
    ``frequencies`` are ignored.

    The returned index refers to the ``sentences`` argument itself, so the
    caller can use it directly (e.g. ``sentences.pop(index)``). Returns -1
    when no sentence contains a known word.
    """
    # word -> row index (first row wins if a word is repeated)
    rank_of = {}
    for rank, word in enumerate(np.asarray(frequencies)[:,0]):
        rank_of.setdefault(word, rank)

    best_index = -1
    best_rank  = None

    for index, sentence in enumerate(sentences):
        ranks = [ rank_of[token] for token in sentence.split(' ') if token in rank_of ]
        if not ranks:
            # nothing known in this sentence: it can never be chosen
            continue

        # the sentence is covered only once its latest-ranked word is reached
        last_needed = max(ranks)
        if best_rank is None or last_needed < best_rank:
            best_index, best_rank = index, last_needed

    return best_index
                    
    

In [None]:
#deprecated
def get_ideal_index_max(sentences, frequencies):
    """Return the index of the sentence with the largest summed word frequency.

    Every distinct word of a sentence is counted once. Its relative frequency
    is looked up via ``sentences_to_feature``, and the sentence with the
    highest total wins (first one on a tie, as with ``np.argmax``).
    """
    # dict.fromkeys removes repeated words while keeping their order, so the
    # summation order, and with it the floating-point totals, is the same on
    # every run (iteration order of a set of strings is not).
    xsentences = [ ' '.join(dict.fromkeys(sentence.split(' '))) for sentence in sentences ]
    xsentence_frequencies = sentences_to_feature(xsentences, 'frequencies', frequencies)

    sums = [ np.sum(xsf) for xsf in xsentence_frequencies ]

    return np.argmax(sums)
    
    

In [None]:
def get_ideal_index_max_avg(sentences, frequencies):
    """Return the index of the sentence with the highest mean word frequency.

    Every distinct known word of a sentence is counted once. Its relative
    frequency is looked up via ``sentences_to_feature``, and the per-sentence
    mean is compared. A sentence without any known word scores 0. The first
    sentence wins on a tie, as with ``np.argmax``.
    """
    # dict.fromkeys removes repeated words while keeping their order, which
    # keeps the floating-point sums identical from run to run.
    # Unknown words need no separate filtering here: sentences_to_feature
    # already skips every word that is not in `frequencies`.
    xsentences = [ ' '.join(dict.fromkeys(sentence.split(' '))) for sentence in sentences ]
    xsentence_frequencies = sentences_to_feature(xsentences, 'frequencies', frequencies)

    sums = [ np.sum(xsf)/len(xsf) if xsf else 0 for xsf in xsentence_frequencies ]

    return np.argmax(sums)
    
    

In [None]:
def get_in_order(sentences, frequencies, sentence_count, metric="vanilla"):
    """Pick up to ``sentence_count`` sentences one at a time and track the gain.

    Parameters
    ----------
    sentences : list of str
        Candidate sentences. Duplicates and sentences of five characters or
        fewer are dropped.
    frequencies : 2-D array
        Column 0 holds the words, column 1 their counts.
    sentence_count : int
        Maximum number of sentences to pick.
    metric : {'vanilla', 'def', 'max', 'max-avg'}
        How the next sentence is chosen: 'vanilla' takes the second remaining
        sentence (or the only one left); the others delegate to
        ``get_ideal_index``, ``get_ideal_index_max`` and
        ``get_ideal_index_max_avg``.

    Returns
    -------
    (list of str, list of float) or None
        The chosen sentences in order, and the cumulative percentage of the
        corpus covered after each pick (starting with 0). ``None`` is
        returned, after printing a message, for an unknown metric or when the
        metric reports that no sentence can be chosen.

    Progress is printed for each pick and the learning curve is plotted.
    """

    sentences_ordered = []

    # dict.fromkeys removes duplicates but keeps the input order, so results
    # are reproducible (iteration order of a set of strings changes between
    # interpreter runs).
    remaining_sentences   = [ sentence for sentence in dict.fromkeys(sentences) if len(sentence)>5 ]
    remaining_frequencies = np.copy(frequencies)

    print(len(remaining_sentences), len(remaining_frequencies))

    cumulative_return = 0

    vocab = set()           # words already introduced by a chosen sentence
    learning_history = [0]

    for i in range(sentence_count):

        if metric not in ("vanilla", "def", "max", "max-avg"):
            print("error: invalid metric")
            return None

        # nothing left to choose from (also covers empty input)
        if not remaining_sentences:
            break

        if metric == "vanilla":
            # index 1 as before, but fall back to 0 when a single sentence is
            # left instead of raising IndexError
            chosen_index = min(1, len(remaining_sentences) - 1)

        elif metric == "def":
            chosen_index = get_ideal_index(remaining_sentences, remaining_frequencies)

        elif metric == "max":
            chosen_index = get_ideal_index_max(remaining_sentences, remaining_frequencies)

        else:
            chosen_index = get_ideal_index_max_avg(remaining_sentences, remaining_frequencies)

        if chosen_index == -1:
            print("error")
            return None

        sentence = remaining_sentences.pop(chosen_index)

        # distinct words of this sentence that have not been seen before
        newvocab = [ word for word in dict.fromkeys(sentence.split(' ')) if word not in vocab ]
        filtered = ' '.join(newvocab)

        # one sentence in, so one inner list out
        orders   = sentences_to_feature([filtered], 'orders', remaining_frequencies)[0]
        sfreqs   = sentences_to_feature([filtered], 'frequencies', remaining_frequencies)[0]

        vocab.update(newvocab)

        new_percentage = 100*np.sum(sfreqs)
        cumulative_return += new_percentage

        trunc = 100
        print('{} - return: {:.2f}% ({:.2f}% cumulative)'.format(i, new_percentage, cumulative_return),
              f'\n{chosen_index}:\t"{sentence[:trunc]}{"..."*int(len(sentence)>trunc)}"', '\n')

        learning_history.append(learning_history[-1] + new_percentage)
        sentences_ordered.append(sentence)

        # Remove the newly learnt words in a single call: all row indices
        # refer to the array as it is now, so they must be deleted together.
        if orders:
            remaining_frequencies = np.delete(remaining_frequencies, orders, 0)

    plt.plot(learning_history)

    return sentences_ordered, learning_history

In [None]:
# Pick five sentences with the "max-avg" metric; on success `output` is the
# (ordered sentences, learning history) pair.
output = get_in_order(sentences, frequencies, 5, metric="max-avg")

In [None]:
def get_cumulative_count(sentences, vocabulary=None):
    """Track how the known vocabulary grows as sentences are read in order.

    Parameters
    ----------
    sentences : iterable of str
        Sentences whose words are separated by single spaces.
    vocabulary : iterable of str, optional
        Words that count as vocabulary. Defaults to the module-level
        ``words`` list.

    Returns
    -------
    (list of int, list of set)
        ``vocabs[i]`` is the set of vocabulary words seen in the first ``i``
        sentences and ``counts[i]`` is its size. Both lists start with the
        empty state (``0`` and ``set()``).
    """
    # a set makes each membership test O(1) instead of scanning a list
    known = set(words if vocabulary is None else vocabulary)

    counts = [0]
    vocabs = [set()]
    for sentence in sentences:
        seen = vocabs[-1] | { token for token in sentence.split(' ') if token in known }
        vocabs.append(seen)
        # store the size of the vocabulary, not the builtin `len` itself
        counts.append(len(seen))
    return counts, vocabs