In [3]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from math import log
from scipy.optimize import curve_fit

In [8]:
def generate_files_dict(path):
    dfiles = {}

    for df in os.walk(path):
        if df[2]:
            for f in df[2]:                
                dfiles[df[0] + '/' + f] = {}
    return dfiles

In [14]:
def prepare_data(file):
    ydata = []
    words = []
    with open(file, encoding='iso-8859-1') as cw_novels:
        lines = cw_novels.readlines()
        for line in lines:
            line = line.replace('\x00','').replace('ÿþ', '').replace('\n', '')
            if re.search(r',', line):
                [count, word] = line.split(', ', maxsplit=1)
                word = word[2:-1] # CountWords.py doesn't work, needed a cast and saved data as binaries in a string 
                                  # (recast was a lot more work than doing this)
                if not re.search(r'\d|\.|_', word) \
                    and re.match(r'^[A-Za-z]+$', word):
                    ydata.insert(0, int(count))
                    words. insert(0, word)
    xdata = [x for x in range(1, len(ydata) + 1)]
    
    return xdata, ydata, words

def func_zipf(x, a, b, c):
    return c / (x + b) ** a

def fitting(xdata, ydata, func):
    popt, pcov = curve_fit(
        func, 
        xdata,
        ydata, 
        maxfev=2000
    )
    return popt, pcov

def fmse(xdata, ydata, popt, func):
    x = np.array(list(map(lambda x: func(x, *popt), xdata)))
    y = np.array(ydata)
    return ((x - y)**2).mean(axis=None)

def reduce_head_mse(x_data, y_data, func, max_rem=1000):
    rem = 1
    last_mse = -1
    mse = 0
    step = 200
    while rem < max_rem and abs(last_mse - mse) > 0.1:
        last_mse = mse
        x = x_data[rem:]
        y = y_data[rem:]
        popt, _ = fitting(x, y, func)
        mse = fmse(x, y, popt, func)
        rem += step
    return (rem - step), popt, mse

In [23]:
def explore_data(file):
    x, y, words = prepare_data(file)
    popt, _ = fitting(x, y, func_zipf)
    mse = fmse(x, y, popt, func_zipf)
    rem, popt_rem, mse_rem = reduce_head_mse(x, y, func_zipf, 4500)
    result = {}
    result['all'] = {}
    result['all']['y'] = y
    result['all']['words'] = words
    result['all']['popt'] = list(popt)
    result['all']['mse'] = mse
    result['remove'] = {}
    result['remove']['rem'] = rem
    result['remove']['popt'] = list(popt_rem)
    result['remove']['mse'] = mse_rem
    return result

In [11]:
data = generate_files_dict("./cwords_done")

In [25]:
results = {}
for k in data:
    results[k] = explore_data(k)



In [26]:
print(results)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [65]:
explore_data("./cwords_done/novels_wh.txt")

All data




Optimal parameters: [1.06552597e+00 1.35080095e+00 1.02328768e+05]
Mean squared error: 2196.78898920902

Removing the 801 most frequent words
Optimal parameters: [1.74598573e+00 8.79388096e+02 2.69612889e+07]
Mean squared error: 0.11822805178385377



In [66]:
explore_data("./cwords_done/novels_st.txt")

All data




Optimal parameters: [ 3.08752019e-01 -9.99999961e-01  1.07602604e+03]
Mean squared error: 1031548.7079973059

Removing the 1401 most frequent words
Optimal parameters: [1.92506009e+00 1.83174829e+03 1.22488140e+09]
Mean squared error: 0.2829371305888245

