In [1]:
# Standard library imports
import os
from datetime import datetime
import re

# Third party imports
import numpy as np
import dill

# Local application imports
from smile.regression import RegressionResultList

In [2]:
# Settings
seed = 3 # chosen by fair dice roll. guaranteed to be random. https://xkcd.com/221/
np.random.seed(seed)
np.set_printoptions(edgeitems=30, linewidth=100000)
pickle_pops_dir = r'D:\saved_populations_2'

In [3]:
def load_from_file(filename):
    with open(filename, 'rb') as f:
        return dill.load(f)

In [4]:
#parameters
npersons=1000 #per population
npops=1000 #across all files
slope_options = (1, 2, 3)
error_options = (0.3, 0.5)

In [5]:
def regress_mixed(self, **kwargs):
    li = []
    for i in range(len(self)):
        print(i)
        pop = self[i]
        li.append(pop.regress_mixed(**kwargs))
    return RegressionResultList(li, title=self.title+'\nregressed with mixed effects')
def regress(poplists, name_indexless, index=None, verbose=None):
    #poplists is a 2d array of poplists
    
    if index is not None: 
        suffix='_'+str(index)
        if verbose is None: verbose = False
    else: 
        suffix=''
        if verbose is None: verbose = True

    #preallocate arrays
    regresultslists = np.empty_like(poplists)

    #regress
    for tup in np.ndindex(poplists.shape):
        if verbose: 
            print()
            print(tup)
        regresultslists[tup] = regress_mixed(poplists[tup])

    if verbose: print("Done regressing.")
    return regresultslists

In [6]:
#timing
starttime = datetime.now()
print(f"Started at {starttime.strftime('%H:%M')}.")

try:
    nfiles_per_category = 1
    ncategories = 2 #poster and worddoc
    for i in range(nfiles_per_category):
        poplists = load_from_file(pickle_pops_dir+"\poster_sampled_poplists_"+str(i)+".pik")
        regress(poplists, "poster_sampled_poplists", index=i, verbose=True)
        poplists = load_from_file(pickle_pops_dir+"\worddoc_sampled_poplists_"+str(i)+".pik")
        regress(poplists, "worddoc_sampled_poplists", index=i, verbose=True)
        print(f"Done {ncategories*(i+1)}/{nfiles_per_category*ncategories}")
finally:
    #timing
    endtime = datetime.now()
    deltatime = int((endtime-starttime).total_seconds())
    print(f"Took {deltatime//3600} h {(deltatime%3600)//60} min {deltatime%60} s to run.")


Started at 18:50.

(0, 0, 0)
0




1




2




3




4




5




6




7




8




9





(0, 0, 1)
0


sampled by realistic has 9 NaN values


1


sampled by realistic has 9 NaN values


2


sampled by realistic has 3 NaN values


3


sampled by realistic has 3 NaN values


4


sampled by realistic has 3 NaN values


5




6




7


sampled by realistic has 6 NaN values


8




9


sampled by realistic has 15 NaN values



(0, 1, 0)
0




1




2




3




4




5




6




7




8




9





(0, 1, 1)
0


sampled by realistic has 495 NaN values


1


sampled by realistic has 603 NaN values


2


sampled by realistic has 495 NaN values


3


sampled by realistic has 609 NaN values


4


sampled by realistic has 468 NaN values


5


sampled by realistic has 480 NaN values


6


sampled by realistic has 546 NaN values


7


sampled by realistic has 495 NaN values


8


sampled by realistic has 543 NaN values


9


sampled by realistic has 525 NaN values



(1, 0, 0)
0




1




2




3




4




5




6




7




8




9





(1, 0, 1)
0




1




2




3




4




5




6




7




8




9





(1, 1, 0)
0




1




2




3




4




5




6




7




8




9





(1, 1, 1)
0


sampled by realistic has 600 NaN values


1


sampled by realistic has 462 NaN values


2


sampled by realistic has 408 NaN values


3


sampled by realistic has 498 NaN values


4


sampled by realistic has 450 NaN values


5


sampled by realistic has 480 NaN values


6


sampled by realistic has 528 NaN values


7


sampled by realistic has 474 NaN values


8


sampled by realistic has 426 NaN values


9


sampled by realistic has 492 NaN values



(2, 0, 0)
0




1




2




3




4




5




6




7




8




9





(2, 0, 1)
0




1




2




3




4




5




6




7




8




9





(2, 1, 0)
0




1




2




3




4




5




6




7




8




9





(2, 1, 1)
0


sampled by realistic has 354 NaN values


1


sampled by realistic has 456 NaN values


2


sampled by realistic has 354 NaN values


3


sampled by realistic has 432 NaN values


4


sampled by realistic has 444 NaN values


5


sampled by realistic has 456 NaN values


6


sampled by realistic has 498 NaN values


7


sampled by realistic has 498 NaN values


8


sampled by realistic has 450 NaN values


9


sampled by realistic has 462 NaN values


Done regressing.

(0, 0, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 999 NaN values



(0, 0, 1)
0


sampled by realistic has 999 NaN values


1


sampled by realistic has 1017 NaN values


2


sampled by realistic has 918 NaN values


3


sampled by realistic has 981 NaN values


4


sampled by realistic has 963 NaN values


5


sampled by realistic has 945 NaN values


6


sampled by realistic has 951 NaN values


7


sampled by realistic has 918 NaN values


8


sampled by realistic has 975 NaN values


9





(0, 1, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 891 NaN values



(0, 1, 1)
0


sampled by realistic has 801 NaN values


1


sampled by realistic has 783 NaN values


2


sampled by realistic has 825 NaN values


3


sampled by realistic has 837 NaN values


4


sampled by realistic has 771 NaN values


5


sampled by realistic has 789 NaN values


6


sampled by realistic has 879 NaN values


7


sampled by realistic has 834 NaN values


8


sampled by realistic has 918 NaN values


9





(1, 0, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 528 NaN values



(1, 0, 1)
0


sampled by realistic has 441 NaN values


1


sampled by realistic has 480 NaN values


2


sampled by realistic has 468 NaN values


3


sampled by realistic has 504 NaN values


4


sampled by realistic has 501 NaN values


5


sampled by realistic has 537 NaN values


6


sampled by realistic has 450 NaN values


7


sampled by realistic has 504 NaN values


8


sampled by realistic has 471 NaN values


9





(1, 1, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 552 NaN values



(1, 1, 1)
0


sampled by realistic has 594 NaN values


1


sampled by realistic has 693 NaN values


2


sampled by realistic has 546 NaN values


3


sampled by realistic has 534 NaN values


4




5


sampled by realistic has 585 NaN values
sampled by realistic has 567 NaN values


6


sampled by realistic has 486 NaN values


7


sampled by realistic has 585 NaN values


8


sampled by realistic has 591 NaN values


9





(2, 0, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 255 NaN values



(2, 0, 1)
0


sampled by realistic has 333 NaN values


1


sampled by realistic has 273 NaN values


2


sampled by realistic has 327 NaN values


3


sampled by realistic has 294 NaN values


4


sampled by realistic has 282 NaN values


5


sampled by realistic has 285 NaN values


6


sampled by realistic has 321 NaN values


7


sampled by realistic has 294 NaN values


8


sampled by realistic has 294 NaN values


9





(2, 1, 0)
0




1




2




3




4




5




6




7




8




9


sampled by realistic has 558 NaN values



(2, 1, 1)
0


sampled by realistic has 345 NaN values


1


sampled by realistic has 372 NaN values


2


sampled by realistic has 444 NaN values


3


sampled by realistic has 432 NaN values


4


sampled by realistic has 393 NaN values


5


sampled by realistic has 441 NaN values


6


sampled by realistic has 471 NaN values


7


sampled by realistic has 462 NaN values


8


sampled by realistic has 483 NaN values


9




Done regressing.
Done 2/2
Took 0 h 48 min 37 s to run.




In [7]:
print(load_from_file(pickle_pops_dir+"\poster_poplists_"+str(0)+".pik")[0,0].summarize())

Title: 'list of poster with 1 and 0.3'
Titles: [... 'poster with 1 and 0.3' ...]
N Persons: 10000 / 10000 = 1.00
N Days: [... 160 ...]


In [19]:
#timing
starttime = datetime.now()
print(f"Started at {starttime.strftime('%H:%M')}.")

outs = []

try:
    nfiles_per_category = 1
    ncategories = 2 #poster and worddoc
    for i in range(nfiles_per_category):
        #poplists = load_from_file(pickle_pops_dir+"\poster_poplists_"+str(i)+".pik")
        #regress(poplists, "poster_sampled_poplists", index=i, verbose=True)
        poplists = load_from_file(pickle_pops_dir+"\worddoc_poplists_"+str(i)+".pik")
        outs.append(regress(poplists, "worddoc_sampled_poplists", index=i, verbose=True))
        print(f"Done {ncategories*(i+1)}/{nfiles_per_category*ncategories}")
        
finally:
    #timing
    endtime = datetime.now()
    deltatime = int((endtime-starttime).total_seconds())
    print(f"Took {deltatime//3600} h {(deltatime%3600)//60} min {deltatime%60} s to run.")


Started at 00:39.

(0, 0)
0




1




2




3




4




5




6




7




8




9
Took 0 h 7 min 3 s to run.


KeyboardInterrupt: 

In [21]:
for res in outs[0][0,0].unwrap(): print(res.summary().tables[0].iloc[4,2:4].to_numpy())

IndexError: list index out of range

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tools.sm_exceptions import ConvergenceWarning

In [None]:
data = sm.datasets.get_rdataset('dietox', 'geepack').data
md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())

In [None]:
data = sm.datasets.get_rdataset('dietox', 'geepack').data
md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"], re_formula="~Time")
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())

In [None]:
data = sm.datasets.get_rdataset('dietox', 'geepack').data
md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"], re_formula="~ 0")
mdf = md.fit(method=["lbfgs"])
print(mdf.summary())