In [None]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from a local pickle file; later cells index the result by
# column name, i.e. treat it as a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
anx_data = pd.read_pickle('symp_anx_rec.p')

In [None]:
# Print the column names, dtypes and non-null counts of the loaded data.
anx_data.info()

In [None]:
# Standardise 'prop' (zero mean, unit variance) and store it as 'prop_scaled'.
# StandardScaler expects a 2-D input, hence the reshape to a single column.
prop_column = numpy.array(anx_data['prop']).reshape(-1, 1)
prop_scaler = StandardScaler()
anx_data['prop_scaled'] = prop_scaler.fit_transform(prop_column)

In [None]:
# Modelling frame: keep only the rows that have no missing value in any
# column (these are dropna's defaults, spelled out here).
anx_target = anx_data.dropna(axis=0, how='any')

In [None]:
# Plot the raw 'prop' values of the complete rows against the row index.
anx_target['prop'].plot()

In [None]:
# Same plot for the standardised 'prop_scaled' column, for comparison with
# the raw 'prop' values.
anx_target['prop_scaled'].plot()

In [None]:
test_vars = ['covid_symptoms', 'dateofbirth_year']

# Columns that must not be used as predictors. 'anxiety' is the regression
# target; the rest are left out of the model. The float entries 0.0 and 1.0
# are column labels, not values.
excluded_columns = {
    'date', 'anxiety', 'loc_code',
    'Unnamed: 0_x', 'id_code', 'Unnamed: 0_y',
    'postcode', 'postcode.1', 'ccg',
    'date_rank', 'prop', 'lat', 'long', 0.0, 1.0,
    'covid_stay_home',
}

# Every remaining column, in frame order, is an independent variable.
ind_vars = [
    column for column in anx_data.columns
    if column not in excluded_columns
]
ind_vars

In [None]:
# Hold out a test set (scikit-learn's default split, 25 % test) for
# evaluating the tree. random_state is pinned so that Restart & Run All
# reproduces the same split, and therefore the same fitted tree and score.
ind_train, ind_test, dep_train, dep_test = train_test_split(
    anx_target[ind_vars], anx_target['anxiety'], random_state=42)

In [None]:
# Shallow regression tree predicting 'anxiety' from the independent variables.
empty_tree = tree.DecisionTreeRegressor(
    # criterion is deliberately left at its default (squared error). The old
    # spelling 'mse' was removed in scikit-learn 1.2 and raises there, while
    # the new spelling 'squared_error' does not exist before 1.0; the default
    # selects the same criterion on every version.
    max_depth=4,
    min_samples_split=100,
    min_samples_leaf=100,
    ccp_alpha=0.0,  # no cost-complexity pruning
    # Ties between equally good splits are broken at random; pin the seed so
    # the fitted tree is reproducible across runs.
    random_state=42,
)
# fit() returns the estimator itself, so fitted_tree and empty_tree refer to
# the same (now fitted) object.
fitted_tree = empty_tree.fit(ind_train, dep_train)

In [None]:
# Coefficient of determination (R^2) of the fitted tree on the held-out
# test split.
fitted_tree.score(ind_test, dep_test)

In [None]:
# Display labels for the tree plot: identical to ind_vars except that the
# standardised 'prop_scaled' feature is shown as 'Local Infection'.
# The label is matched by column name rather than by position, so it stays
# attached to the right feature even if 'prop_scaled' is not the last column.
ind_vars_labels = [
    'Local Infection' if name == 'prop_scaled' else name
    for name in ind_vars
]
ind_vars_labels

In [None]:
# Draw the fitted tree on a large canvas and write it to 'local_tree.png'.
fig, ax = plt.subplots(figsize=(30, 20))
image = tree.plot_tree(
    fitted_tree,
    ax=ax,
    feature_names=ind_vars_labels,
    max_depth=4,
    # Node content: hide the impurity and show sample counts, not proportions.
    impurity=False,
    proportion=False,
    precision=2,
    # Node appearance.
    filled=True,
    rounded=True,
    fontsize=15,
)
fig.savefig('local_tree.png')

In [None]:
# Number of non-null 'id_code' entries recorded on each date.
dates = anx_data.groupby('date')['id_code'].count()

In [None]:
# Show the dates whose 'id_code' count is below 30.
dates[dates < 30]