### An analysis which uses the best performing forecasting method on multiple data splits

It takes as input the time series obtained from DTM.

The best-performing regression model was determined in the Regression Parameter Tuning analysis.

The data is used in two different ways for training and testing:
* random sampling it from the whole dataset, with a 90-10 ratio
* creating progressive splits, moving the start and end date

All the data splits will be evaluated with K-fold cross-validation.

The end product of this analysis will be to use the model in order to predict the value of the next time step

### Initialisation and parameter settings

In [None]:
import datetime
import pickle
import random
import sys
import time

import fastcluster
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.preprocessing import OneHotEncoder
from tqdm.notebook import tqdm, trange


In [None]:
# Load the topic time series exported by the DTM step into a DataFrame.
topics_over_time = pd.read_csv('output/DTM/DTM_collab_test_manual_sentence-transformers(1-2)_full.csv')

In [None]:
def train_test_model(train_df,test_df,features,verbose=False,
                     imputer_type='simple'):
  """Fit a RandomForestRegressor on ``train_df`` and score it on ``test_df``.

  The forest's hyperparameters are not arguments: they are read from the
  notebook-level variables ``estimators``, ``max_features``, ``max_depth``,
  ``min_samples_leaf``, ``min_samples_split`` and ``btstr``, which must be
  defined before this function is called.

  Parameters
  ----------
  train_df : DataFrame
      Holds the ``features`` columns and the ``Frequency_Next_Year`` target.
      Rows without a target are dropped before fitting.
  test_df : DataFrame
      Same columns as ``train_df``. Rows without a target are still
      predicted, but are left out of the metrics and of the returned arrays.
  features : list of str
      Names of the columns used as model inputs.
  verbose : bool
      When True, print MSE/MAE/MAPE/R2 and the test rows that have no target
      together with their predictions.
  imputer_type : str
      ``'knn'`` selects ``KNNImputer(n_neighbors=2)``; any other value selects
      ``SimpleImputer()``.

  Returns
  -------
  tuple
      ``(pickled_model, r2, y_true, y_pred)`` where ``pickled_model`` is the
      fitted model serialised with ``pickle.dumps`` and ``y_true``/``y_pred``
      only cover the test rows that have a target.
  """

  # Training phase
  train_df = train_df.dropna(subset=['Frequency_Next_Year'])
  if imputer_type == 'knn':
    imputer = KNNImputer(n_neighbors=2)
  else:
    imputer = SimpleImputer()

  x_training = imputer.fit_transform(train_df[features])
  y_training = train_df['Frequency_Next_Year']
  mdl = RandomForestRegressor(n_estimators=estimators,
                              max_features=max_features,
                              max_depth=max_depth,
                              min_samples_leaf=min_samples_leaf,
                              min_samples_split=min_samples_split,
                              bootstrap=btstr,
                              random_state=0, n_jobs=16)
  mdl.fit(x_training,y_training)

  # Testing phase
  # Re-use the imputer fitted on the training rows: re-fitting it on the test
  # rows would fill gaps with test-set statistics and could even produce a
  # different number of columns than the model was trained on.
  x_testing = imputer.transform(test_df[features])
  y_testing = test_df['Frequency_Next_Year']
  predicted_testing = mdl.predict(x_testing)
  missing_target = np.isnan(y_testing)
  mask = ~missing_target
  r2 = r2_score(y_testing[mask],predicted_testing[mask])

  if verbose:
    # Output metrics for results
    print(f'MSE: {mean_squared_error(y_testing[mask],predicted_testing[mask])}')
    print(f'MAE: {mean_absolute_error(y_testing[mask],predicted_testing[mask])}')
    print(f'MAPE: {mean_absolute_percentage_error(y_testing[mask],predicted_testing[mask])}')
    print(f'R2: {r2}')

    # Output results for visual comparison: rows that have no known target
    # and the value the model predicts for them
    print(test_df.loc[missing_target])
    print(predicted_testing[missing_target])
  s_mdl = pickle.dumps(mdl)

  return (s_mdl, r2, y_testing[mask], predicted_testing[mask])

  

In [None]:
def plot_prediction(yval,p):
  """Show predicted values against actual values in a plotly figure.

  ``yval`` (actual values) goes on the x axis and ``p`` (predictions) on the
  y axis as markers. A y = x line spanning the range of ``yval`` is drawn on
  top as a reference.
  """
  lowest, highest = min(yval), max(yval)
  diagonal = np.linspace(lowest, highest, len(yval))

  prediction_trace = go.Scatter(x=yval, y=p,
                                mode='markers',
                                name='markers')
  reference_trace = go.Scatter(x=diagonal, y=diagonal,
                               mode='lines',
                               name='lines')

  fig = go.Figure()
  fig.add_trace(prediction_trace)
  fig.add_trace(reference_trace)
  fig.show()

### Feature engineering

It creates several features to enhance the performance of the model

In [None]:
# (rows, columns) of the loaded table
topics_over_time.shape

In [None]:
# Distinct topic ids, in order of first appearance in the table
topics = topics_over_time['Topic'].unique()
len(topics)

In [None]:
# Distinct time steps, in order of first appearance in the table
timestamps = topics_over_time['Timestamp'].unique()
len(timestamps)

In [None]:
# Normalisation methods available: None | mean | min-max
# 'mean' standardises Frequency with its mean and standard deviation,
# 'min-max' rescales it with its minimum and maximum, None leaves it as is.
normalisation_method = None

# Hyperparameter tuning
# True runs the grid search cells further down; False makes the notebook use
# the fixed hyperparameter values defined alongside the search grid.
hyperparameter_tuning = False

In [None]:
# Order the rows by topic and, within a topic, by time step.
topics_over_time = topics_over_time.sort_values(by=['Topic','Timestamp'])

# Optionally rescale Frequency over the whole table:
#   'mean'    -> (x - mean) / std
#   'min-max' -> (x - min) / (max - min)
if normalisation_method in ('mean', 'min-max'):
  frequency = topics_over_time['Frequency']
  if normalisation_method == 'mean':
    offset = frequency.mean()
    scale = frequency.std()
  else:
    offset = frequency.min()
    scale = frequency.max() - offset
  topics_over_time['Frequency'] = (frequency - offset) / scale

In [None]:
# All derived columns are computed per topic, so values never leak from one
# topic's series into another's.
frequency_by_topic = topics_over_time.groupby('Topic')['Frequency']

# Target: the frequency observed at the following time step
topics_over_time['Frequency_Next_Year'] = frequency_by_topic.shift(-1)
# Frequency at the previous time step
topics_over_time['Lag-1'] = frequency_by_topic.shift(1)
# Change with respect to the previous time step
topics_over_time['Diff-1'] = frequency_by_topic.diff(1)
# Mean over a window of 4 time steps; the group level added by rolling() is
# dropped so the result aligns with the original row index
rolling_mean = frequency_by_topic.rolling(4).mean()
topics_over_time['Rolling-4'] = rolling_mean.reset_index(level=0,drop=True)

Use OneHotEncoder to represent the Timestamp and Topic features

In [None]:
features_df = topics_over_time[['Timestamp','Topic','Frequency_Next_Year',
                                'Frequency','Lag-1','Diff-1','Rolling-4']]

# One-hot encode the two categorical columns; the remaining columns are
# passed through unchanged and placed after the encoded ones.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in newer
# scikit-learn releases - adjust when upgrading.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto', sparse=False),
      ['Timestamp','Topic'])],  remainder='passthrough'
)

X = ct.fit_transform(features_df)

# Name the encoded columns from the categories the fitted encoder actually
# used. With categories='auto' the encoder orders categories by value, which
# is not necessarily the order of first appearance returned by .unique(), so
# building the names from .unique() could attach labels to the wrong columns.
year_categories, topic_categories = (
    ct.named_transformers_['one_hot_encoder'].categories_)

features = [f'Is_Year_{year}' for year in year_categories]
features.extend(f'Is_Topic_{topic}' for topic in topic_categories)
features.extend(['Frequency_Next_Year','Frequency','Lag-1',
                 'Diff-1','Rolling-4'])
one_hot_encoded_df = pd.DataFrame(X, columns = features)

# `features` is used as the list of model inputs from here on, so the target
# column must not be part of it.
features.remove('Frequency_Next_Year')
one_hot_encoded_df

In [None]:
# Correlation heatmap of every column of the one-hot encoded table.
f, ax = plt.subplots(figsize=(10, 8))
corr = one_hot_encoded_df.corr()
# The mask is all False, i.e. no cell is hidden. The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy releases.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

Hyperparameter Tuning for RandomForestRegressor

In [None]:
# max_features=1.0 means "consider every feature at each split". For a
# regressor this is what the former 'auto' option did; 'auto' itself is no
# longer accepted by newer scikit-learn releases, while 1.0 works on old and
# new releases alike.
if hyperparameter_tuning:
  # Candidate values explored by the grid search
  n_estimators = [5, 20, 50, 100, 200, 300, 500, 600, 1000] # number of trees in the random forest
  max_features = [1.0, 'sqrt'] # number of features in consideration at every split
  max_depth = [int(x) for x in np.linspace(10, 120, num = 12)] # maximum number of levels allowed in each decision tree
  min_samples_split = [2, 6, 10] # minimum sample number to split a node
  min_samples_leaf = [1, 3, 4] # minimum sample number that can be stored in a leaf node
  bootstrap = [True, False] # method used to sample data points
else:
  # Fixed values used when no tuning is run
  estimators = 50
  max_features = 1.0
  max_depth = 40
  min_samples_split = 2
  min_samples_leaf = 1
  btstr = False

In [None]:
if hyperparameter_tuning:

  # Columns of the per-run results table, in the order they are written out.
  # (The original table declared 'execution_datetime' twice; the second entry
  # was meant to be 'execution_time'.)
  result_columns = ['execution_datetime', 'execution_time', 'split_number',
                    'r2', 'mape', 'mae', 'mse',
                    'n_estimators', 'max_features', 'max_depth',
                    'min_samples_split', 'min_samples_leaf', 'bootstrap']
  hyperparameter_columns = ['n_estimators','max_features','max_depth',
                            'min_samples_split','min_samples_leaf','bootstrap']

  # Fixed seed so the folds, and therefore the selected hyperparameters, are
  # reproducible across runs.
  shuffled_topics_over_time = one_hot_encoded_df.sample(frac=1, random_state=0)
  dataset_lenght = len(shuffled_topics_over_time)
  dataset_split = int(dataset_lenght/10)

  # One dict per (fold, hyperparameter combination); turned into a DataFrame
  # once at the end instead of appending row by row, which is quadratic and
  # relies on DataFrame.append (not available in current pandas).
  run_records = []

  for index in trange(10):
    run_entry = {'split_number':(index+1)}

    # 10-fold cross-validation: fold `index` is held out for testing and the
    # other nine folds are used for training (90/10 ratio). Rows left over
    # when the dataset size is not a multiple of 10 always end up in training.
    fold_start = index*dataset_split
    fold_end = (index+1)*dataset_split
    test_df = shuffled_topics_over_time.iloc[fold_start:fold_end]
    train_df = pd.concat([shuffled_topics_over_time.iloc[:fold_start],
                          shuffled_topics_over_time.iloc[fold_end:]],
                         ignore_index=True)

    # Data preparation
    imputer = SimpleImputer()
    train_df = train_df.dropna(subset=['Frequency_Next_Year'])

    x_training = imputer.fit_transform(train_df[features])
    y_training = train_df['Frequency_Next_Year']
    x_testing = imputer.transform(test_df[features])
    y_testing = test_df['Frequency_Next_Year'].to_numpy()

    # Test rows without a known target cannot be scored; the mask only
    # depends on the fold, so it is computed once here.
    mask = ~np.isnan(y_testing)
    y_true = y_testing[mask]

    for estimators in tqdm(n_estimators):
      run_entry['n_estimators'] = estimators
      for mx_features in max_features:
        run_entry['max_features'] = mx_features
        for mx_depth in max_depth:
          run_entry['max_depth'] = mx_depth
          for mn_samples_leaf in min_samples_leaf:
            run_entry['min_samples_leaf'] = mn_samples_leaf
            for mn_samples_split in min_samples_split:
              run_entry['min_samples_split'] = mn_samples_split
              for btstr in bootstrap:
                run_entry['bootstrap'] = btstr
                start_time = time.time()

                # Training phase
                mdl = RandomForestRegressor(n_estimators=estimators,
                                            max_features=mx_features,
                                            max_depth=mx_depth,
                                            min_samples_leaf=mn_samples_leaf,
                                            min_samples_split=mn_samples_split,
                                            bootstrap=btstr,
                                            random_state=0, n_jobs=16)
                mdl.fit(x_training,y_training)

                # Testing phase
                predicted_testing = mdl.predict(x_testing)
                # Wall-clock time of fit + predict for this combination
                run_entry['execution_time'] = (time.time() - start_time)
                run_entry['execution_datetime'] = datetime.datetime.now()

                y_pred = predicted_testing[mask]
                run_entry['r2'] = r2_score(y_true, y_pred)
                run_entry['mse'] = mean_squared_error(y_true, y_pred)
                run_entry['mae'] = mean_absolute_error(y_true, y_pred)
                run_entry['mape'] = mean_absolute_percentage_error(y_true,
                                                                   y_pred)

                # Store a copy: run_entry is mutated on the next iteration
                run_records.append(dict(run_entry))

  hyperparameter_tuning_random_forest_df = pd.DataFrame(
      run_records, columns=result_columns)

  # Same timestamp for both output files
  run_timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
  hyperparameter_tuning_random_forest_df.to_csv(f'results/Hyperparameter Tuning/{run_timestamp}_hyperparameter_tuning_rfr.csv')

  # Average the metrics of each hyperparameter combination over the folds and
  # rank: highest R2 first, ties broken by lowest MSE, MAE, MAPE.
  # numeric_only=True keeps the non-numeric execution_datetime column out of
  # the average.
  grouped_hyperparameters = (
      hyperparameter_tuning_random_forest_df
      .groupby(by=hyperparameter_columns)
      .mean(numeric_only=True)
      .sort_values(by=['r2','mse','mae','mape'],
                   ascending=[False,True,True,True]))
  grouped_hyperparameters.to_csv(
    f'results/Hyperparameter Tuning/{run_timestamp}_grouped_hyperparameter_tuning_rfr.csv')
  grouped_hyperparameters = grouped_hyperparameters.reset_index()

  # Keep the best combination in the notebook-level variables read by
  # train_test_model. This overwrites the candidate lists `max_features`,
  # `max_depth`, `min_samples_split` and `min_samples_leaf` with scalars, so
  # the grid cell must be re-run before tuning again. Integers and the
  # boolean are converted back to builtin types.
  estimators = int(grouped_hyperparameters['n_estimators'].iloc[0])
  max_features = grouped_hyperparameters['max_features'].iloc[0]
  max_depth = int(grouped_hyperparameters['max_depth'].iloc[0])
  min_samples_split = int(grouped_hyperparameters['min_samples_split'].iloc[0])
  min_samples_leaf = int(grouped_hyperparameters['min_samples_leaf'].iloc[0])
  btstr = bool(grouped_hyperparameters['bootstrap'].iloc[0])


In [None]:
# Per-run results of the grid search (only exists when tuning was run)
if hyperparameter_tuning:
  print(hyperparameter_tuning_random_forest_df)

In [None]:
# Metrics averaged per hyperparameter combination, best combination first
if hyperparameter_tuning:
  print(grouped_hyperparameters)

In [None]:
# Fixed seed so the folds, the scores and the saved model are reproducible.
shuffled_topics_over_time = one_hot_encoded_df.sample(frac=1, random_state=0)
dataset_lenght = len(shuffled_topics_over_time)
dataset_split = int(dataset_lenght/10)

# Start below any possible score: R2 has no lower bound, so starting at -1
# could leave no model selected when every fold scores worse than that.
best_r2 = float('-inf')
sum_r2 = 0
best_yval = None
best_ypredicted = None

for index in range(10):
  print(f'Split # {(index+1)}')

  # 10-fold cross-validation: fold `index` is held out for testing and the
  # other nine folds are used for training (90/10 ratio). Rows left over when
  # the dataset size is not a multiple of 10 always end up in training.
  fold_start = index*dataset_split
  fold_end = (index+1)*dataset_split
  test_data = shuffled_topics_over_time.iloc[fold_start:fold_end].reset_index(
      drop=True)
  train_data = pd.concat([shuffled_topics_over_time.iloc[:fold_start],
                          shuffled_topics_over_time.iloc[fold_end:]],
                         ignore_index=True)

  s_mdl, r2, yval, ypredicted = train_test_model(train_data,test_data,features)
  sum_r2 += r2

  # Keep (and persist) the model of the best-scoring fold
  if r2 > best_r2:
    best_r2 = r2
    regression_model = pickle.loads(s_mdl)
    dump(regression_model,
         'models/ohe_best_model.joblib')
    best_yval = yval
    best_ypredicted = ypredicted

In [None]:
# Mean R2 over the 10 folds
f"Average R2 score: {sum_r2/10}"

In [None]:
# R2 of the best-scoring fold
f"Best R2 score: {best_r2}"

In [None]:
# Predicted vs. actual values for the best-scoring fold
plot_prediction(best_yval,best_ypredicted)

Check features

In [None]:
# Replaces the one-hot feature list: from here on the models use the raw
# Topic value together with the frequency-derived columns.
features = ['Topic','Frequency','Lag-1','Diff-1','Rolling-4']

In [None]:
# Clustered heatmap of the correlations between the selected features
sns.clustermap(topics_over_time[features].corr())

In [None]:
# Inspect every row of a single topic (id 36)
topics_over_time[topics_over_time['Topic']==36]

In [None]:
# Distinct values of the 'Words' column for topic 36
value = topics_over_time[topics_over_time['Topic']==36]['Words'].unique()
value

### First data split: percentage based sampling from each topic

In [None]:
topics_timeseries = topics_over_time[['Topic','Frequency','Timestamp',
                                      'Frequency_Next_Year','Lag-1',
                                      'Diff-1',
                                      'Rolling-4']]
topics_list = list(topics_timeseries['Topic'].unique())

# Per topic, in chronological order:
#   last row           -> prediction set (its next-step value is unknown)
#   second-to-last row -> test set
#   all earlier rows   -> training set
# The slices are collected in lists and concatenated once, instead of growing
# three DataFrames inside the loop.
train_parts = []
test_parts = []
predict_parts = []

for topic in topics_list:
  temp_df = topics_timeseries[topics_timeseries['Topic']==topic].sort_values(
      'Timestamp')
  topic_size = len(temp_df)
  predict_parts.append(temp_df.iloc[topic_size-1:])
  test_parts.append(temp_df.iloc[topic_size-2:topic_size-1])
  train_parts.append(temp_df.iloc[:topic_size-2])

train_data = pd.concat(train_parts, ignore_index=True)
test_data = pd.concat(test_parts, ignore_index=True)
predict_data = pd.concat(predict_parts, ignore_index=True)

timeseries_mdl,r2_timeseries, yval, ypredicted = train_test_model(train_data,
                                                                  test_data,
                                                                  features,
                                                                  verbose=True)
mdl = pickle.loads(timeseries_mdl)

# Fill gaps in the prediction rows the same way the model's training inputs
# were filled: a SimpleImputer fitted on the training rows that have a target
# (train_test_model's default). Fitting a separate imputer on the prediction
# rows themselves would feed the model differently prepared inputs.
imputer = SimpleImputer()
imputer.fit(train_data.dropna(subset=['Frequency_Next_Year'])[features])
to_predict_values = imputer.transform(predict_data[features])
predicted_values = mdl.predict(to_predict_values)
predict_data['Frequency_Next_Year'] = predicted_values

In [None]:
# Last observation of every topic, with the predicted next-step frequency
predict_data

In [None]:
# Predicted vs. actual values on the per-topic test rows
plot_prediction(yval,ypredicted)

### Second data split: random sampling from dataset(90/10)

In [None]:
# Fixed seed so the folds, the scores and the saved model are reproducible.
shuffled_topics_over_time = topics_over_time[['Topic','Frequency','Timestamp',
                                              'Frequency_Next_Year','Lag-1',
                                              'Diff-1',
                                              'Rolling-4']].sample(
                                                  frac=1, random_state=0)
dataset_lenght = len(shuffled_topics_over_time)
dataset_split = int(dataset_lenght/10)

# Start below any possible score: R2 has no lower bound, so starting at -1
# could leave no model selected when every fold scores worse than that.
best_r2 = float('-inf')
sum_r2 = 0
best_yval = None
best_ypredicted = None

# Redirect output for logging purposes. The `with` block closes the log file
# and the `finally` restores stdout even when a fold raises, so a failure
# cannot leave the notebook printing into a log file.
orig_stdout = sys.stdout
with open(f'logs/{datetime.datetime.now().strftime("%Y-%m-%d_%H_%M")}_random_regression.log', "w") as log_file:
  sys.stdout = log_file
  try:
    print(datetime.datetime.now())

    for index in range(10):
      print(f'Split # {(index+1)}')

      # 10-fold cross-validation: fold `index` is held out for testing and
      # the other nine folds are used for training (90/10 ratio). Rows left
      # over when the dataset size is not a multiple of 10 always end up in
      # training.
      fold_start = index*dataset_split
      fold_end = (index+1)*dataset_split
      test_data = shuffled_topics_over_time.iloc[
          fold_start:fold_end].reset_index(drop=True)
      train_data = pd.concat([shuffled_topics_over_time.iloc[:fold_start],
                              shuffled_topics_over_time.iloc[fold_end:]],
                             ignore_index=True)

      s_mdl, r2, yval, ypredicted = train_test_model(train_data,test_data,
                                                     features,verbose=True)
      sum_r2 += r2

      # Keep (and persist) the model of the best-scoring fold
      if r2 > best_r2:
        best_r2 = r2
        regression_model = pickle.loads(s_mdl)
        dump(regression_model,
             'models/best_model.joblib')
        best_yval = yval
        best_ypredicted = ypredicted
  finally:
    # Restore default output
    sys.stdout = orig_stdout



In [None]:
# Mean R2 over the 10 folds
f"Average R2 score: {sum_r2/10}"

In [None]:
# R2 of the best-scoring fold
f"Best R2 score: {best_r2}"

In [None]:
# Predicted vs. actual values for the best-scoring fold
plot_prediction(best_yval,best_ypredicted)