In [None]:
import random
from datetime import datetime
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [None]:
# Load the topics-over-time table from the CSV export in the working directory.
topics_csv_path = 'DTM_collab_dataset_small_07_22__220000_manual_sentence-transformers_default.csv'
topics_over_time = pd.read_csv(topics_csv_path)


In [None]:
# Notebook parameters.
# time_step: how many values to predict.
# topic_number: the topic id shown by the inspection cells that filter on `Topic`.
time_step, topic_number = 1, 10


In [None]:
# Drop every row whose Topic is -1 (presumably the outlier/unassigned topic — confirm),
# then show the remaining (rows, columns).
keep_rows = topics_over_time['Topic'].ne(-1)
topics_over_time = topics_over_time.loc[keep_rows]
topics_over_time.shape


In [None]:
# Latest timestamp present in the data.
topics_over_time.loc[:, 'Timestamp'].max()

In [None]:
# Order rows chronologically within each topic so the per-topic shift/diff/rolling
# features computed later line up with time.
topics_over_time = topics_over_time.sort_values(by=['Topic', 'Timestamp'])

# Standardize Frequency to a z-score using the mean and standard deviation of the
# whole column (all topics together).
frequency_raw = topics_over_time['Frequency']
topics_over_time['Frequency'] = (frequency_raw - frequency_raw.mean()) / frequency_raw.std()


In [None]:
# All derived columns are computed per topic from the (standardized) Frequency column.
frequency_by_topic = topics_over_time.groupby('Topic')['Frequency']

# Target: the following row's frequency within the same topic.
topics_over_time['Frequency_Next_Year'] = frequency_by_topic.shift(-1)
# Previous row's frequency within the same topic.
topics_over_time['Lag-1'] = frequency_by_topic.shift(1)
# Change from the previous row within the same topic.
topics_over_time['Diff-1'] = frequency_by_topic.diff(1)
# Mean over a 4-row window; the group level is dropped from the result's index so
# the values align back onto the original rows.
rolling_mean = frequency_by_topic.rolling(4).mean()
topics_over_time['Rolling-4'] = rolling_mean.reset_index(level=0, drop=True)


In [None]:
# Inspect all rows (including the derived columns) for the selected topic.
topics_over_time.loc[topics_over_time['Topic'].eq(topic_number)]

In [None]:
# Split by topic: whole topics are held out, so no topic contributes rows to both sets.
N_TRAINING_TOPICS = 90  # number of topics drawn for training; the rest are used for testing
SPLIT_SEED = 0  # fixed seed so the split (and every metric below) is the same on each run

# Sort the unique topic ids so the sample depends only on the seed, not on set order.
topics_index = sorted(set(topics_over_time['Topic'].unique()))
if len(topics_index) <= N_TRAINING_TOPICS:
    raise ValueError(
        f'Need more than {N_TRAINING_TOPICS} topics to leave a non-empty test set, '
        f'got {len(topics_index)}'
    )

# A private Random instance keeps the split reproducible without reseeding the
# module-level generator.
training_topics_index = random.Random(SPLIT_SEED).sample(topics_index, N_TRAINING_TOPICS)
is_training_topic = topics_over_time['Topic'].isin(training_topics_index)
training = topics_over_time[is_training_topic]
testing = topics_over_time[~is_training_topic]

# Keep only training rows with no missing value in any column (the shifted, diffed
# and rolling columns are NaN at the edges of each topic's series).
training = training.dropna()


In [None]:
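# Disabled alternative: split by time (rows with Timestamp <= '2018' for training,
# later rows for testing) instead of holding out whole topics.
# training = topics_over_time[(topics_over_time['Timestamp'] <= '2018')]
# testing = topics_over_time[(topics_over_time['Timestamp'] > '2018')]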

In [None]:
# Preview the first five training rows.
training.head(n=5)

In [None]:
# Preview the first five testing rows.
testing.head(n=5)

In [None]:
# Training rows for the selected topic (empty if that topic was not drawn for training).
training.loc[training['Topic'].eq(topic_number)]


In [None]:
# Model inputs, in the column order passed to the imputer and the regressor.
features = [
    'Frequency',   # current (standardized) frequency
    'Lag-1',       # previous value within the topic
    'Diff-1',      # change from the previous value
    'Rolling-4',   # 4-row rolling mean
]


In [None]:
# Fit the imputer (default settings) on the training features; the same fitted
# imputer is applied to the test features later.
imputer = SimpleImputer()
ytr = training['Frequency_Next_Year']
Xtr = imputer.fit_transform(training[features])

# Random forest predicting the next frequency value from the feature columns.
mdl = RandomForestRegressor(
    n_estimators=600,
    n_jobs=16,
    random_state=0,
)
mdl.fit(Xtr, ytr)


In [None]:
# Keep only test rows that have a known target; missing feature values are filled
# by the imputer fitted on the training data.
has_target = testing['Frequency_Next_Year'].notna()
testing = testing[has_target]
yval = testing['Frequency_Next_Year']
Xval = imputer.transform(testing[features])

# Predictions for the held-out rows.
p = mdl.predict(Xval)


In [None]:
# Persistence baseline: use the current frequency as the prediction for the next one.
# Only rows missing the target or the current value are excluded, so the baseline is
# scored on the same test rows as the model rather than on the smaller subset of rows
# where every column happens to be present.
t_testing = testing.dropna(subset=['Frequency_Next_Year', 'Frequency'])
mean_absolute_percentage_error(t_testing['Frequency_Next_Year'], t_testing['Frequency'])


In [None]:
# Model MAPE on the test rows, followed by the first ten targets and predictions.
# NOTE(review): the targets derive from the standardized Frequency column, so they can
# be close to zero, where a percentage error becomes very large — confirm MAPE is the
# intended metric here.
print(mean_absolute_percentage_error(y_true=yval, y_pred=p))
for preview in (yval[:10], p[:10]):
    print(preview)


In [None]:
# Coefficient of determination of the predictions on the test rows.
r2_score(y_true=yval, y_pred=p)

In [None]:
# Predicted vs. actual scatter, with the y = x line as the perfect-prediction reference.
# Series.min()/.max() replace the Python-level min()/max() loops over the Series.
t = np.linspace(yval.min(), yval.max(), len(yval))
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=yval, y=p,
                    mode='markers',
                    name='Predictions'))
fig.add_trace(go.Scatter(x=t, y=t,
                    mode='lines',
                    name='Perfect prediction (y = x)'))

# Label the figure so it can be read on its own.
fig.update_layout(
    title='Predicted vs. actual Frequency_Next_Year on the test rows',
    xaxis_title='Actual Frequency_Next_Year (standardized)',
    yaxis_title='Predicted Frequency_Next_Year (standardized)',
)
fig.show()