In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read through the local filesystem.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
!pip install bertopic

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from bertopic import BERTopic
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm.notebook import trange

In [None]:
# Restore the previously saved BERTopic model from Drive.
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/topic-modelling-risk-factors/models/BERTopic_model_collab_custom_default_default"
topic_model = BERTopic.load(MODEL_PATH)

In [None]:
# Search term that is looked up against the topic model in the next cell.
input_term = "tractor"

In [None]:
# Ask the model for at most five topics matching `input_term`; the call returns
# two values, unpacked as the topics and their similarity scores.
# NOTE(review): presumably ordered from most to least similar - confirm against
# the BERTopic documentation.
similar_topics, similarity = topic_model.find_topics(input_term, top_n=5)

In [None]:
# Display the first topic returned for the search term.
similar_topics[0]

In [None]:
# Read the saved DTM results table from Drive. The cells below rely on its
# "Topic", "Timestamp" and "Frequency" columns.
DTM_RESULTS_PATH = "/content/drive/MyDrive/Colab Notebooks/topic-modelling-risk-factors/results/DTM/DTM_collab_custom_default_default.csv"
topics_over_time = pd.read_csv(DTM_RESULTS_PATH)

In [None]:
# Distinct values of the "Topic" column, in order of first appearance;
# the cell output is how many there are.
topics = pd.unique(topics_over_time["Topic"])
len(topics)

In [None]:
# Distinct values of the "Timestamp" column, in order of first appearance;
# the cell output is how many there are.
timestamps = pd.unique(topics_over_time["Timestamp"])
len(timestamps)

In [None]:
# How the "Frequency" column is rescaled in the next cell:
#   None      - leave the values untouched
#   "mean"    - subtract the column mean and divide by the column standard
#               deviation (z-score)
#   "min-max" - subtract the column minimum and divide by the column range
normalisation_method = None

In [None]:
# Order the rows by topic and then by timestamp so that the per-topic shift,
# diff and rolling features computed later follow each topic's time order.
topics_over_time = topics_over_time.sort_values(by=["Topic", "Timestamp"])

# Optionally rescale "Frequency". The statistics are taken over the whole
# column, i.e. across all topics together, not per topic.
raw_frequency = topics_over_time["Frequency"]
if normalisation_method is None:
    pass  # keep the raw frequencies
elif normalisation_method == "mean":
    topics_over_time["Frequency"] = (
        raw_frequency - raw_frequency.mean()
    ) / raw_frequency.std()
elif normalisation_method == "min-max":
    topics_over_time["Frequency"] = (raw_frequency - raw_frequency.min()) / (
        raw_frequency.max() - raw_frequency.min()
    )
else:
    # Fail loudly on a misspelt option instead of silently skipping the
    # normalisation step.
    raise ValueError(
        f"Unknown normalisation_method {normalisation_method!r}; "
        "expected None, 'mean' or 'min-max'"
    )

In [None]:
# Per-topic view of the frequency series; every feature below is computed
# within a topic so values never leak from one topic into another.
frequency_by_topic = topics_over_time.groupby("Topic")["Frequency"]

# Next row's frequency within the topic (NaN on each topic's last row).
topics_over_time["Frequency_Next_Year"] = frequency_by_topic.shift(-1)
# Previous row's frequency and the change since the previous row.
topics_over_time["Lag-1"] = frequency_by_topic.shift(1)
topics_over_time["Diff-1"] = frequency_by_topic.diff(1)

# Mean over the current and three preceding rows. The rolling result is
# indexed by (Topic, original index); dropping the Topic level lets it align
# with the frame on assignment.
rolling_mean_4 = frequency_by_topic.rolling(4).mean()
topics_over_time["Rolling-4"] = rolling_mean_4.reset_index(level=0, drop=True)

# True when the next row's frequency is higher than the current one. Rows with
# no next value compare against NaN and therefore come out as False.
frequency_change = topics_over_time["Frequency_Next_Year"].sub(
    topics_over_time["Frequency"]
)
topics_over_time["is_growing"] = frequency_change.gt(0)

In [None]:
# Per-topic rolling means of "Frequency". Each rolling result is indexed by
# (Topic, original index); the Topic level is dropped so it aligns with the
# frame on assignment.
frequency_by_topic = topics_over_time.groupby("Topic")["Frequency"]

rolling_mean_2 = frequency_by_topic.rolling(2).mean()
topics_over_time["Rolling-2"] = rolling_mean_2.reset_index(level=0, drop=True)

# NOTE(review): a window of 1 averages a single value, so "Significance"
# mirrors "Frequency" - confirm the intended window size.
rolling_mean_1 = frequency_by_topic.rolling(1).mean()
topics_over_time["Significance"] = rolling_mean_1.reset_index(level=0, drop=True)

# Four-row mean minus two-row mean.
topics_over_time["MACD"] = topics_over_time["Rolling-4"].sub(
    topics_over_time["Rolling-2"]
)

# NOTE(review): "Signal" is also a window-1 mean, so it mirrors "MACD" and
# "Hist" comes out as ~0 wherever "MACD" is defined - confirm the intended
# window size.
macd_by_topic = topics_over_time.groupby("Topic")["MACD"]
signal_line = macd_by_topic.rolling(1).mean()
topics_over_time["Signal"] = signal_line.reset_index(level=0, drop=True)
topics_over_time["Hist"] = topics_over_time["MACD"].sub(topics_over_time["Signal"])

In [None]:
# Heatmap of the pairwise correlations between the topic id, the frequency and
# two of the derived features.
f, ax = plt.subplots(figsize=(10, 8))
corr = topics_over_time[["Topic", "Frequency", "Diff-1", "MACD"]].corr()
sns.heatmap(
    corr,
    # All-False mask: every cell is drawn. The built-in `bool` is used because
    # the `np.bool` alias was removed in NumPy 1.24.
    mask=np.zeros_like(corr, dtype=bool),
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)
ax.set_title("Correlation between topic features");

In [None]:
# Interactive 3-D scatter of MACD, Frequency and Diff-1, coloured by whether
# the topic's frequency is higher in the following row.
# `px` (plotly.express) is imported in the imports cell at the top.
fig = px.scatter_3d(
    topics_over_time, x="MACD", y="Frequency", z="Diff-1", color="is_growing"
)
fig.show()

In [None]:
# Columns that feed the model: the two categorical keys, the target source
# ("Frequency_Next_Year") and the numeric features.
features_df = topics_over_time[
    [
        "Timestamp",
        "Topic",
        "Frequency_Next_Year",
        "Frequency",
        "Lag-1",
        "Diff-1",
        "Rolling-4",
    ]
]

# One-hot encode "Timestamp" and "Topic"; the remaining columns are passed
# through unchanged and appended after the encoded ones.
# sparse_threshold=0 makes the transformer always return a dense array, which
# avoids the OneHotEncoder `sparse` / `sparse_output` keyword whose name
# differs between scikit-learn versions (`sparse` was removed in 1.4).
ct = ColumnTransformer(
    [
        (
            "one_hot_encoder",
            OneHotEncoder(categories="auto"),
            ["Timestamp", "Topic"],
        )
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

X = ct.fit_transform(features_df)

# Label the encoded columns from the categories the fitted encoder actually
# used. The encoder orders its categories itself (sorted), which need not match
# the order of first appearance returned by `.unique()`, so building the labels
# from `timestamps` / `topics` could attach names to the wrong columns.
year_categories, topic_categories = ct.named_transformers_[
    "one_hot_encoder"
].categories_
encoded_columns = [f"Is_Year_{year}" for year in year_categories] + [
    f"Is_Topic_{topic}" for topic in topic_categories
]
passthrough_columns = [
    "Frequency_Next_Year",
    "Frequency",
    "Lag-1",
    "Diff-1",
    "Rolling-4",
]
one_hot_encoded_df = pd.DataFrame(X, columns=encoded_columns + passthrough_columns)

# Model inputs: everything except the next-period value, which is only used to
# derive the label.
features = [
    column
    for column in one_hot_encoded_df.columns
    if column != "Frequency_Next_Year"
]
one_hot_encoded_df

In [None]:
# Label: True when the next-period frequency exceeds the current one.
# Rows without a next-period value compare against NaN and come out as False.
next_period_change = one_hot_encoded_df["Frequency_Next_Year"].sub(
    one_hot_encoded_df["Frequency"]
)
one_hot_encoded_df["is_growing"] = next_period_change.gt(0)
one_hot_encoded_df

In [None]:
# 10-fold cross-validation of an SVC that predicts "is_growing".
N_SPLITS = 10
RANDOM_SEED = 42  # fixed seed so the shuffle, and therefore the folds, are reproducible

# Rows without a next-period frequency have no real label (the comparison with
# NaN yields False), so they are removed before splitting. This keeps them out
# of both the training and the evaluation data.
labelled_df = one_hot_encoded_df.dropna(subset=["Frequency_Next_Year", "is_growing"])

shuffled_topics_over_time = labelled_df.sample(frac=1, random_state=RANDOM_SEED)
dataset_lenght = len(shuffled_topics_over_time)
dataset_split = int(dataset_lenght / N_SPLITS)
if dataset_split == 0:
    raise ValueError(
        f"Need at least {N_SPLITS} labelled rows for {N_SPLITS}-fold "
        f"cross-validation, got {dataset_lenght}"
    )

# Not updated anywhere in this cell; kept in case later cells read them.
best_r2 = -1
sum_r2 = 0
best_yval = None
best_ypredicted = None

for index in trange(N_SPLITS):
    # The current fold is held out for evaluation and the model is trained on
    # the remaining rows. The last fold runs to the end of the frame so the
    # rows left over by the integer division are evaluated too.
    fold_start = index * dataset_split
    fold_stop = dataset_lenght if index == N_SPLITS - 1 else fold_start + dataset_split
    test_df = shuffled_topics_over_time.iloc[fold_start:fold_stop]
    train_df = pd.concat(
        [
            shuffled_topics_over_time.iloc[:fold_start],
            shuffled_topics_over_time.iloc[fold_stop:],
        ],
        ignore_index=True,
    )

    # Data preparation: fill the NaNs left by the lag / diff / rolling features.
    # The imputer is fitted on the training rows only and then applied to the
    # held-out rows.
    imputer = SimpleImputer()
    x_training = imputer.fit_transform(train_df[features])
    y_training = train_df["is_growing"]
    x_testing = imputer.transform(test_df[features])
    y_testing = test_df["is_growing"]

    clf = make_pipeline(StandardScaler(), SVC(gamma="auto"))
    clf.fit(x_training, y_training)

    y_pred = clf.predict(x_testing)
    print(f"Fold {index + 1}/{N_SPLITS}")
    print(classification_report(y_testing, y_pred))