In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from tqdm.notebook import trange
from sklearn.metrics import confusion_matrix
import plotly.figure_factory as ff
from sklearn.metrics import roc_curve, auc


In [None]:
# Parameter selection
# Seed handed as `random_state` to the sampling / resampling calls further down.
# None leaves those calls unseeded.
random_state_const = None

# For reproducing results: this assignment overrides the None above.
# Comment it out to get a different random draw on every run.
random_state_const = 777

# Imputer model
# k - KNNImputer
# any other value (e.g. 'simple') - SimpleImputer
imputer_choice = 'k'

# Classification model
# svc - SVC in a pipeline with a StandardScaler
# gb - GradientBoostingClassifier
# any other value (e.g. 'ada') - AdaBoostClassifier
classification_choice = 'gb'

# Data preprocessing to assist with the imbalance between topics which are rising or not
# undersample - sample rows so that both classes contribute the same number of rows
# oversample - resample the training data with SMOTE
undersample = False
oversample = False # To improve the accuracy of the 'increasing' category

In [None]:
# Uses saved DTM output, read from a path relative to the working directory.
# Later cells rely on its 'Topic', 'Timestamp' and 'Frequency' columns.
# NOTE(review): the original note says the file lives on Drive - confirm the
# working directory is set up accordingly before running.
topics_over_time = pd.read_csv('output/DTM/DTM_exp_local_custom_use_custom_10_(1, 2)_800.csv')

In [None]:
# Distinct values of the 'Topic' column; the cell output is how many there are.
topics = topics_over_time['Topic'].unique()
len(topics)

In [None]:
# Distinct values of the 'Timestamp' column; the cell output is how many there are.
timestamps = topics_over_time['Timestamp'].unique()
len(timestamps)

In [None]:
# Normalisation methods available: None | mean | min-max
# 'mean'    - subtract the mean of 'Frequency' and divide by its standard deviation
# 'min-max' - rescale 'Frequency' by its minimum and maximum
# None      - leave 'Frequency' untouched
# The statistics are taken over the whole frame, not per topic.
normalisation_method = None

In [None]:
# Order the rows by topic and then by time so that the per-topic shift/rolling
# features computed later follow chronological order.
topics_over_time = topics_over_time.sort_values(by=['Topic', 'Timestamp'])

# Optionally rescale 'Frequency' using statistics of the whole column.
frequency = topics_over_time['Frequency']
if normalisation_method == 'mean':
    # Standardise: zero mean, unit standard deviation.
    topics_over_time['Frequency'] = (frequency - frequency.mean()) / frequency.std()
elif normalisation_method == 'min-max':
    # Rescale by the observed minimum and maximum.
    lowest = frequency.min()
    highest = frequency.max()
    topics_over_time['Frequency'] = (frequency - lowest) / (highest - lowest)

In [None]:
# Per-topic time-series features derived from 'Frequency'.
frequency_by_topic = topics_over_time.groupby('Topic')['Frequency']

# Target value (next row of the same topic) and simple lag / difference features.
topics_over_time['Frequency_Next_Year'] = frequency_by_topic.shift(-1)
topics_over_time['Lag-1'] = frequency_by_topic.shift(1)
topics_over_time['Diff-1'] = frequency_by_topic.diff(1)

# Rolling means; the group level is dropped so the result aligns with the
# frame's own index again.
for column_name, window in (('Rolling-8', 8), ('Rolling-4', 4)):
    rolling_mean = frequency_by_topic.rolling(window).mean()
    topics_over_time[column_name] = rolling_mean.reset_index(level=0, drop=True)

# Binary label: does the frequency go up in the next row of the same topic?
# Note: where 'Frequency_Next_Year' is NaN (last row of each topic) the
# comparison evaluates to False rather than to a missing value.
frequency_change = topics_over_time['Frequency_Next_Year'].sub(topics_over_time['Frequency'])
topics_over_time['is_growing'] = frequency_change.gt(0)

In [None]:
# Short rolling mean of the raw frequency, per topic.
frequency_by_topic = topics_over_time.groupby('Topic')['Frequency']
topics_over_time['Rolling-2'] = frequency_by_topic.rolling(2).mean().reset_index(level=0, drop=True)

# MACD-style indicator, computed here as the 8-period mean minus the 4-period mean.
# NOTE(review): the conventional MACD is fast minus slow, i.e. the opposite
# sign - confirm that this orientation is intended.
topics_over_time['MACD'] = topics_over_time['Rolling-8'].sub(topics_over_time['Rolling-4'])

# Signal line: 2-period rolling mean of MACD within each topic.
macd_by_topic = topics_over_time.groupby('Topic')['MACD']
topics_over_time['Signal'] = macd_by_topic.rolling(2).mean().reset_index(level=0, drop=True)

# Histogram: distance between MACD and its signal line.
topics_over_time['Hist'] = topics_over_time['MACD'].sub(topics_over_time['Signal'])

In [None]:
# Class balance: shape of the growing rows first, then of the non-growing rows.
for growing_flag in (True, False):
    rows_in_class = topics_over_time[topics_over_time['is_growing'] == growing_flag]
    print(rows_in_class.shape)

In [None]:
# Display only the rows for which a MACD value could be computed.
has_macd = topics_over_time['MACD'].notna()
topics_over_time.loc[has_macd]

In [None]:
# Annotated correlation heatmap of the candidate features and the
# next-period frequency.
f, ax = plt.subplots(figsize=(10, 8))
corr = topics_over_time[['Topic','Frequency','Diff-1','MACD','Hist','Frequency_Next_Year']].corr()
# The all-False mask hides no cell of the matrix. Its dtype is spelled with the
# builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and removed
# in 1.24, where `np.bool` raises AttributeError.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax, annot=True)

In [None]:
# Clustered correlation map of the engineered indicator columns.
indicator_columns = ['Diff-1', 'Lag-1', 'Rolling-4', 'Rolling-8', 'MACD', 'Signal', 'Hist']
indicator_corr = topics_over_time[indicator_columns].corr()
sns.clustermap(indicator_corr)

In [None]:
# Clustered, annotated correlation map of the candidate model inputs together
# with the next-period frequency.
candidate_columns = ['Topic', 'Frequency', 'Diff-1', 'MACD', 'Hist', 'Frequency_Next_Year', 'Lag-1']
candidate_corr = topics_over_time[candidate_columns].corr()
sns.clustermap(candidate_corr, annot=True)

In [None]:
# Columns passed to the imputer and then to the classifier as model inputs.
# NOTE(review): 'Topic' is included as a plain numeric feature - confirm that
# feeding the topic id to the model is intended.
features = ['Topic','Frequency','Diff-1','MACD','Hist','Lag-1']

In [None]:
# 3-D view of three engineered features: colour encodes the topic, marker
# symbol the growing / not-growing label and marker size the frequency.
scatter_encoding = dict(
    x='Lag-1',
    y='Diff-1',
    z='MACD',
    color='Topic',
    symbol='is_growing',
    size='Frequency',
)
fig = px.scatter_3d(topics_over_time, **scatter_encoding)
fig.show()

In [None]:
def plot_confusion_matrix(mat):
    """Show a two-class confusion matrix as an annotated Plotly heatmap.

    Args:
        mat: 2x2 matrix of counts. Columns are labelled as the predicted class
            and rows as the real class, each in the order
            'Decreasing topic', 'Increasing topic'.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    class_names = ['Decreasing topic', 'Increasing topic']

    # The heatmap annotations have to be strings, one per cell.
    cell_text = [[str(count) for count in row] for row in mat]

    fig = ff.create_annotated_heatmap(
        mat,
        x=list(class_names),
        y=list(class_names),
        annotation_text=cell_text,
        colorscale='Viridis',
    )

    # Axis captions are drawn as annotations positioned in paper coordinates:
    # first the x-axis caption below the plot, then the rotated y-axis caption.
    axis_captions = (
        dict(x=0.5, y=-0.15, text="Predicted value"),
        dict(x=-0.35, y=0.5, text="Real value", textangle=-90),
    )
    for caption in axis_captions:
        fig.add_annotation(dict(font=dict(color="black", size=14),
                                showarrow=False,
                                xref="paper",
                                yref="paper",
                                **caption))

    # Title, plus a wider left margin so the y-axis caption fits.
    fig.update_layout(title_text='<i><b>Confusion matrix</b></i>',
                      margin=dict(t=50, l=200))

    # Show the colour bar next to the heatmap.
    fig.data[0].showscale = True
    fig.show()


In [None]:
def plot_roc(fpr, tpr):
    """Display a filled ROC curve with the area under the curve in the title.

    Args:
        fpr: False positive rates, one per threshold.
        tpr: True positive rates matching ``fpr``.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    area_under_curve = auc(fpr, tpr)
    axis_labels = {'x': 'False Positive Rate', 'y': 'True Positive Rate'}

    fig = px.area(
        x=fpr,
        y=tpr,
        title=f'ROC Curve (AUC={area_under_curve:.4f})',
        labels=axis_labels,
        width=700,
        height=500,
    )

    # Dashed diagonal from (0, 0) to (1, 1) as a reference line.
    fig.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)

    # Keep both axes on the same scale so the plot stays square.
    fig.update_xaxes(constrain='domain')
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.show()


In [None]:
# Repeated hold-out evaluation of the selected classifier.
#
# Each of the `n_splits` iterations draws its own 90/10 train/test split, fits
# the imputer (plus optional SMOTE) and the classifier on the training part and
# predicts the held-out part. The predictions of all iterations are pooled, so
# the report, confusion matrix and ROC curve at the end summarise every split
# rather than only the last one.
n_splits = 10

# Seed the shuffle as well: the seeded `.sample(...)` calls below pick rows by
# position, so an unseeded shuffle would still change the outcome of every run.
shuffled_topics_over_time = topics_over_time.sample(frac=1, random_state=random_state_const)

# Rows without 'Frequency_Next_Year' (the last timestamp of each topic) carry
# no real label: their 'is_growing' is only the False that a comparison with
# NaN produces. Remove them before splitting so they are neither trained on
# nor counted as "decreasing" in the test metrics.
labelled_df = shuffled_topics_over_time.dropna(subset=['Frequency_Next_Year', 'is_growing'])
growing_mask = labelled_df['is_growing'].astype(bool)

pooled_true, pooled_pred, pooled_score = [], [], []

cfs_matrix, cls_report, fpr, tpr = None, None, None, None
for index in trange(n_splits):

    # One seed per split. Passing `random_state_const` unchanged would make
    # every iteration draw exactly the same split.
    split_seed = None if random_state_const is None else random_state_const + index

    if undersample:
        true_class_df = labelled_df[growing_mask]
        false_class_df = labelled_df[~growing_mask]

        # Balance on the smaller class, whichever of the two it is.
        one_class_size = min(len(true_class_df), len(false_class_df))
        true_class_df = true_class_df.sample(n=one_class_size, random_state=split_seed)
        false_class_df = false_class_df.sample(n=one_class_size, random_state=split_seed)

        # Split each class 90/10 separately so both parts stay balanced.
        train_true_df = true_class_df.sample(frac=0.9, random_state=split_seed)
        test_true_df = true_class_df.drop(train_true_df.index)
        train_false_df = false_class_df.sample(frac=0.9, random_state=split_seed)
        test_false_df = false_class_df.drop(train_false_df.index)

        train_df = pd.concat([train_true_df, train_false_df], ignore_index=True)
        test_df = pd.concat([test_true_df, test_false_df], ignore_index=True)

    else:
        train_df = labelled_df.sample(frac=0.9, random_state=split_seed)
        test_df = labelled_df.drop(train_df.index)

    # Data preparation: the imputer is fitted on the training part only and
    # then applied to the held-out part.
    if imputer_choice == 'k':
        imputer = KNNImputer()
    else:
        imputer = SimpleImputer()

    x_training = imputer.fit_transform(train_df[features])
    y_training = train_df['is_growing']
    x_testing = imputer.transform(test_df[features])
    y_testing = test_df['is_growing']

    if oversample:
        # `n_jobs` is no longer passed: imbalanced-learn deprecated it for
        # SMOTE in 0.10 and later releases drop the parameter.
        sm = SMOTE(random_state=split_seed)
        x_training, y_training = sm.fit_resample(x_training, y_training)

    # Every classifier receives the split seed so a fixed `random_state_const`
    # reproduces the fitted models as well as the splits.
    if classification_choice == 'svc':
        clf = make_pipeline(StandardScaler(),
                            SVC(gamma='auto', probability=True, random_state=split_seed))
    elif classification_choice == 'gb':
        clf = GradientBoostingClassifier(n_estimators=1000, random_state=split_seed)
    else:
        clf = AdaBoostClassifier(random_state=split_seed)

    clf.fit(x_training, y_training)
    y_pred = clf.predict(x_testing)
    # Column 1 of predict_proba is the probability of the True ("growing") class.
    y_score = clf.predict_proba(x_testing)[:, 1]

    pooled_true.append(np.asarray(y_testing, dtype=bool))
    pooled_pred.append(np.asarray(y_pred, dtype=bool))
    pooled_score.append(np.asarray(y_score, dtype=float))

y_true_all = np.concatenate(pooled_true)
y_pred_all = np.concatenate(pooled_pred)
y_score_all = np.concatenate(pooled_score)

# Fixing the label order keeps the matrix 2x2 (decreasing, increasing) even if
# one class never occurs in the pooled test rows.
cfs_matrix = confusion_matrix(y_true_all, y_pred_all, labels=[False, True])
cls_report = classification_report(y_true_all, y_pred_all)
fpr, tpr, thresholds = roc_curve(y_true_all, y_score_all)

print(cls_report)
plot_confusion_matrix(cfs_matrix)
plot_roc(fpr, tpr)
