# Table of contents

[Supervised - basic models](./supervised_basic.ipynb)

[Supervised - RNN models](./supervised_rnn.ipynb)

[Unsupervised - Word2Vec](./unsupervised_w2v.ipynb)

[Unsupervised - Dimensionality Reduction](./unsupervised_dim.ipynb)

[Unsupervised - LDA](./unsupervised_LDA.ipynb)

[Performance Evaluation](./evaluation.ipynb)

In [104]:

from evaluation_helper import evaluation_helper
import plotly.express as px
import plotly.graph_objects as go


In [105]:
evaluator = evaluation_helper('./data/cleaned_results.json')

In [106]:
eval_df = evaluator.to_df()
eval_df.loc[eval_df.index[:12], 'supervised'] = True
eval_df.loc[eval_df.index[12:], 'supervised'] = False
eval_df.loc[eval_df.index[7], 'supervised'] = False

eval_df = eval_df.reset_index()
eval_df["time_min"] = eval_df["time"].apply(lambda t: float(t) / 60)

eval_df = eval_df.round(4)

In [107]:
eval_df

Unnamed: 0,index,f1,accuracy,precision,recall,roc_auc,time,supervised,time_min
0,LogisticRegression_tokenized,0.6858,0.6869,0.6904,0.6813,0.687,58.4203,True,0.9737
1,MultinomialNB_tokenized,0.6504,0.6582,0.6678,0.6339,0.6583,28.7777,True,0.4796
2,MLPClassifier_tokenized,0.6906,0.6903,0.6921,0.6891,0.6903,525.8764,True,8.7646
3,LogisticRegression,0.7111,0.7105,0.7116,0.7107,0.7105,36.6508,True,0.6108
4,LogisticRegression_w_gridsearch,0.7113,0.7106,0.7117,0.711,0.7106,35.4168,True,0.5903
5,MultinomialNB,0.6505,0.6583,0.6678,0.6341,0.6583,14.1141,True,0.2352
6,MLPClassifier,0.7182,0.7081,0.6962,0.7415,0.708,3380.7564,True,56.3459
7,KMeans_w2v,0.6317,0.5318,0.5206,0.803,0.5318,7.8126,False,0.1302
8,RandomForestClassifier,0.688,0.6765,0.6664,0.711,0.6764,24.2002,True,0.4033
9,RandomForestClassifier_tokenized,0.6505,0.6583,0.6678,0.6341,0.6583,15.2623,True,0.2544


In [108]:
supervised_df = eval_df[eval_df['supervised'] == True]
supervised_df["tokenized"] = supervised_df["index"].apply(lambda i: True if "tokenized" in i else False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [109]:
supervised_df

Unnamed: 0,index,f1,accuracy,precision,recall,roc_auc,time,supervised,time_min,tokenized
0,LogisticRegression_tokenized,0.6858,0.6869,0.6904,0.6813,0.687,58.4203,True,0.9737,True
1,MultinomialNB_tokenized,0.6504,0.6582,0.6678,0.6339,0.6583,28.7777,True,0.4796,True
2,MLPClassifier_tokenized,0.6906,0.6903,0.6921,0.6891,0.6903,525.8764,True,8.7646,True
3,LogisticRegression,0.7111,0.7105,0.7116,0.7107,0.7105,36.6508,True,0.6108,False
4,LogisticRegression_w_gridsearch,0.7113,0.7106,0.7117,0.711,0.7106,35.4168,True,0.5903,False
5,MultinomialNB,0.6505,0.6583,0.6678,0.6341,0.6583,14.1141,True,0.2352,False
6,MLPClassifier,0.7182,0.7081,0.6962,0.7415,0.708,3380.7564,True,56.3459,False
8,RandomForestClassifier,0.688,0.6765,0.6664,0.711,0.6764,24.2002,True,0.4033,False
9,RandomForestClassifier_tokenized,0.6505,0.6583,0.6678,0.6341,0.6583,15.2623,True,0.2544,True
10,SVC,0.7085,0.7072,0.7075,0.7094,0.7072,46212.4337,True,770.2072,False


In [110]:
import plotly.graph_objects as go

for tokenized in [False, True]:
    
    df = supervised_df[supervised_df["tokenized"] == tokenized]

    fig = go.Figure(
        data=[
            go.Bar(name="roc_auc", x=df["index"], y=df["roc_auc"], yaxis='y', offsetgroup=1, text=df["roc_auc"], textposition="auto"),
            go.Bar(name="accuracy", x=df["index"], y=df["accuracy"], yaxis='y', offsetgroup=2, text=df["accuracy"], textposition="auto"),
            go.Bar(name="time_min", x=df["index"], y=df["time_min"], yaxis='y2', offsetgroup=3, text=df["time_min"], textposition="auto"),
        ],
        layout={
            'yaxis': {'title': 'Score'},
            'yaxis2': {'title': 'Time (min)', 'overlaying': 'y', 'side': 'right'}
        }
    )

    # Change the bar mode
    fig.update_layout(barmode='group')
    fig.show()

In [111]:
fig = px.scatter(
    eval_df,
    x="time_min",
    y="roc_auc",
    log_x=True,
    color="supervised",
    text='index'
)

fig.update_traces(textposition="bottom right")
fig.show()

In [112]:
fig = px.scatter(
    eval_df,
    x="time",
    y="accuracy",
    log_x=True,
    color="supervised",
    text='index'
)

fig.update_traces(textposition="bottom right")
fig.show()