### Imports

In [1]:
import os

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer



### Load JSON

In [2]:
# Location of the JSON dataset. The default is the original absolute path on
# the author's machine; set the AI2PAPERS_DATA environment variable to load the
# file from somewhere else without editing this cell.
DATA_PATH = os.environ.get(
    "AI2PAPERS_DATA",
    r"C:\Users\hdmqu\Documents\GitHub\AI2Papers\data\data_30-01-2024_mr_2000_edited.json",
)
df = pd.read_json(DATA_PATH)

### Explore data

In [3]:
# Quick look at the dataset: row count first, then the leading records.
n_entries = len(df)
print(f"Number of entries: {n_entries}")

print("DataFrame Structure:", df.head(), sep="\n")

Number of entries: 400
DataFrame Structure:
                                               title  \
0  Two Stones Hit One Bird: Bilevel Positional En...   
1   Learning to Manipulate under Limited Information   
2  Scaling Sparse Fine-Tuning to Large Language M...   
3  A Survey on Visual Anomaly Detection: Challeng...   
4  Zero-shot Imitation Policy via Search in Demon...   

                                             authors  \
0  Zhenyu He, Guhao Feng, Shengjie Luo, Kai Yang,...   
1  Wesley H. Holliday, Alexander Kristoffersen, E...   
2  Alan Ansell, Ivan Vulić, Hannah Sterz, Anna Ko...   
3  Yunkang Cao, Xiaohao Xu, Jiangning Zhang, Yuqi...   
4  Federco Malato, Florian Leopold, Andrew Melnik...   

                                            abstract  \
0    In this work, we leverage the intrinsic segm...   
1    By classic results in social choice theory, ...   
2    Large Language Models (LLMs) are difficult t...   
3    Visual Anomaly Detection (VAD) endeavors to ...   
4 

In [4]:
# Number of null entries in each column.
missing_values = df.isna().sum()

print("Missing Values:", missing_values, sep="\n")

Missing Values:
title         0
authors       0
abstract      0
link          0
label       245
dtype: int64


In [5]:
# Build raw term-count matrices for abstracts and titles.
# Each text field gets its own vectorizer: refitting a single instance on the
# titles would replace the vocabulary learned from the abstracts, leaving no
# way to map abstract columns back to terms.
ab_count_vec = CountVectorizer()
ti_count_vec = CountVectorizer()

ab_counts = ab_count_vec.fit_transform(df["abstract"])
ti_counts = ti_count_vec.fit_transform(df["title"])

# Backward-compatible name: `count_vec` previously ended up fitted on titles.
count_vec = ti_count_vec

In [6]:
# Turn the raw counts into term-frequency vectors. With use_idf=False no
# inverse-document-frequency weighting is applied.
ab_transformer = TfidfTransformer(use_idf=False)
ab_tf = ab_transformer.fit_transform(ab_counts)

ti_transformer = TfidfTransformer(use_idf=False)
ti_tf = ti_transformer.fit_transform(ti_counts)


#### 2-Component PCA

In [7]:
# Project the term-frequency vectors onto their first two principal components.
# random_state is pinned because scikit-learn's default svd_solver="auto" may
# choose the randomized solver for wide matrices with few requested components,
# which would make the coordinates differ between runs.
pca = PCA(n_components=2, random_state=42)
pca_ab = pca.fit_transform(ab_tf.toarray())  # densify the sparse matrix first
pca_ti = pca.fit_transform(ti_tf.toarray())  # refit: `pca` now describes the titles

# Add PCA components to DataFrame
df["PCA1_abstract"] = pca_ab[:, 0]
df["PCA2_abstract"] = pca_ab[:, 1]
df["PCA1_title"] = pca_ti[:, 0]
df["PCA2_title"] = pca_ti[:, 1]

#### Visualization

In [8]:
# Keep only the rows that carry a label; the plots below colour by it.
has_label = df["label"].notna()
df_filtered = df[has_label]

In [9]:
# Abstract projection: first vs. second principal component, coloured by label.
abstract_scatter_args = dict(
    x="PCA1_abstract",
    y="PCA2_abstract",
    color="label",
    hover_data=["label"],
    title="PCA of TF-IDF Vectors of Abstract",
)
fig = px.scatter(df_filtered, **abstract_scatter_args)
# Hide the legend entries and the colour bar.
fig.update_traces(legendgroup="", showlegend=False).update_coloraxes(showscale=False)
fig.show()


In [10]:
# Title projection: first vs. second principal component, coloured by label.
title_scatter_args = dict(
    x="PCA1_title",
    y="PCA2_title",
    color="label",
    hover_data=["label"],
    title="PCA of TF-IDF Vectors of Title",
)
fig = px.scatter(df_filtered, **title_scatter_args)
# Hide the legend entries and the colour bar.
fig.update_traces(legendgroup="", showlegend=False).update_coloraxes(showscale=False)
fig.show()


#### Scatter matrix

In [11]:
# Columns for the pairwise plot: both title components, both abstract
# components, then the label used for colouring.
col = [
    f"PCA{component}_{field}"
    for field in ("title", "abstract")
    for component in (1, 2)
] + ["label"]

fig = px.scatter_matrix(df_filtered[col], color="label", opacity=0.7)
fig.show()


iteritems is deprecated and will be removed in a future version. Use .items instead.



#### 3D scatter plot

In [12]:
# 3D view of three of the PCA columns, coloured by label, with the colour bar hidden.
fig = px.scatter_3d(
    df_filtered,
    x="PCA2_title",
    y="PCA2_abstract",
    z="PCA1_title",
    color="label",
    opacity=0.7,
).update_coloraxes(showscale=False)
fig.show()

#### Correlation Heatmap

In [19]:
# Pairwise correlations between the PCA components and the label.
correlation_matrix = df_filtered[col].corr()
print(correlation_matrix)

# Mask the upper triangular part (including the diagonal): the matrix is
# symmetric and the diagonal is always 1, so those cells carry no information.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
# Masked cells become NaN, which the heatmap renders as gaps.
masked_correlation = correlation_matrix.mask(mask)

# Create a heatmap using plotly
fig = go.Figure(data=go.Heatmap(z=masked_correlation.values,
                                x=correlation_matrix.columns,
                                y=correlation_matrix.columns,
                                colorscale="balance",
                                zmid=0,  # centre the diverging scale on zero correlation
                                hoverongaps=False,  # no hover labels on masked cells
                                colorbar=dict(title="Correlation")))

# Update layout for better visibility
fig.update_layout(title="Correlation Heatmap",
                  width=800, height=600,
                  xaxis_title="Features",
                  yaxis_title="Features")

# Show the plot
fig.show()

               PCA1_title  PCA2_title  PCA1_abstract  PCA2_abstract     label
PCA1_title       1.000000   -0.089965      -0.055393       0.120406 -0.007418
PCA2_title      -0.089965    1.000000       0.018919      -0.119735 -0.019771
PCA1_abstract   -0.055393    0.018919       1.000000       0.037093 -0.010224
PCA2_abstract    0.120406   -0.119735       0.037093       1.000000 -0.190512
label           -0.007418   -0.019771      -0.010224      -0.190512  1.000000


#### 3-Component PCA

In [None]:
# Project the term-frequency vectors onto their first three principal components.
# random_state is pinned because scikit-learn's default svd_solver="auto" may
# choose the randomized solver for wide matrices with few requested components,
# which would make the coordinates differ between runs.
pca = PCA(n_components=3, random_state=42)
pca_ab = pca.fit_transform(ab_tf.toarray())  # densify the sparse matrix first
pca_ti = pca.fit_transform(ti_tf.toarray())  # refit: `pca` now describes the titles

# Add PCA components to DataFrame (overwrites the two-component columns)
df["PCA1_abstract"] = pca_ab[:, 0]
df["PCA2_abstract"] = pca_ab[:, 1]
df["PCA3_abstract"] = pca_ab[:, 2]
df["PCA1_title"] = pca_ti[:, 0]
df["PCA2_title"] = pca_ti[:, 1]
# Third title component comes from the title projection, not the abstract one.
df["PCA3_title"] = pca_ti[:, 2]

In [None]:
# Re-derive the labelled subset before plotting: `df_filtered` was created
# before the three-component columns were written to `df`, so it has no PCA3_*
# columns (selecting them raises KeyError) and its PCA1/PCA2 values still come
# from the two-component fit.
df_filtered = df[df["label"].notna()]

col = ["PCA1_title", "PCA2_title", "PCA3_title", "PCA1_abstract", "PCA2_abstract", "PCA3_abstract", "label"]

fig = px.scatter_matrix(df_filtered[col], color="label", opacity=0.7)

fig.show()


iteritems is deprecated and will be removed in a future version. Use .items instead.

