In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def apply_pca2(merged_df, column_list, n_components, new_columns):
    '''Standardise the given columns and add their PCA projection to the frame.

    The frame is modified in place: the scaled values are written to
    ``new_columns`` and the projection to ``pca_component1``,
    ``pca_component2``, ... (one column per component returned by PCA).

    Parameters
    ----------
    merged_df : DataFrame holding the input columns; mutated and returned.
    column_list : names of the columns to scale and reduce.
    n_components : forwarded unchanged to ``PCA(n_components=...)``.
    new_columns : names under which the scaled values are stored. Values are
        assigned positionally, so the order must match ``column_list``.

    Returns
    -------
    (pca, merged_df) : the fitted PCA estimator and the same frame object
        that was passed in.

    Raises
    ------
    ValueError : if ``column_list`` and ``new_columns`` differ in length.
    '''
    if len(column_list) != len(new_columns):
        raise ValueError(
            f"column_list has {len(column_list)} entries but new_columns has "
            f"{len(new_columns)}; they must pair up one-to-one"
        )

    # scaler
    scaler = StandardScaler()
    merged_df[new_columns] = scaler.fit_transform(merged_df[column_list])

    # pca
    pca = PCA(n_components=n_components)
    components = np.asarray(pca.fit_transform(merged_df[new_columns]))

    # One output column per component actually produced, so the function is
    # not limited to n_components == 2.
    for idx in range(components.shape[1]):
        merged_df[f"pca_component{idx + 1}"] = components[:, idx]
    return pca, merged_df

def apply_umap2(merged_df, column_list, n_components, new_columns):
    '''Standardise the given columns and add their UMAP embedding to the frame.

    The frame is modified in place: the scaled values are written to
    ``new_columns`` and the embedding to ``umap_component1``,
    ``umap_component2``, ... (one column per embedding dimension).

    Parameters
    ----------
    merged_df : DataFrame holding the input columns; mutated and returned.
    column_list : names of the columns to scale and reduce.
    n_components : forwarded unchanged to ``umap.UMAP(n_components=...)``.
    new_columns : names under which the scaled values are stored. Values are
        assigned positionally, so the order must match ``column_list``.

    Returns
    -------
    (reducer, merged_df) : the fitted ``umap.UMAP`` instance and the same
        frame object that was passed in.

    Raises
    ------
    ValueError : if ``column_list`` and ``new_columns`` differ in length.
    '''
    if len(column_list) != len(new_columns):
        raise ValueError(
            f"column_list has {len(column_list)} entries but new_columns has "
            f"{len(new_columns)}; they must pair up one-to-one"
        )

    # scaler
    scaler = StandardScaler()
    merged_df[new_columns] = scaler.fit_transform(merged_df[column_list])

    # umap (random_state is fixed at 42 so repeated runs use the same seed)
    reducer = umap.UMAP(n_components=n_components, random_state=42)
    embedding = np.asarray(reducer.fit_transform(merged_df[new_columns]))

    # One output column per embedding dimension, so the function is not
    # limited to n_components == 2.
    for idx in range(embedding.shape[1]):
        merged_df[f"umap_component{idx + 1}"] = embedding[:, idx]

    # Return the fitted reducer itself, not the imported `umap` module.
    return reducer, merged_df

In [2]:
# Read the two tab-delimited stance tables (files named for June and December).
june_df = pd.read_csv(
    "../output_network/stance_df_june.csv",
    delimiter="\t",
)
dec_df = pd.read_csv(
    "../output_network/stance_df_december.csv",
    delimiter="\t",
)

In [4]:
# Summarise june_df: number of rows, column names, non-null counts and dtypes.
june_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2514 entries, 0 to 2513
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               2514 non-null   int64  
 1   user                     2514 non-null   object 
 2   A_x                      2514 non-null   float64
 3   F_x                      2514 non-null   float64
 4   N_x                      2514 non-null   float64
 5   stance_abortion          2514 non-null   float64
 6   A_y                      2514 non-null   float64
 7   F_y                      2514 non-null   float64
 8   N_y                      2514 non-null   float64
 9   stance_marriage          2514 non-null   float64
 10  A                        2514 non-null   float64
 11  F                        2514 non-null   float64
 12  N                        2514 non-null   float64
 13  stance_political         2514 non-null   float64
 14  stance_political_scaled 

In [4]:
# Pairwise correlation matrix of the three stance scores (DataFrame.corr defaults).
june_df.loc[
    :,
    ["stance_political", "stance_abortion", "stance_marriage"],
].corr()

Unnamed: 0,stance_political,stance_abortion,stance_marriage
stance_political,1.0,0.081584,0.049109
stance_abortion,0.081584,1.0,0.15247
stance_marriage,0.049109,0.15247,1.0


In [6]:
# Scale the three stance scores and project them onto two principal components.
# apply_pca2 assigns the scaled values to the new columns positionally, so the
# "*_scaled" names are listed in the same order as the input columns; otherwise
# e.g. "stance_political_scaled" would receive the scaled abortion scores.
# Note: apply_pca2 mutates june_df in place, so new_j_df is the same object.
pca, new_j_df = apply_pca2(
    june_df,
    ["stance_abortion", "stance_marriage", "stance_political"],
    2,
    ["stance_abortion_scaled", "stance_marriage_scaled", "stance_political_scaled"],
)

In [None]:
# Fraction of the total variance explained by the 1st and 2nd principal component, respectively
pca.explained_variance_ratio_

array([0.39877038, 0.32008285])

In [18]:
# Scale the three stance scores and embed them in two dimensions with UMAP.
# This must call apply_umap2 (not apply_pca2): the trustworthiness check that
# follows reads the "umap_component1"/"umap_component2" columns it creates.
# The "*_scaled" names follow the order of the input columns because the
# scaled values are assigned positionally.
# Note: apply_umap2 mutates june_df in place, so new_umap_df is the same object.
umap_obj, new_umap_df = apply_umap2(
    june_df,
    ["stance_abortion", "stance_marriage", "stance_political"],
    2,
    ["stance_abortion_scaled", "stance_marriage_scaled", "stance_political_scaled"],
)

In [19]:
# NOTE(review): this import would normally live in the imports cell at the top.
from sklearn.manifold import trustworthiness

# First argument: the three scaled stance columns (the original, 3-D space).
# Second argument: the two UMAP columns (the reduced, 2-D space).
# The score is computed over each point's 10 nearest neighbours.
trust = trustworthiness(
    new_umap_df.loc[
        :,
        ["stance_political_scaled", "stance_abortion_scaled", "stance_marriage_scaled"],
    ],
    new_umap_df.loc[:, ["umap_component1", "umap_component2"]],
    n_neighbors=10,
)
print(f"Trustworthiness: {trust:.3f}")

Trustworthiness: 0.995


In [17]:
# Read the re-annotated test and train splits (comma-delimited).
anon_test = pd.read_csv(
    "../annotated/re_annotated_test_data.csv",
    delimiter=',',
)
anon_train = pd.read_csv(
    "../annotated/re_annotated_train_data.csv",
    delimiter=',',
)


In [21]:
# Number of rows per annotated stance label in the test split.
anon_test.loc[:, "annotated_stance"].value_counts()

annotated_stance
neutral    34
against    34
for        32
Name: count, dtype: int64

In [22]:
# Number of rows per annotated stance label in the train split.
anon_train.loc[:, "annotated_stance"].value_counts()

annotated_stance
for        40
against    30
neutral    30
Name: count, dtype: int64