The goal of the notebook is to create a function to run PCA. The main purpose of this is to reduce the dimensionality of our BERT embeddings from the video animations.

In [1]:
import pandas as pd 
data = pd.read_csv("../Video_Data/embeddings_data.csv")

In [10]:
import numpy as np
from sklearn.decomposition import PCA

def pca_90_percent(X):
    # Standardize the data (mean=0, variance=1)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Initialize PCA and fit to the data
    pca = PCA()
    pca.fit(X_scaled)
    
    # Calculate cumulative explained variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    
    # Find the number of components to retain 90% variance
    num_components = np.argmax(cumulative_variance >= 0.95) + 1
    
    # Fit PCA again with the determined number of components
    pca = PCA(n_components=num_components)
    X_pca = pca.fit_transform(X_scaled)
    
    return X_pca, pca

# Example usage:
# X is your data matrix (e.g., shape (n_samples, n_features))
# X_pca, pca = pca_90_percent(X)


In [11]:
X = data.drop(columns=["game_id", "play_id"]).select_dtypes('number')

In [12]:
X

Unnamed: 0,emb1,emb2,emb3,emb4,emb5,emb6,emb7,emb8,emb9,emb10
0,0.638034,0.028732,0.386127,-0.498500,1.383350,-0.747871,-0.490596,0.729719,-0.308404,0.236588
1,0.209874,-0.080251,0.324387,-0.229344,0.811116,-0.284877,-0.062086,0.502232,-0.038244,-0.056882
2,0.334418,-0.050700,0.340614,-0.304905,0.972251,-0.416566,-0.187807,0.565736,-0.116735,0.026979
3,0.334044,-0.050846,0.340536,-0.304610,0.971612,-0.416081,-0.187429,0.565443,-0.116457,0.026655
4,0.334425,-0.050712,0.340605,-0.304892,0.972238,-0.416564,-0.187825,0.565737,-0.116745,0.026985
...,...,...,...,...,...,...,...,...,...,...
1421,-0.047799,-0.113828,0.303247,-0.104842,0.542509,-0.048374,0.199671,0.408788,0.107977,-0.204012
1422,0.209283,-0.080454,0.324257,-0.228904,0.810218,-0.284181,-0.061533,0.501882,-0.037887,-0.057300
1423,-0.047810,-0.113822,0.303231,-0.104839,0.542502,-0.048364,0.199662,0.408793,0.107955,-0.204005
1424,0.091826,-0.107235,0.309873,-0.159042,0.660973,-0.161517,0.057629,0.443341,0.036148,-0.135646


In [13]:
X_pca, pca = pca_90_percent(X)


In [14]:
X_pca

array([[ 3.70810855],
       [-1.78894314],
       [-0.23932368],
       ...,
       [-4.39710601],
       [-3.2337244 ],
       [-3.23181192]])

In [16]:
data

Unnamed: 0,game_id,play_id,emb1,emb2,emb3,emb4,emb5,emb6,emb7,emb8,emb9,emb10
0,2022090800,56,0.638034,0.028732,0.386127,-0.498500,1.383350,-0.747871,-0.490596,0.729719,-0.308404,0.236588
1,2022090800,80,0.209874,-0.080251,0.324387,-0.229344,0.811116,-0.284877,-0.062086,0.502232,-0.038244,-0.056882
2,2022090800,101,0.334418,-0.050700,0.340614,-0.304905,0.972251,-0.416566,-0.187807,0.565736,-0.116735,0.026979
3,2022090800,122,0.334044,-0.050846,0.340536,-0.304610,0.971612,-0.416081,-0.187429,0.565443,-0.116457,0.026655
4,2022090800,167,0.334425,-0.050712,0.340605,-0.304892,0.972238,-0.416564,-0.187825,0.565737,-0.116745,0.026985
...,...,...,...,...,...,...,...,...,...,...,...,...
1421,2022103012,3671,-0.047799,-0.113828,0.303247,-0.104842,0.542509,-0.048374,0.199671,0.408788,0.107977,-0.204012
1422,2022103012,3693,0.209283,-0.080454,0.324257,-0.228904,0.810218,-0.284181,-0.061533,0.501882,-0.037887,-0.057300
1423,2022103012,3717,-0.047810,-0.113822,0.303231,-0.104839,0.542502,-0.048364,0.199662,0.408793,0.107955,-0.204005
1424,2022103012,3739,0.091826,-0.107235,0.309873,-0.159042,0.660973,-0.161517,0.057629,0.443341,0.036148,-0.135646


In [21]:
np.hstack((data.loc[:,["game_id", "play_id"]].values,X_pca))

array([[ 2.02209080e+09,  5.60000000e+01,  3.70810855e+00],
       [ 2.02209080e+09,  8.00000000e+01, -1.78894314e+00],
       [ 2.02209080e+09,  1.01000000e+02, -2.39323679e-01],
       ...,
       [ 2.02210301e+09,  3.71700000e+03, -4.39710601e+00],
       [ 2.02210301e+09,  3.73900000e+03, -3.23372440e+00],
       [ 2.02210301e+09,  3.76100000e+03, -3.23181192e+00]])

In [22]:
pca_df = pd.DataFrame(np.hstack((data.loc[:,["game_id", "play_id"]].values,X_pca)))


In [25]:
pca_df.columns = ["gameId","playId","pca"]

In [26]:
pca_df

Unnamed: 0,gameId,playId,pca
0,2.022091e+09,56.0,3.708109
1,2.022091e+09,80.0,-1.788943
2,2.022091e+09,101.0,-0.239324
3,2.022091e+09,122.0,-0.245404
4,2.022091e+09,167.0,-0.239448
...,...,...,...
1421,2.022103e+09,3671.0,-4.397029
1422,2.022103e+09,3693.0,-1.797558
1423,2.022103e+09,3717.0,-4.397106
1424,2.022103e+09,3739.0,-3.233724


In [27]:

pca_df.to_csv("pca_df_final.csv")

In [34]:
pca_df

Unnamed: 0,0,1
0,3.735305,0.320971
1,-1.755982,0.082447
2,-0.208947,0.162037
3,-0.215749,0.171472
4,-0.211361,0.192573
...,...,...
1421,-4.413023,-0.539310
1422,-1.817401,-0.416444
1423,-4.414696,-0.518028
1424,-3.253444,-0.459642
