## Create a dataset from CSV to pickle

In [25]:
import csv
import json
import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer,util
import time

model = SentenceTransformer('all-MiniLM-L6-v2')
start_time = time.time()
for i in range(229):
    df = pd.read_csv('data/Episode_%d.csv'% i,delimiter=',')
    # New column name
    new_column_name = 'Embedding'

    # Initialize an empty list to store the array values
    new_column_values = []

    # Loop through each row
    for index in range(len(df)):
        array_value = model.encode(df.iloc[index][2])
        # Append the array value to the list
        new_column_values.append(array_value)
        # Assign the new column values to the DataFrame
    df[new_column_name] = new_column_values
    df = df.drop(df.columns[0], axis=1)
    #save the dataset 
    df.to_pickle('data_pickle/Episode_%d.pkl'% i)
    #pickle.dump( df, open( "data_pickle/Episode_%d.p"% i, "wb" ) )

end_time = time.time()
execution_time = end_time - start_time

print("Execution time:", execution_time, "seconds")
# Execution time: 1634.8973593711853 seconds

Execution time: 1634.8973593711853 seconds


## Create a dataset with 20 PCA

In [9]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import time
import sklearn

nb_components = 20

start_time = time.time()
for i in range(229):
    df = pd.read_pickle('data_pickle/Episode_%d.pkl'% i)
    X = np.stack(df['Embedding'])

    x_scaled = sklearn.preprocessing.StandardScaler().fit_transform(X)

    pca = PCA(n_components=nb_components)
    pca_features = pca.fit_transform(x_scaled)

    pca_df = pd.DataFrame(
    data=pca_features, 
    columns=[f'PCA{i}' for i in range(1, nb_components+1)])  

    pca_df['Person'] = df['Person']
    pca_df['Said'] = df['Said']

    #save the dataset 
    pca_df.to_pickle('data_pickle/Episode_%d_pca.pkl'% i)
    

end_time = time.time()
execution_time = end_time - start_time

print("Execution time:", execution_time, "seconds")
# Execution time: 24.69307231903076 seconds

Execution time: 24.69307231903076 seconds


In [16]:
pcadf = pd.read_pickle('data_pickle/Episode_2_pca.pkl')
print(pcadf[pcadf['Person'].str.contains('Ross')])

         PCA1      PCA2       PCA3      PCA4      PCA5      PCA6      PCA7  \
0   -3.214021 -6.583893   0.927764 -1.394294 -4.950457  0.928006 -1.871326   
1    0.816358 -5.890586   2.039022 -0.332602 -3.103422 -2.348556 -0.896433   
57  -1.692426 -0.738958  -1.436221 -0.134397  1.472818 -1.834631  5.916831   
59  -4.923463 -1.003422  -0.110871  0.011539 -1.324152  2.799000  0.402521   
65   7.038074 -2.704910  -2.493699  4.556439 -1.745871  6.694041  3.227133   
66   7.038074 -2.704909  -2.493701  4.556438 -1.745872  6.694040  3.227133   
67   7.038074 -2.704909  -2.493701  4.556438 -1.745873  6.694040  3.227133   
70   4.680836  0.641407   0.483737  3.377570  3.899576  0.576717  1.989116   
71   3.525734  3.399807   1.257220  1.567044  6.583753 -3.226231 -0.359171   
73  -4.334968  1.433003  -2.805278 -4.844103 -1.050501  2.538172  3.647447   
74  -3.891913  1.141731  -6.259309 -5.523299 -4.274821  1.413282  7.283161   
75  -4.465801  0.571299  -5.637900 -6.220068 -5.472583  0.672858

## Create a dataset with 20 Clusters

In [43]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import time
import sklearn

num_clusters=20

start_time = time.time()
for i in range(229):
    df = pd.read_pickle('data_pickle/Episode_%d_pca.pkl'% i)

    X = df.loc[:,'PCA1':'PCA20'] #select all the PCA

    
    #Initialize the class object
    kmeans = KMeans(
        n_clusters=num_clusters, 
        init='k-means++', 
        max_iter=100, 
        n_init=25)

     #predict the labels of clusters.
    label = kmeans.fit_predict(X)
    # adding a cluster column to the dataset
    df['Cluster']= label
    #save the dataset 
    df.to_pickle('data_pickle/Episode_%d_pca_clusters.pkl'% i)
    

end_time = time.time()
execution_time = end_time - start_time

print("Execution time:", execution_time, "seconds")
#Execution time: 106.9984381198883 seconds

Execution time: 106.9984381198883 seconds


In [46]:
## All sentences dataset

In [51]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import time
import sklearn

nb_components = 20

start_time = time.time()
for i in range(1):
    df = pd.read_pickle('All_Sentences.pkl')
    X = np.stack(df['Embedding'])

    x_scaled = sklearn.preprocessing.StandardScaler().fit_transform(X)

    pca = PCA(n_components=nb_components)
    pca_features = pca.fit_transform(x_scaled)

    pca_df = pd.DataFrame(
    data=pca_features, 
    columns=[f'PCA{i}' for i in range(1, nb_components+1)])  

    pca_df['Person'] = df['Person']
    pca_df['Said'] = df['Said']

    #save the dataset 
    pca_df.to_pickle('data_pickle/all_sentences_pca.pkl')
    

end_time = time.time()
execution_time = end_time - start_time

print("Execution time:", execution_time, "seconds")


Execution time: 3.811974287033081 seconds
