In [14]:
pip install deepmultilingualpunctuation



In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import os
import re
import pandas as pd
import nltk
import ssl
import torch
import spacy
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from deepmultilingualpunctuation import PunctuationModel
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from scipy.linalg import triu

In [17]:
modelpunct = PunctuationModel()
# Load pre-trained BERT model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')



In [18]:
# Load SpaCy model for NLP tasks
nlp = spacy.load('en_core_web_sm')

# Function to group consecutive words into sentences
def group_words_into_sentences(df):
    sentences = ''
    previous_word = None

    for _, row in df.iterrows():
        word = row['word']
        if word != previous_word:
            sentences += " " +word
        previous_word = word

    return sentences.strip()

# Function to chunk text for the punctuation pipeline
def chunk_text(text, chunk_size=512):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])

# Function to punctuate chunks
def punctuate_chunks(chunks):
    punctuated_chunks = []
    for chunk in chunks:
        punctuated_text = modelpunct.restore_punctuation(chunk)
        punctuated_chunks.append(punctuated_text)
    return punctuated_chunks

def combine_chunks(chunks):
    return ' '.join(chunks)

# Function to extract NLP features using SpaCy
def extract_nlp_features(text):
    doc = nlp(text)
    features = []
    for token in doc:
        features.append({
            'word': token.text,
            'lemma': token.lemma_,
            'tag': token.tag_,
            'dep': token.dep_,
        })
    return features


In [29]:
# Function to remove records with transcripts '000' or '999'
def filter_special_cases(df):
    return df[~df['word'].isin(['000', '999'])]

# Function to remove column that are always null
def filter_null(df):
    columns_to_drop = df.columns[(df == 0).all()]
    # Drop the identified columns
    return df.drop(columns=columns_to_drop)

# Function to remove records with filler words
def filter_fillers(df):
    # Define a list of filler words
    filler_words = ['like', 'god', 'oh', 'jesus']
    return df[~df['word'].isin(filler_words)]

# Function to retain the last 300 joint positions of words spanning more than 300 frames (longer than ~2 seconds of speech)
def retain_last_n_jp(df, max_rows):
    previous_word = None
    drop = []
    rows = []
    # Loop through each row
    for _,row in df.iterrows():
        current_word = row['word']
        frame = row['frame']
        if current_word != previous_word and previous_word!= None:
            if len(rows) > max_rows:
                drop = df[df['frame'].isin(rows[:-max_rows])].index
                df.drop(drop, axis =0, inplace = True)
                rows = []
        else:
            rows.append(frame)

        previous_word = current_word
    return df

# Handling Missing Values
def handle_missing_values(df):
    df.fillna(method='ffill', inplace=True)

    nlp_features = ['word', 'lemma', 'tag', 'dep']

    for feature in nlp_features:
      df[feature].fillna('', inplace=True)

    return df

# Normalization/Standardization
def normalize_features(df, numerical_features):
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

# Feature Engineering (Velocities and Accelerations)
def compute_rotation_dynamics(df):
    df['time_diff'] = df['time'].diff().fillna(0)
    joints = ['RightShoulder', 'RightArm', 'RightForeArm', 'RightHand', 'LeftShoulder', 'LeftArm', 'LeftForeArm', 'LeftHand']

    dynamics = pd.DataFrame(index=df.index)
    for joint in joints:
        for axis in ['x', 'y', 'z']:
            rotation_col = f'{joint}_rotation_{axis}'
            dynamics[f'{joint}_angular_velocity_{axis}'] = df[rotation_col].diff().fillna(0) / df['time_diff']
            dynamics[f'{joint}_angular_acceleration_{axis}'] = dynamics[f'{joint}_angular_velocity_{axis}'].diff().fillna(0) / df['time_diff']

    df.drop(columns=['time_diff'], inplace=True)
    df = pd.concat([df, dynamics], axis=1)
    return df


# Function to compute relative positions and angles based on parent-child relationships
def compute_relative_rotations(df):
    parent_map = {
        'Hips': None,
        'Spine': 'Hips',
        'Spine1': 'Spine',
        'Spine2': 'Spine1',
        'Spine3': 'Spine2',
        'RightShoulder': 'Spine3',
        'RightArm': 'RightShoulder',
        'RightForeArm': 'RightArm',
        'RightHand': 'RightForeArm',
        'LeftShoulder': 'Spine3',
        'LeftArm': 'LeftShoulder',
        'LeftForeArm': 'LeftArm',
        'LeftHand': 'LeftForeArm'
    }
    for child, parent in parent_map.items():
        if parent is not None:
            for axis in ['x', 'y', 'z']:
                # Compute relative rotations
                df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
    return df

def extract_file_numbers(file_list):
    file_numbers = [int(re.findall(r'\d+', os.path.basename(file))[0]) for file in file_list]
    return file_numbers

In [22]:
def split_file_numbers(file_numbers, test_size=0.2, val_size=0.25, random_state=42):
    train_files, test_files = train_test_split(file_numbers, test_size=test_size, random_state=random_state)
    train_files, val_files = train_test_split(train_files, test_size=val_size, random_state=random_state)
    return train_files, val_files, test_files

# # Function to train an embedding model on the words
# def train_embedding_model(df, vector_size=100):
#   sentences = df['word'].values.reshape(-1, 1).tolist()
#   word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
#   return word2vec_model


# Function to get Word2Vec embeddings
def get_word2vec_embedding(word, model, vector_size):
    try:
        return model.wv[word].tolist()
    except KeyError:
        return [0] * vector_size

In [27]:
# Function to preprocess each CSV file
def preprocess_csv_file(filepath, output_path):

    df = pd.read_csv(filepath)
    # Apply preprocessing from the analysis of the data
    df['Hips_parent'].fillna('None', inplace=True)
    df['word'] = df['word'].astype(str)
    df_cleaned = filter_special_cases(df)
    df_cleaned = filter_null(df_cleaned)
    df_cleaned = filter_fillers(df_cleaned)
    df_cleaned = retain_last_n_jp(df_cleaned, max_rows=300)

    # Apply text preprocessing
    text = group_words_into_sentences(df_cleaned)
    chunks = list(chunk_text(text, chunk_size=100))
    punctuated_chunks = punctuate_chunks(chunks)
    punctuated_text = combine_chunks(punctuated_chunks)
    nlp = extract_nlp_features(punctuated_text)
    nlp_df = pd.DataFrame(nlp)


    # Initialize columns for NLP features in the enhanced DataFrame
    for feature in ['lemma', 'tag', 'dep']:
        df_cleaned[feature] = None

    # Merge NLP features with the cleaned DataFrame
    nlp_index = 0
    nlp_row_count = len(nlp_df)
    for index, row in df_cleaned.iterrows():
        word = row['word']
        if nlp_index < nlp_row_count and word == nlp_df.iloc[nlp_index]['word']:
            for feature in ['lemma', 'tag', 'dep']:
                df_cleaned.at[index, feature] = nlp_df.iloc[nlp_index][feature]
        else:
            if nlp_index < nlp_row_count - 1:
                nlp_index += 1
                if word == nlp_df.iloc[nlp_index]['word']:
                    for feature in ['lemma', 'tag', 'dep']:
                        df_cleaned.at[index, feature] = nlp_df.iloc[nlp_index][feature]


    df = handle_missing_values(df_cleaned)
    df = compute_rotation_dynamics(df)
    df = compute_relative_rotations(df)

    print(df.head)

    # Drop parent columns
    parent_columns = [col for col in df.columns if '_parent' in col]
    df.drop(columns=parent_columns, inplace=True)

    # Get embeddings for the transcript words
    sentences = df['word'].values.reshape(-1, 1).tolist()
    word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    df['word_embedding'] = df['word'].apply(lambda x: get_word2vec_embedding(x, word2vec_model, 100))
    word_embeddings_df = pd.DataFrame(df['word_embedding'].to_list(), index=df.index)
    word_embeddings_df.columns = [f'word_emb_{i}' for i in range(100)]
    df = pd.concat([df, word_embeddings_df], axis=1)
    df.drop(columns=['word_embedding', 'word'], inplace=True)

    # One-hot encode NLP features and joint parent data
    nlp_features = ['lemma', 'tag', 'dep']
    df = pd.get_dummies(df, columns=nlp_features)

    # Save final data
    df.to_csv(output_path, index=False)
    print(f"preprocessed features and saved data for {filepath} to {output_path}")



In [24]:
# Process each CSV file
def process_files (file_set, output_dir):
    for file_number in file_set:
        file_path = os.path.join(csv_dir, f'features_{file_number}.csv')
        output_path = os.path.join(output_dir, f'preprocessed_features_{file_number}.csv')
        preprocess_csv_file(file_path, output_path)


In [None]:
# Process each CSV file
def process_files_train (file_set, output_dir):
    for file_number in file_set:
      if file_number!=11 or file_number!=25 or file_number!=18:
          print(file_number)
          file_path = os.path.join(csv_dir, f'features_{file_number}.csv')
          output_path = os.path.join(output_dir, f'preprocessed_features_{file_number}.csv')
          preprocess_csv_file(file_path, output_path)

In [25]:
# Directory containing the feature CSV files
csv_dir = '/content/drive/MyDrive/Unipi_Thesis/ExtractedFeatures'

# List all feature CSV files in the directory
csv_files = [f for f in os.listdir(csv_dir) if f.startswith('features_') and f.endswith('.csv')]


In [30]:
# Splitting train-val-test files
file_numbers = extract_file_numbers(csv_files)
train_files, val_files, test_files = split_file_numbers(file_numbers)
print("train", train_files)
print("test", test_files)
print("tval", val_files)

# Directories to save the processed datasets
train_output_dir = '/content/drive/MyDrive/Unipi_Thesis/Sets/Train'
val_output_dir = '/content/drive/MyDrive/Unipi_Thesis/Sets/Validation'
test_output_dir = '/content/drive/MyDrive/Unipi_Thesis/Sets/Test'

# Ensure the directories exist
os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(val_output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)

# Process and save training, validation, and test files
process_files(train_files, train_output_dir)
process_files(val_files, val_output_dir)
process_files(test_files, test_output_dir)

print("Preprocessing complete and datasets saved.")

train [11, 25, 18, 27, 17, 23, 10, 4, 30, 5, 21, 14, 26]
test [19, 13, 1, 12, 22]
tval [16, 2, 7, 15, 8]


  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
195         11    196    4.813333      so         -20.8610          91.5823   
231         11    232    5.113333      so         -15.2218          91.8221   
232         11    233    5.121666      so         -15.0729          91.8261   
233         11    234    5.129999      so         -14.9166          91.8272   
234         11    235    5.138333      so         -14.7562          91.8271   
...        ...    ...         ...     ...              ...              ...   
75818       11  78031  653.438073  people         -11.8670          90.8276   
75819       11  78032  653.446407  people         -11.5817          90.7976   
75820       11  78033  653.454740  people         -11.2977          90.7615   
75821       11  78034  653.463073  people         -11.0334          90.7318   
75822       11  78035  653.471407  people         -10.7572          90.6946   

       Hips_position_

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time   word  Hips_position_x  Hips_position_y  \
8           25      8   11.746667  right        -0.185901          89.0822   
9           25      9   11.755000  right        -0.122105          89.1139   
10          25     10   11.763333  right        -0.061020          89.1478   
11          25     11   11.771667  right         0.000764          89.1839   
12          25     12   11.780000  right         0.063035          89.2216   
...        ...    ...         ...    ...              ...              ...   
75997       25  78223  663.538073   yeah       -10.367000          90.4133   
75998       25  78224  663.546406   yeah       -10.426300          90.4027   
75999       25  78225  663.554739   yeah       -10.489000          90.3882   
76000       25  78226  663.563073   yeah       -10.555900          90.3687   
76001       25  78227  663.571406   yeah       -10.625700          90.3513   

       Hips_position_z  Hips_rota

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
7           18      7   12.275333     the         -2.52449          90.1109   
8           18      8   12.283667     the         -2.48011          90.0993   
9           18      9   12.292000     the         -2.43399          90.0885   
10          18     10   12.300333     the         -2.40218          90.0776   
11          18     11   12.308667     the         -2.36809          90.0685   
...        ...    ...         ...     ...              ...              ...   
76102       18  78278  664.533406  anyway        -21.48630          90.5372   
76103       18  78279  664.541739  anyway        -21.53280          90.5393   
76104       18  78280  664.550072  anyway        -21.58320          90.5399   
76105       18  78281  664.558406  anyway        -21.63580          90.5413   
76106       18  78282  664.566739  anyway        -21.68360          90.5416   

       Hips_position_

  df = pd.read_csv(filepath)
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
61          27     61   12.688333   the          16.7589          88.5372   
62          27     62   12.696666   the          16.7428          88.5351   
63          27     63   12.705000   the          16.7272          88.5340   
64          27     64   12.713333   the          16.7131          88.5324   
65          27     65   12.721666   the          16.7006          88.5302   
...        ...    ...         ...   ...              ...              ...   
89417       27  92226  780.729693  yeah          17.8395          88.3509   
89418       27  92227  780.738026  yeah          17.7265          88.3506   
89419       27  92228  780.746359  yeah          17.6112          88.3493   
89420       27  92229  780.754693  yeah          17.4927          88.3463   
89421       27  92230  780.763026  yeah          17.3661          88.3450   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
78          17     79   13.416333    okay        -13.25280          88.5356   
94          17     95   13.549666    okay         -9.84087          88.5040   
95          17     96   13.558000    okay         -9.61601          88.5146   
96          17     97   13.566333    okay         -9.39031          88.5234   
97          17     98   13.574666    okay         -9.16818          88.5361   
...        ...    ...         ...     ...              ...              ...   
81939       17  84338  715.574386  anyway          1.36688          87.9188   
81940       17  84339  715.582719  anyway          1.45461          87.9233   
81941       17  84340  715.591052  anyway          1.55236          87.9221   
81942       17  84341  715.599386  anyway          1.65018          87.9160   
81943       17  84342  715.607719  anyway          1.74313          87.9120   

       Hips_position_

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
295         23    295   17.398332    yeah        -10.26460          90.1357   
296         23    296   17.406666    yeah        -10.41740          90.1343   
297         23    297   17.414999    yeah        -10.56980          90.1330   
298         23    298   17.423332    yeah        -10.71890          90.1264   
299         23    299   17.431666    yeah        -10.86770          90.1192   
...        ...    ...         ...     ...              ...              ...   
75011       23  77224  658.473076  thanks          3.74942          88.8572   
75012       23  77225  658.481409  thanks          3.77077          88.8377   
75013       23  77226  658.489743  thanks          3.79179          88.8182   
75014       23  77227  658.498076  thanks          3.80794          88.7977   
75015       23  77228  658.506409  thanks          3.82405          88.7771   

       Hips_position_

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
15          10     15    3.215000   hey         -10.0249          90.3793   
16          10     16    3.223333   hey         -10.0335          90.3786   
17          10     17    3.231667   hey         -10.0497          90.3792   
18          10     18    3.240000   hey         -10.0503          90.3751   
19          10     19    3.248333   hey         -10.0444          90.3787   
...        ...    ...         ...   ...              ...              ...   
77971       10  80179  671.248066  cool         -11.1652          91.1066   
77972       10  80180  671.256399  cool         -11.2088          91.0941   
77973       10  80181  671.264733  cool         -11.2477          91.0783   
77974       10  80182  671.273066  cool         -11.2834          91.0617   
77975       10  80183  671.281399  cool         -11.3265          91.0449   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
513          4    513    8.974998    do         -25.8904          89.7122   
514          4    514    8.983332    do         -26.2412          89.6846   
515          4    515    8.991665    do         -26.5804          89.6553   
516          4    516    8.999998    do         -26.9013          89.6231   
517          4    517    9.008332    do         -27.2163          89.5916   
...        ...    ...         ...   ...              ...              ...   
72782        4  74650  626.783084  yeah         -23.3758          91.8651   
72783        4  74651  626.791418  yeah         -23.3610          91.8704   
72784        4  74652  626.799751  yeah         -23.3480          91.8715   
72785        4  74653  626.808084  yeah         -23.3384          91.8660   
72786        4  74654  626.816418  yeah         -23.3400          91.8585   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time       word  Hips_position_x  \
130         30    130   17.453333  including          6.51178   
131         30    131   17.461666  including          6.50586   
132         30    132   17.470000  including          6.49978   
133         30    133   17.478333  including          6.49164   
134         30    134   17.486666  including          6.48329   
...        ...    ...         ...        ...              ...   
65928       30  68032  583.303107         go       -172.16500   
65929       30  68033  583.311440         go       -172.35200   
65930       30  68034  583.319773         go       -172.54200   
65931       30  68035  583.328107         go       -172.74800   
65932       30  68036  583.336440         go       -172.94000   

       Hips_position_y  Hips_position_z  Hips_rotation_x  Hips_rotation_y  \
130            88.8762         -68.2507         -4.33339         -161.264   
131            88.8696         -68.

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
90           5     90    4.700000   the          2.75120          90.9934   
91           5     91    4.708333   the          3.00862          91.0015   
92           5     92    4.716666   the          3.24882          91.0129   
93           5     93    4.725000   the          3.47278          91.0260   
94           5     94    4.733333   the          3.67706          91.0423   
...        ...    ...         ...   ...              ...              ...   
74928        5  77115  646.574743  that        -20.06050          91.2953   
74929        5  77116  646.583076  that        -20.43290          91.2965   
74930        5  77117  646.591410  that        -20.79150          91.2998   
74931        5  77118  646.599743  that        -21.13040          91.3080   
74932        5  77119  646.608076  that        -21.45070          91.3212   

       Hips_position_z  Hips_rotation_x  Hips

  df = pd.read_csv(filepath)
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time        word  Hips_position_x  \
9           21      9   11.333000         the         -25.4544   
10          21     10   11.341333         the         -25.4783   
11          21     11   11.349667         the         -25.5020   
12          21     12   11.358000         the         -25.5257   
13          21     13   11.366333         the         -25.5475   
...        ...    ...         ...         ...              ...   
73572       21  75480  640.257748  limitation         -14.9560   
73573       21  75481  640.266082  limitation         -14.8727   
73574       21  75482  640.274415  limitation         -14.7761   
73575       21  75483  640.282748  limitation         -14.6976   
73576       21  75484  640.291082  limitation         -14.6145   

       Hips_position_y  Hips_position_z  Hips_rotation_x  Hips_rotation_y  \
9              89.8772         -19.3535        -2.383760         -169.957   
10             89.8732 

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time word  Hips_position_x  Hips_position_y  \
2           14      2   16.191667  the         -16.1048          89.2582   
3           14      3   16.200000  the         -16.0107          89.2443   
4           14      4   16.208333  the         -15.9164          89.2342   
5           14      5   16.216667  the         -15.8091          89.2299   
6           14      6   16.225000  the         -15.6969          89.2215   
...        ...    ...         ...  ...              ...              ...   
73245       14  75360  644.174749   is        -225.0040          90.5877   
73246       14  75361  644.183082   is        -225.6460          90.6846   
73247       14  75362  644.191415   is        -226.2810          90.7758   
73248       14  75363  644.199749   is        -226.9250          90.8857   
73249       14  75364  644.208082   is        -227.5490          91.0101   

       Hips_position_z  Hips_rotation_x  Hips_rotation_y 

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
207         26    208   15.983333   yes          14.5734          89.6352   
208         26    209   15.991666   yes          14.5068          89.6140   
209         26    210   15.999999   yes          14.4405          89.5925   
210         26    211   16.008333   yes          14.3788          89.5679   
211         26    212   16.016666   yes          14.3177          89.5433   
...        ...    ...         ...   ...              ...              ...   
79535       26  81990  697.499727  yeah          24.5796          89.4562   
79536       26  81991  697.508060  yeah          24.6503          89.4756   
79537       26  81992  697.516393  yeah          24.7174          89.4953   
79538       26  81993  697.524727  yeah          24.7789          89.5155   
79539       26  81994  697.533060  yeah          24.8393          89.5369   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
5           16      5   11.991667   the         -13.4196          89.3053   
6           16      6   12.000000   the         -13.4662          89.3070   
7           16      7   12.008333   the         -13.5065          89.3066   
8           16      8   12.016667   the         -13.5478          89.3026   
9           16      9   12.025000   the         -13.5897          89.2981   
...        ...    ...         ...   ...              ...              ...   
67634       16  69636  592.249768  yeah        -185.3890          91.9142   
67635       16  69637  592.258101  yeah        -185.9650          91.8415   
67636       16  69638  592.266435  yeah        -186.5610          91.7642   
67637       16  69639  592.274768  yeah        -187.1410          91.6840   
67638       16  69640  592.283101  yeah        -187.7310          91.5943   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time word  Hips_position_x  Hips_position_y  \
131          2    131    5.591666  yup          5.13282          92.1269   
132          2    132    5.600000  yup          5.15334          92.1316   
133          2    133    5.608333  yup          5.15107          92.1279   
134          2    134    5.616666  yup          4.99671          92.1462   
135          2    135    5.625000  yup          5.00497          92.1443   
...        ...    ...         ...  ...              ...              ...   
69017        2  70948  595.733097  got        -10.74200          91.1258   
69018        2  70949  595.741430  got        -10.72870          91.0955   
69019        2  70950  595.749763  got        -10.69790          91.0792   
69020        2  70951  595.758097  got        -10.64010          91.0757   
69021        2  70952  595.766430  got        -10.71850          90.9827   

       Hips_position_z  Hips_rotation_x  Hips_rotation_y 

  df = pd.read_csv(filepath)
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
7            7      7    6.628333     the       -12.192900          91.4777   
8            7      8    6.636667     the       -12.188400          91.4774   
9            7      9    6.645000     the       -12.186600          91.4768   
10           7     10    6.653333     the       -12.182800          91.4746   
11           7     11    6.661667     the       -12.180100          91.4717   
...        ...    ...         ...     ...              ...              ...   
73329        7  75467  635.461415  guilty        -0.669915          92.3290   
73330        7  75468  635.469748  guilty        -0.616062          92.3267   
73331        7  75469  635.478082  guilty        -0.561656          92.3213   
73332        7  75470  635.486415  guilty        -0.505713          92.3167   
73333        7  75471  635.494748  guilty        -0.451665          92.3102   

       Hips_position_

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time word  Hips_position_x  Hips_position_y  \
14          15     14   13.366667  the         -2.31543          88.5472   
15          15     15   13.375000  the         -2.18437          88.5367   
16          15     16   13.383333  the         -2.05552          88.5271   
17          15     17   13.391667  the         -1.92440          88.5166   
18          15     18   13.400000  the         -1.79791          88.5063   
...        ...    ...         ...  ...              ...              ...   
69891       15  71887  612.308094  the       -197.62400          89.2220   
69892       15  71888  612.316427  the       -198.37100          89.0880   
69893       15  71889  612.324760  the       -199.13300          88.9632   
69894       15  71890  612.333094  the       -199.89400          88.8625   
69895       15  71891  612.341427  the       -200.65000          88.7708   

       Hips_position_z  Hips_rotation_x  Hips_rotation_y 

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
5            8      5    4.881667   the         -16.6812          91.9365   
6            8      6    4.890000   the         -16.7525          91.9495   
7            8      7    4.898333   the         -16.7272          91.9443   
8            8      8    4.906667   the         -16.7194          91.9400   
9            8      9    4.915000   the         -16.7275          91.9397   
...        ...    ...         ...   ...              ...              ...   
69021        8  71112  597.439763  yeah         -13.3454          91.5287   
69022        8  71113  597.448096  yeah         -13.3724          91.5182   
69023        8  71114  597.456430  yeah         -13.4078          91.5103   
69024        8  71115  597.464763  yeah         -13.4438          91.5020   
69025        8  71116  597.473096  yeah         -13.4809          91.4956   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
12          19     12   12.400000   the         0.467401          88.5575   
13          19     13   12.408333   the         0.513314          88.5508   
14          19     14   12.416667   the         0.553817          88.5462   
15          19     15   12.425000   the         0.586377          88.5382   
16          19     16   12.433333   the         0.613155          88.5314   
...        ...    ...         ...   ...              ...              ...   
67255       19  69054  587.749770  ones       -16.783200          88.8109   
67256       19  69055  587.758103  ones       -16.764100          88.7961   
67257       19  69056  587.766436  ones       -16.752800          88.7820   
67258       19  69057  587.774770  ones       -16.732500          88.7637   
67259       19  69058  587.783103  ones       -16.721900          88.7422   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time    word  Hips_position_x  Hips_position_y  \
149         13    149    3.741666    okay        -13.91580          90.5060   
150         13    150    3.749999    okay        -13.92000          90.5136   
151         13    151    3.758333    okay        -13.92260          90.5205   
152         13    152    3.766666    okay        -13.92770          90.5263   
153         13    153    3.774999    okay        -13.92670          90.5315   
...        ...    ...         ...     ...              ...              ...   
74559       13  76661  641.341411  anyway         -6.76368          91.3491   
74560       13  76662  641.349744  anyway         -6.68797          91.3476   
74561       13  76663  641.358078  anyway         -6.60887          91.3515   
74562       13  76664  641.366411  anyway         -6.54189          91.3547   
74563       13  76665  641.374744  anyway         -6.48402          91.3565   

       Hips_position_

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
319          1    319    7.468332   the          7.94736          92.7058   
320          1    320    7.476666   the          7.96694          92.7059   
321          1    321    7.484999   the          7.98878          92.7046   
322          1    322    7.493332   the          8.00875          92.7042   
323          1    323    7.501666   the          8.02555          92.7020   
...        ...    ...         ...   ...              ...              ...   
71110        1  73137  614.284756  okay        -12.08830          91.1470   
71111        1  73138  614.293090  okay        -12.28170          91.0945   
71112        1  73139  614.301423  okay        -12.48870          91.0381   
71113        1  73140  614.309756  okay        -12.68440          90.9658   
71114        1  73141  614.318090  okay        -12.86770          90.8776   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
398         12    399    6.124999    so        -25.96940          92.2268   
425         12    426    6.349999    so        -27.45790          91.8890   
426         12    427    6.358332    so        -27.56130          91.8739   
427         12    428    6.366665    so        -27.67230          91.8641   
428         12    429    6.374999    so        -27.79440          91.8559   
...        ...    ...         ...   ...              ...              ...   
75921       12  78144  653.999740  that         -6.42185          91.0093   
75922       12  78145  654.008073  that         -6.39394          90.9807   
75923       12  78146  654.016406  that         -6.37309          90.9483   
75924       12  78147  654.024740  that         -6.33872          90.9097   
75925       12  78148  654.033073  that         -6.30204          90.8678   

       Hips_position_z  Hips_rotation_x  Hips

  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']
  df[f'{child}_rel_rotation_{axis}'] = df[f'{child}_rotation_{axis}'] - df[f'{parent}_rotation_{axis}']


<bound method NDFrame.head of        take_id  frame        time  word  Hips_position_x  Hips_position_y  \
224         22    224   18.166666  okay        -23.56390          90.0797   
225         22    225   18.174999  okay        -23.61090          90.0953   
226         22    226   18.183333  okay        -23.65910          90.1100   
227         22    227   18.191666  okay        -23.70400          90.1276   
228         22    228   18.199999  okay        -23.74830          90.1457   
...        ...    ...         ...   ...              ...              ...   
70536       22  72703  622.158091  skin         -5.71802          90.2729   
70537       22  72704  622.166424  skin         -5.54472          90.2291   
70538       22  72705  622.174758  skin         -5.36757          90.1873   
70539       22  72706  622.183091  skin         -5.18496          90.1483   
70540       22  72707  622.191424  skin         -4.99932          90.1145   

       Hips_position_z  Hips_rotation_x  Hips

In [31]:
import shutil

In [32]:
# Define the folder name you want to zip
folder_name = '/content/drive/MyDrive/Unipi_Thesis/Sets'
output_filename = '/content/drive/MyDrive/Unipi_Thesis/sets_zipped_features.zip'

# Zip the folder
shutil.make_archive(output_filename.replace('.zip', ''), 'zip', folder_name)

'/content/drive/MyDrive/Unipi_Thesis/sets_zipped_features.zip'

In [None]:
    # # Add embeddings to the dataframe
    # vector_size = 100
    # for feature in nlp_features:
    #     df[f'{feature}_embedding'] = df[feature].apply(lambda x: get_embedding(x, embedding_models[feature], vector_size))


    # # Process embeddings in batches
    # batch_size = 32
    # vector_size = model.config.hidden_size

    # pca = PCA(n_components=50)  # Reduce to 50 dimensions
    # all_embeddings = []

    # # Add BERT embeddings to the dataframe for each NLP feature
    # for feature in nlp_features:
    #     embeddings = []
    #     for i in range(0, len(df), batch_size):
    #         batch_text = df[feature][i:i+batch_size].tolist()
    #         batch_embeddings = get_embeddings(batch_text)
    #         embeddings.extend(batch_embeddings)
    #     embeddings = pca.fit_transform(embeddings)  # Apply PCA
    #     embeddings_df = pd.DataFrame(embeddings, columns=[f'{feature}_emb_{i}' for i in range(pca.n_components_)])
    #     all_embeddings.append(embeddings_df)

    # # Combine all embeddings into the original dataframe
    # for i, feature in enumerate(nlp_features):
    #     df = pd.concat([df.reset_index(drop=True), all_embeddings[i].reset_index(drop=True)], axis=1)
    #     df.drop(columns=[feature], inplace=True)