### Jupyter Notebook To Create The New Aggregation Scheme

In [None]:
import time
import pandas as pd

In [None]:
def add_unique_id(df):
    """Return a copy of ``df`` with an extra ``unique_id`` column.

    The id of a row is ``"<sub>_<episode>_<segment>"``, built from the string
    form of the values in the ``sub``, ``episode`` and ``segment`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``sub``, ``episode`` and ``segment``.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the ``unique_id`` column added. The frame
        passed in is not modified.

    Raises
    ------
    KeyError
        If one of the three key columns is missing.
    """
    df = df.copy(deep=True)
    # Walk the rows by position. Looking values up with ``df[col][i]`` is a
    # label lookup, which pairs ids with the wrong rows (or raises KeyError)
    # whenever the index is not the default 0..n-1 range, e.g. after filtering.
    df['unique_id'] = [
        str(sub) + "_" + str(episode) + "_" + str(segment)
        for sub, episode, segment in zip(df['sub'], df['episode'], df['segment'])
    ]
    return df


def add_unique_id_all(df_list):
    """Swap every frame in ``df_list`` for its ``add_unique_id`` result.

    The list is updated in place, slot by slot, and the same list object is
    handed back to the caller.
    """
    for position, frame in enumerate(df_list):
        df_list[position] = add_unique_id(frame)
    return df_list


def extract_statistics(column):
    """Summarise one pandas Series as a flat list of 13 numbers.

    The list holds the nine deciles (0.1 through 0.9 quantiles) followed by
    the mean, the standard deviation (``Series.std`` defaults), the minimum
    and the maximum, in that order.
    """
    decile_levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    summary = list(column.quantile(decile_levels))
    summary.extend((column.mean(), column.std(), column.min(), column.max()))
    return summary



def create_data_frame(input_data, n_original_features=384):
    """Aggregate the rows of each ``unique_id`` into one row of statistics.

    For every distinct ``unique_id`` the columns named ``"0"`` ...
    ``str(n_original_features - 1)`` are each summarised with
    ``extract_statistics`` (13 values per column) and the results are laid
    out side by side.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame with a ``unique_id`` column and the original feature columns.
    n_original_features : int, optional
        Number of original feature columns to summarise. Defaults to 384,
        which yields the 4992 output columns of the original implementation.

    Returns
    -------
    pandas.DataFrame
        One row per ``unique_id`` (in order of first appearance, used as the
        index) and columns ``Feature_1`` ... ``Feature_<13 * n_original_features>``.
    """
    # Number of values extract_statistics returns per column:
    # nine deciles plus mean, standard deviation, minimum and maximum.
    stats_per_feature = 13
    original_features = [str(n) for n in range(n_original_features)]
    feature_names = ["Feature_"+str(n+1) for n in range(n_original_features * stats_per_feature)]

    # A single groupby pass replaces one full boolean scan of the frame per id.
    # sort=False keeps the groups in order of first appearance, i.e. the same
    # order Series.unique() produced before. Rows with a missing unique_id are
    # skipped by groupby; ids built by add_unique_id are never missing.
    grouped = input_data.groupby('unique_id', sort=False)
    total_ids = grouped.ngroups

    all_ids = []
    final_data = []
    for j, (crnt_id, crnt_data) in enumerate(grouped):
        if j%50==0:
            print("Unique: "+str(j)+"/"+str(total_ids))
        crnt_features = []
        for f in original_features:
            # extend() appends in place instead of copying the growing list.
            crnt_features.extend(extract_statistics(crnt_data[f]))
        all_ids.append(crnt_id)
        final_data.append(crnt_features)
    final_dataframe = pd.DataFrame(final_data, columns=feature_names, index=all_ids)
    return final_dataframe


def create_all_dataframes(df_list, names=None, cooldown_seconds=120):
    """Run ``create_data_frame`` on every frame and report the time taken.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to aggregate, each already carrying a ``unique_id`` column.
    names : list of str, optional
        Labels used in the progress output, matched to ``df_list`` by
        position. Defaults to the eight subject labels used in this notebook.
        Frames without a matching label are reported as ``"dataset <n>"``.
    cooldown_seconds : int or float, optional
        Pause between two consecutive frames. Defaults to 120; a value of 0
        or less disables the pause.

    Returns
    -------
    list of pandas.DataFrame
        The aggregated frames, in the order of ``df_list``.
    """
    if names is None:
        names = ['subject 5','subject 6','subject 7','subject 8','subject 12','subject 15','subject 17','subject 18']
    complete_dataframes = []
    last_position = len(df_list) - 1
    for i, frame in enumerate(df_list):
        # A list longer than the label list used to fail with IndexError;
        # fall back to a positional label instead.
        label = str(names[i]) if i < len(names) else "dataset "+str(i+1)
        start_time = time.time()
        print("Creating Data for "+label)
        complete_dataframes.append(create_data_frame(frame))
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Time for "+label+" :"+str(elapsed_time))
        # No pause after the final frame: nothing is left to process.
        if cooldown_seconds > 0 and i < last_position:
            print("Starting Cooldown for "+format(cooldown_seconds / 60, 'g')+" minutes")
            time.sleep(cooldown_seconds)
        print("========================================================================")
    return complete_dataframes

In [None]:
# Columns that are selected (together with unique_id) from the label data
# and joined onto the aggregated features further down.
Y_val_names = [
    'irritation', 'nostalgia', 'pride', 'relief', 'sadness',
    'satisfaction', 'surprise', 'sympathy', 'triumph', 'arousal',
    'valence', 'contempt', 'contentment', 'embarrassment', 'empathic_pain',
    'envy', 'gratitude', 'disgust', 'disappointment', 'despair',
    'admiration', 'amusement', 'aesthetic_appreciation', 'anger', 'anxiety',
    'awe', 'calmness', 'confusion', 'excitement', 'fear',
    'guilt', 'interest', 'joy', 'pleasure', 'romance',
    'craving', 'entrancement', 'hope', 'boredom', 'adoration',
    'jealousy', 'horror', 'sexual_desire',
]

In [None]:
# Load the raw per-subject CSV files, which all live in the same folder.
# Backslashes are escaped explicitly: sequences such as "\D", "\A", "\Q" and
# "\s" in a normal string literal are invalid escapes that Python only
# accepts with a warning.
QUESTION4_DIR = "Y:\\Data Science Readings\\Applied Project Semester B\\Question 4\\"
RAW_FILE_STEM = "_task-alltrain_desc-prosodyopenai--whisper-tinyLayer2ResSegRaw"

sub5 = pd.read_csv(QUESTION4_DIR + "sub-005" + RAW_FILE_STEM + ".csv")
sub6 = pd.read_csv(QUESTION4_DIR + "sub-006" + RAW_FILE_STEM + ".csv")
sub7 = pd.read_csv(QUESTION4_DIR + "sub-007" + RAW_FILE_STEM + ".csv")
sub8 = pd.read_csv(QUESTION4_DIR + "sub-008" + RAW_FILE_STEM + "-002.csv")
sub12 = pd.read_csv(QUESTION4_DIR + "sub-012" + RAW_FILE_STEM + ".csv")
sub15 = pd.read_csv(QUESTION4_DIR + "sub-015" + RAW_FILE_STEM + "-004.csv")
sub17 = pd.read_csv(QUESTION4_DIR + "sub-017" + RAW_FILE_STEM + "-001.csv")
sub18 = pd.read_csv(QUESTION4_DIR + "sub-018" + RAW_FILE_STEM + "-003.csv")

In [None]:
# Data set whose Y_val_names columns are joined onto the aggregated features later in the notebook.
data_fromq1 = pd.read_csv("Y:\\Data Science Readings\\Applied Project Semester B\\final_data_merged.csv")

In [None]:
# Order matters: create_all_dataframes labels its progress output by position
# (subject 5, 6, 7, 8, 12, 15, 17, 18).
sub_list  = [sub5,sub6,sub7,sub8,sub12,sub15,sub17,sub18]

In [None]:
# Two-subject subset; not referenced again in the visible part of the notebook.
preliminary_sub_list = [sub5,sub6]

In [None]:
# (rows, columns) of the raw subject 5 frame.
sub5.shape

(432000, 388)

In [None]:
# (rows, columns) of the raw subject 6 frame.
sub6.shape

(387000, 388)

In [None]:
# (rows, columns) of the raw subject 7 frame.
sub7.shape

(436500, 388)

In [None]:
# (rows, columns) of the raw subject 8 frame.
sub8.shape

(597750, 388)

In [None]:
# (rows, columns) of the raw subject 12 frame.
sub12.shape

(401250, 388)

In [None]:
# (rows, columns) of the raw subject 15 frame.
sub15.shape

(534000, 388)

In [None]:
# (rows, columns) of the raw subject 17 frame.
sub17.shape

(555750, 388)

In [None]:
# (rows, columns) of the raw subject 18 frame.
sub18.shape

(593250, 388)

In [None]:
# Total number of raw rows across the eight subject frames.
sum(frame.shape[0] for frame in (sub5, sub6, sub7, sub8, sub12, sub15, sub17, sub18))

3937500

### Starting Data Aggregation Step:

In [None]:
# Every entry of sub_list is replaced by a copy carrying the extra unique_id column.
# sub5 ... sub18 themselves stay unchanged because add_unique_id works on a deep copy.
sub_list = add_unique_id_all(sub_list)

In [None]:
# Rebinds sub_list to the aggregated frames (one row per unique_id, one frame per subject).
# NOTE(review): this cell is not idempotent - running it a second time feeds the aggregated
# frames, which no longer have a unique_id column, back into create_data_frame.
sub_list = create_all_dataframes(sub_list)

Creating Data for subject 5
Unique: 0/576
Unique: 50/576
Unique: 100/576
Unique: 150/576
Unique: 200/576
Unique: 250/576
Unique: 300/576
Unique: 350/576
Unique: 400/576
Unique: 450/576
Unique: 500/576
Unique: 550/576
Time for subject 5 :168.17377877235413
Starting Cooldown for 2 minutes
Creating Data for subject 6
Unique: 0/516
Unique: 50/516
Unique: 100/516
Unique: 150/516
Unique: 200/516
Unique: 250/516
Unique: 300/516
Unique: 350/516
Unique: 400/516
Unique: 450/516
Unique: 500/516
Time for subject 6 :143.60149383544922
Starting Cooldown for 2 minutes
Creating Data for subject 7
Unique: 0/582
Unique: 50/582
Unique: 100/582
Unique: 150/582
Unique: 200/582
Unique: 250/582
Unique: 300/582
Unique: 350/582
Unique: 400/582
Unique: 450/582
Unique: 500/582
Unique: 550/582
Time for subject 7 :164.119366645813
Starting Cooldown for 2 minutes
Creating Data for subject 8
Unique: 0/797
Unique: 50/797
Unique: 100/797
Unique: 150/797
Unique: 200/797
Unique: 250/797
Unique: 300/797
Unique: 350/797
U

In [None]:
# Stack the per-subject aggregated frames row-wise; the index holds the unique ids.
completed_data = pd.concat(sub_list)

In [None]:
# Independent snapshot of the aggregated data; not used again in the visible cells.
completed_data_backup = completed_data.copy(deep=True)

In [None]:
# Working copy that receives the unique_id column before the merge below.
completed_data_work = completed_data.copy(deep=True)

In [None]:
# Build the same <sub>_<episode>_<segment> key on the label data so it can be joined
# with the aggregated features.
data_from_semesterA = add_unique_id(data_fromq1)

In [None]:
# Keep only the join key and the columns listed in Y_val_names.
data_from_semesterA_subset = data_from_semesterA[['unique_id']+Y_val_names]

In [None]:
# Display the subset for a visual check of the key and the selected columns.
data_from_semesterA_subset

Unnamed: 0,unique_id,irritation,nostalgia,pride,relief,sadness,satisfaction,surprise,sympathy,triumph,...,pleasure,romance,craving,entrancement,hope,boredom,adoration,jealousy,horror,sexual_desire
0,sub-005_s01e01_1,0.791616,1.702135,2.877832,0.874105,0.769715,1.912958,0.916882,0.779931,0.794652,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
1,sub-005_s01e01_2,0.791616,1.702135,0.877832,0.874105,0.769715,0.912958,0.916882,2.779931,0.794652,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
2,sub-005_s01e01_3,0.791616,1.702135,0.877832,0.874105,0.769715,0.912958,0.916882,0.779931,0.794652,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
3,sub-005_s01e01_4,1.791616,1.702135,0.877832,0.874105,0.769715,0.912958,1.916882,1.779931,0.794652,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
4,sub-005_s01e01_5,1.791616,0.702135,0.877832,0.874105,0.769715,0.912958,1.916882,0.779931,0.794652,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6418,sub-022_s05e03p03_39,1.808872,0.666223,0.701531,2.711278,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.760691,0.690604,0.0,4.644404,0.0,0.742458,0.0
6419,sub-022_s05e03p03_40,2.808872,0.666223,0.701531,2.711278,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.760691,0.690604,0.0,2.644404,0.0,0.742458,0.0
6420,sub-022_s05e03p03_41,2.808872,0.666223,0.701531,3.711278,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.760691,0.690604,0.0,4.644404,0.0,0.742458,0.0
6421,sub-022_s05e03p03_42,1.808872,0.666223,0.701531,4.711278,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.760691,0.690604,0.0,3.644404,0.0,0.742458,0.0


In [None]:
# Expose the index (the unique ids) as a regular column so it can serve as the merge key.
completed_data_work['unique_id'] = completed_data_work.index.tolist()

In [None]:
# Inner join on unique_id: only ids present in both tables are kept.
merged_df = completed_data_work.merge(
    data_from_semesterA_subset,
    how='inner',
    on='unique_id',
)

In [None]:
# Persist the merged table.
# NOTE(review): to_csv writes the row index by default, so the file gets an extra unnamed
# leading column - pass index=False if that column is not wanted downstream.
merged_df.to_csv("Y:\\Data Science Readings\\Applied Project Semester B\\Question 4\\New_Aggregation_Data.csv")