In [None]:
import os
from itertools import product
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# Root folder of the evaluation data. Keep the trailing slash: file names are
# appended to this string with `+` when the CSV files are read and written.
# NOTE(review): absolute, user-specific path - adjust before running on another machine.
path_eval = '/Users/alouette/Documents/Perceptual_Space_ALS/'

### Evaluation data (.csv) 
#### Emotionality, Hedonicity, Familiarity, Complexity
Explore and clean the dataset: subjects with an outlying number of missing evaluations are removed.

In [None]:
# Load the raw evaluations; the first CSV column is used as the row index.
raw_csv = path_eval+'all_evaluations.csv'
df_evals = pd.read_csv(raw_csv, index_col=0)

# Sorted unique subject and stimulus identifiers present in the table.
subjs = np.unique(df_evals['subject'].to_numpy())
stims = np.unique(df_evals['stimulus_name'].to_numpy())
n_subjs, n_stims = len(subjs), len(stims)
print('>> evaluations for {} stims in {} subjs'.format(n_stims, n_subjs))
print(stims)

# Sanity check: the table is expected to hold n_subjs * n_stims rows;
# any other row count is reported.
n_rows_expected = n_subjs * n_stims
if len(df_evals) != n_rows_expected:
    print('missing rows in evals df')

#dealing with missing evals & outliers
evals = ['hedo','emo','fam','compl']
# Indicator table: 1 where a rating is missing, 0 where it is present.
df_na = df_evals[evals].isna()*1
df_all = pd.concat([df_evals[['subject','stimulus_name','stimulus_type']],
                    df_na], axis=1, join="inner")

# Number of missing ratings per subject. Only the rating indicators are summed:
# aggregating the identifier columns as well would pass non-rating data to the
# z-score (pandas >= 2.0 no longer drops non-numeric columns in groupby sums).
df2 = df_all.groupby('subject')[evals].sum()

# Absolute z-score of each subject's missing count, computed per rating column.
# A column with the same count for every subject has zero variance and yields
# NaN; NaN is treated as "no deviation" so such a column cannot flag anybody
# (previously `NaN < max_dev` was False and every subject was discarded).
df2_z = pd.DataFrame(np.abs(stats.zscore(df2.to_numpy(dtype=float), axis=0)),
                     index=df2.index, columns=df2.columns).fillna(0)

# A subject is kept only if its |z| stays below max_dev on every rating.
# NOTE(review): the test is two-sided, so a subject with unusually FEW missing
# ratings can also be excluded - confirm this is intended.
max_dev = 2
df2_clean = df2[(df2_z < max_dev).all(axis=1)]
print('{} subj outliers'.format((df2.shape[0] - df2_clean.shape[0])))
print('total nb of subj = {}'.format(df2_clean.shape[0]))

# Total missing ratings per scale, then per retained subject
# (df2_clean already has one row per subject, so no re-grouping is needed).
df2_clean.sum().plot(kind='bar',figsize=(5,4),title="# missing evals")
df2_clean.plot(kind='bar',figsize=(10,4),title='#missing by subj')

# Subjects retained after outlier removal.
subj_c = df2_clean.index

# Missing ratings per stimulus type, restricted to the retained subjects.
# Only the rating indicator columns are summed so that the `subject` and
# `stimulus_name` identifiers are neither aggregated nor plotted.
df3 = df_all[df_all.subject.isin(subj_c)]
df3 = df3.groupby('stimulus_type')[evals].sum()
df3.plot(kind='bar',figsize=(5,4),title='#missing by stimulus type')

# Keep only the rows of the retained subjects and write them to a CSV whose
# file name carries the number of retained subjects.
is_kept = df_evals.subject.isin(subj_c)
df_evals_c = df_evals[is_kept]

n_clean = df2_clean.shape[0]
clean_csv = path_eval+'all_evals_clean_n={}.csv'.format(str(n_clean))
# index=None is falsy, so the row index is not written to the file.
df_evals_c.to_csv(clean_csv, index=None)

### Create df for PCA analyses 
#### One df by subject and modality (face, odor, music)

In [None]:
# Cleaned evaluations. The file name hard-codes n=48 subjects.
# NOTE(review): if the file written by the cleaning cell carries a different
# subject count, this name has to be updated by hand.
clean_csv = path_eval+'all_evals_clean_n=48.csv'
df = pd.read_csv(clean_csv)

# Sorted unique values of the three identifier columns.
subjs, s_type, s_name = (np.unique(df[col])
                         for col in ('subject', 'stimulus_type', 'stimulus_name'))

# Columns exported downstream: the four `*_scaled` columns, then the identifiers.
scaled_cols = ['hedo_scaled', 'fam_scaled', 'emo_scaled', 'compl_scaled']
id_cols = ['subject', 'stimulus_type', 'stimulus_name']
cols_sel = scaled_cols + id_cols

# Filter and save one table per subject and stimulus type; the missing values
# are imputed afterwards (run in R).

# to_csv does not create missing folders, so make sure the target exists.
path_pca = path.join(path_eval, 'df_pca')
os.makedirs(path_pca, exist_ok=True)

for s, t in product(subjs, s_type):
    # Rows of this subject/type, limited to the exported columns.
    sel = (df['subject'] == s) & (df['stimulus_type'] == t)
    df_st = (df.loc[sel, cols_sel]
               .sort_values(by='stimulus_name', axis=0, ascending=True)
               # columns written in reverse alphabetical order
               .sort_index(axis=1, ascending=False))
    # NOTE(review): a subject/type pair without rows still produces a
    # header-only file - confirm the R step copes with that.
    out_csv = path.join(path_pca, 'evals_su={}_stim={}_scaled.csv'.format(s,t))
    df_st.to_csv(out_csv, index=False)
