In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import re
import plotly.express as px
import plotly.graph_objs as go
import numpy as np
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objs as go

def load_speakers_metadata(feats_file, tsv_name):
    speaker_len = open(feats_file.replace('.npy','.len'),'r').read().split('\n')
    speaker_len = [int(f) for f in speaker_len if f != '']

    wav_paths = re.split('\t\d\n|\n',open(tsv_name, 'r').read())
    names = [f.split(".")[0] for f in wav_paths[1:] if f!=""]
#     print(names)
    return speaker_len, names 

def time_txt(time, max_time, time_frame=5):
    if time % time_frame == 0:
        return f"{round(time*0.02,2)}"
    return ""

def create_df(feats, speaker_len, names):
    cols = [f"val {i}" for i in range(feats.shape[1])]
    df = pd.DataFrame(feats,columns = cols)
    df['index'] = df.index
    time_index = {i:speaker_len[i] for i in range(len(speaker_len))}
    com_time_index = {i:sum(speaker_len[:i]) for i in range(len(speaker_len))}
    df_speaker_count = pd.Series(time_index)
    df_speaker_count = df_speaker_count.reindex(df_speaker_count.index.repeat(df_speaker_count.to_numpy())).rename_axis('speaker_id').reset_index()
    df['speaker_id'] = df_speaker_count['speaker_id']
    df['speaker_len']  = df['speaker_id'].apply(lambda row: speaker_len[row])
    df['com_sum'] = df['speaker_id'].apply(lambda i: com_time_index[i])
    df['speaker'] = df['speaker_id'].apply(lambda i: names[i])
    assert len(df.loc[df['speaker']==-1]) == 0
    assert len(df_speaker_count) == len(df)
    df_subset = df.copy()
    data_subset = df_subset[cols].values
    return data_subset,df_subset, cols 

def tsne(data_subset, init='random', early_exaggeration=12.0, lr = 200.0, n_comp = 2, perplexity = 40, iters = 300,random_state=None):
    tsne = TSNE(n_components=n_comp, verbose=1, perplexity=perplexity, n_iter=iters,init=init, early_exaggeration=early_exaggeration,
               learning_rate = lr,random_state=random_state)
    tsne_results = tsne.fit_transform(data_subset)
    return tsne_results


## Setup settings
- Model's label rate is 0.02 seconds. To not overflow the plot, time is shown every 5 samples (0.1 seconds).


In [2]:
feat_path, tsv_name = "/home/data/ronich/molly_losh_features/", "first_3_words.tsv"
seed = 31415
time_frame = 5
sentence_id = sentence = 'C'
group=1
layers = [12,8,6]

## creating the dataframe using the settings above

In [3]:
def speaker_task(speaker_str):
    if len(speaker_str.split("_")[2]) == 3:
        return speaker_str.split("_")[3][5],speaker_str.split("_")[3][6]
    else:
        return speaker_str.split("_")[2][-2],speaker_str.split("_")[2][-1]

tsne_1 = 'tsne-3d-one'
tsne_2 = 'tsne-3d-two'
tsne_3 = 'tsne-3d-thr'

def fill_tsne(df_subset, tsne_results):
    print(tsne_results[:,0].shape)
    df_subset[tsne_1] = tsne_results[:,0]
    df_subset[tsne_2] = tsne_results[:,1]
    if tsne_results.shape[1] == 3:
        df_subset[tsne_3] = tsne_results[:,2]
    return df_subset

def plot_tsne(df_subset):
    import plotly.offline as pyo
    import plotly.graph_objs as go
    
    pyo.init_notebook_mode()
    fig = px.scatter_3d(df_subset, x=tsne_1, y=tsne_2, z=tsne_3,
              color='speaker')
    fig.update_traces(mode='lines+markers+text')
    pyo.iplot(fig, filename='jupyter-styled_bar')
    
dfs = [] 
for layer in layers:
    feats_file = os.path.join(feat_path,f"layer{layer}",tsv_name.replace('.tsv','_0_1.npy'))
    feats = np.load(feats_file)
    speaker_len, names = load_speakers_metadata(feats_file, "tsvs/"+tsv_name)

    data_subset,df_subset, cols = create_df(feats, speaker_len, names)
    df_subset['idx'] = df_subset.index
    df_subset['exp_group'] = df_subset['speaker'].str.split("/").str[0]
    df_subset['group'] = df_subset['speaker'].str.split("/").str[1].str.split("group").str[1]
    df_subset['speaker_main'] = df_subset['speaker'].str.split("/").str[1].str.split("_").str[0]
    df_subset[['speaker_par','rep']] = df_subset['speaker'].str.split("/").str[1].apply(lambda row: speaker_task(row)).values.tolist()
    df_subset['time'] = df_subset['idx']-df_subset['com_sum']
    df_subset['time_txt'] = df_subset[['time','speaker_len']].apply(lambda row: time_txt(row['time'],row['speaker_len'],time_frame),axis=1)
    dfs.append((df_subset,data_subset))

df_layer12 = dfs[0]
df_layer8 = dfs[1]
df_layer6 = dfs[2]

# Plot two speakers



## &rarr; same speaker id was found in TD & CP
- 6540

In [4]:
pd.DataFrame([f.split("-") for f in df_subset[['speaker_main','exp_group']].agg('-'.join, axis=1).unique()],columns=['speaker','group']).groupby('speaker').count()

Unnamed: 0_level_0,group
speaker,Unnamed: 1_level_1
5011,1
5073,1
5201,1
5207,1
5222,1
6003,1
6512,1
6516,1
6517,1
6528,1


In [5]:
df_subset[['speaker_main','exp_group']].agg('-'.join, axis=1).unique()

array(['5073-PM_RAN', '5222-PM_RAN', '6582-PM_RAN', '6627-PM_RAN',
       '6641-PM_RAN', '5011-TD_RAN', '5201-TD_RAN', '5207-TD_RAN',
       '6003-TD_RAN', '6540-TD_RAN', '6528-FXS_RAN', '6551-FXS_RAN',
       '6643-FXS_RAN', '6658-FXS_RAN', '6512-Control_Parents',
       '6516-Control_Parents', '6517-Control_Parents',
       '6540-Control_Parents', '6618-Control_Parents'], dtype=object)

In [6]:
import plotly.express as px
import plotly.graph_objs as go
sentence_id=sentence
FXP = '6658'
TD1 ='6003'
TD2 = '5011'
PM ='6641'
CP1 ='6517'
CP2 = '6516' 

max_s1 = None#50
max_s2 = None#50
def plot_two_speakers(df_subset, S1,S2, max_s1=None,max_s2=None,title="", use_half=False):
    dcp = df_subset.loc[((df_subset['speaker_main'].isin([S1,S2]))&(df_subset['speaker_par']==sentence_id))&(df_subset['rep']=='A')&(df_subset.group=='1')].copy().rename(columns={tsne_1: "x",tsne_2:'y',tsne_3:'z'})
    dcp1 = dcp.loc[(dcp['speaker_main']==S1)].copy()
    dcp2 = dcp.loc[(dcp['speaker_main']==S2)].copy()
    dcp1['clr'] = np.linspace(0, 1, dcp.loc[(dcp['speaker_main']==S1)].shape[0])
    dcp2['clr'] = np.linspace(1, 0, dcp.loc[(dcp['speaker_main']==S2)].shape[0])
    
    if max_s1 is not None:
        dcp1 = dcp1[:max_s1]
    
    if max_s2 is not None:
        dcp2 = dcp2[:max_s2]
    
    if use_half:
        dcp1 = dcp1[:int(len(dcp1)/2)]
        dcp2 = dcp2[:int(len(dcp2)/2)]
    # S1
    fig = px.scatter_3d(dcp1, x='x', y='y', z='z',
                        color='clr',symbol='speaker_main',
                        text='time_txt',
                        labels={'x':'t-SNE-dim1','y':'t-SNE-dim2','z':'t-SNE-dim3'})
    fig.update_traces(marker_symbol='diamond',marker_coloraxis=None,marker_colorscale='burg', mode='lines+markers+text', line_color='lightgray')
    fig.for_each_trace(lambda t: t.update(textfont_color='darkred'))

    # S2
    fig2 = px.scatter_3d(dcp2, x='x', y='y', z='z',
                        color='clr',symbol='speaker_main',
                         text='time_txt',
                         labels={'x':'t-SNE-dim1','y':'t-SNE-dim2','z':'t-SNE-dim3'})
    fig2.update_traces(marker_coloraxis=None,marker_colorscale='ice', mode='lines+markers+text', line_color='lightgray')
    fig2.for_each_trace(lambda t: t.update(textfont_color='blue'))
#     axis_style = dict(range=[-10,10])

    fig3 = go.Figure(data=fig.data + fig2.data)
    fig3.update_layout(scene = dict(
#                     xaxis = axis_style,
#                      yaxis = axis_style,
#                      zaxis = axis_style,
                        xaxis_title='dimension 1 (t-SNE)',
                        yaxis_title='dimension 2 (t-SNE)',
                        zaxis_title='dimension 3 (t-SNE)',
    ),

#                         margin=dict(r=20, b=10, l=10, t=10),
                        legend_title="Speaker",
                        title=title)

    fig3.show()



In [7]:
df_subset,data_subset = dfs[i][0].copy(), dfs[i][1].copy()
df_subset= df_subset.loc[(df_subset['speaker_main'].isin([TD1,TD2]))&(df_subset.speaker_par.str.contains(sentence))&(df_subset.group=='1')]
data_subset = data_subset[df_subset.loc[df_subset.speaker_par.str.contains(sentence)&(df_subset.group=='1')].index]
df_subset

NameError: name 'i' is not defined

## The plots below are comparing CP:CP, TD:TD, CP:PM, TD,FXP, and showing the comparison for layers 12, 8 and 6
&rarr; Since the plots are packed, two cells below are the same plots but only for the first half

In [None]:
for i,layer in enumerate(layers):
    for S1,S2 in [(CP1,CP2),(TD1,TD2),(CP1,PM),(TD1,FXP)]:
        df_subset,data_subset = dfs[i][0].copy(), dfs[i][1].copy()
        df_subset= df_subset.loc[(df_subset['speaker_main'].isin([S1,S2]))&(df_subset.speaker_par.str.contains(sentence))&(df_subset.group=='1')]
        data_subset = data_subset[df_subset.loc[df_subset.speaker_par.str.contains(sentence)&(df_subset.group=='1')].index]
        tsne_results = tsne(data_subset, init='pca', early_exaggeration=12.0, lr='auto',n_comp=3, perplexity = 50, iters = 2000, random_state=seed)
        df_subset = fill_tsne(df_subset, tsne_results)
        groups = df_subset.exp_group.unique()
        group1 = groups[0]
        group2 = groups[0] if len(groups)==1 else groups[1]
        plot_two_speakers(df_subset, S1,S2,max_s1,max_s2,title=f"layer:{layer} task:{sentence} group:{group1}-{group2}")


In [None]:
for i,layer in enumerate(layers):
    for S1,S2 in [(CP1,CP2),(TD1,TD2),(CP1,PM),(TD1,FXP)]:
        df_subset,data_subset = dfs[i][0].copy(), dfs[i][1].copy()
        df_subset= df_subset.loc[(df_subset['speaker_main'].isin([S1,S2]))&(df_subset.speaker_par.str.contains(sentence))&(df_subset.group=='1')]
        data_subset = data_subset[df_subset.loc[df_subset.speaker_par.str.contains(sentence)&(df_subset.group=='1')].index]
        tsne_results = tsne(data_subset, init='pca', early_exaggeration=12.0, lr='auto',n_comp=3, perplexity = 50, iters = 2000, random_state=seed)
        df_subset = fill_tsne(df_subset, tsne_results)
        groups = df_subset.exp_group.unique()
        group1 = groups[0]
        group2 = groups[0] if len(groups)==1 else groups[1]
        plot_two_speakers(df_subset, S1,S2,use_half=True,title=f"layer:{layer} task:{sentence} group:{group1}-{group2}")
