Conducting supervised learning to predict context values per timepoint (only including timepoints where a word was spoken)

In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy.io import loadmat
import json
from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import math
import pickle
import re
import os

In [26]:

def concat_spoken_fmri(
    input_dir='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal',
    output_path='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/predictor_fMRI_concatenated_NEW.csv',
    subject_ids=range(1, 87),
):
    '''
    Concatenate fMRI data across subjects for timepoints where a word was spoken

    Parameters
    ----------
    input_dir : str
        directory holding one ``fMRI_sub_<id>_spoken_timepoints_new.csv`` file per subject
    output_path : str
        CSV file the concatenated array is written to
    subject_ids : iterable of int
        subject ids to load, in concatenation order (defaults to subjects 1 to 86)

    Returns
    -------
    fmri_array : np.ndarray
        2d array holding the per-subject fMRI data placed side by side (concatenated along axis 1)

    Raises
    ------
    ValueError
        if ``subject_ids`` is empty, or if the per-subject arrays do not all have the same number of rows
    '''
    # Each file is read without a header row, so every line of the CSV is data
    all_chopped_fmri = [
        pd.read_csv(f'{input_dir}/fMRI_sub_{i}_spoken_timepoints_new.csv', header=None).values
        for i in subject_ids
    ]
    if not all_chopped_fmri:
        raise ValueError('subject_ids is empty: there is no fMRI data to concatenate')

    # Build the predictor with a single concatenation; growing the array one subject
    # at a time would copy the accumulated data again on every iteration
    fmri_array = np.concatenate(all_chopped_fmri, axis=1)

    np.savetxt(output_path, fmri_array, delimiter=',')
    return fmri_array


In [27]:
# Build and save the concatenated fMRI predictor; the returned array is echoed as the cell output
concat_spoken_fmri()

array([[-1.73611298e-01,  8.86672363e-02, -9.72323269e-02, ...,
        -1.02434628e-01,  4.59222049e-01,  6.86753392e-01],
       [-7.08576068e-02,  2.51554281e-01, -3.14064295e-04, ...,
        -5.15036941e-01, -2.86967337e-01, -3.15023661e-01],
       [-3.96565832e-02, -4.63391840e-02, -4.78670001e-02, ...,
        -3.45937431e-01, -8.14586878e-03,  3.92116845e-01],
       ...,
       [ 8.86890963e-02, -1.22874796e-01, -7.55924452e-03, ...,
         1.68961108e-01,  2.19348922e-01,  1.23490781e-01],
       [ 1.59149379e-01,  1.62857957e-02, -2.60403782e-01, ...,
         3.02624494e-01,  5.47391117e-01,  4.53376979e-01],
       [-6.71684325e-01, -4.80926901e-01, -2.71127611e-01, ...,
         1.06237876e+00,  6.81948483e-01,  9.97906327e-01]])

In [47]:

# Film length (number of timepoints, stored as a string key) -> ids of the subjects whose
# time series has that length. Kept at module level so that code outside the function can
# read the mapping once the subjects have been classified.
all_film_paths_dict = {'5470':[], '6804':[],'7715':[],'6674':[], '5900':[], '7515':[], '8882':[], '8181':[], '6739':[], '6102':[]}


def split_subjects_by_films(
    i,
    ts_root='/rds/general/user/ab5621/home/Masters-Dissertation/Helper Files/extended_schaefer_200',
):
    '''
    Assign one subject to the film they watched, based on the length of their time series

    The subject id is added to the module-level ``all_film_paths_dict`` under the key that
    equals the length of the subject's time series. Calling the function again for the
    same subject does not add a duplicate entry.

    Parameters
    -----------
    i (int) : subject id
    ts_root (str) : directory containing one ``sub-<id>/full_ts.mat`` file per subject

    Returns
    --------
    film_length (str or None) : the key of ``all_film_paths_dict`` the subject was filed under,
        or None when the length matches none of the known films
    '''
    data = loadmat(f'{ts_root}/sub-{i}/full_ts.mat')['data']
    # Collapse axis 0 and use the length of what remains to identify the film
    # (presumably axis 0 indexes brain regions and the remaining axis is time - TODO confirm)
    film_length = str(len(np.mean(data, axis=0)))

    if film_length not in all_film_paths_dict:
        return None

    subjects = all_film_paths_dict[film_length]
    if i not in subjects:
        subjects.append(i)
    return film_length

# Classify every subject (ids 1 to 86) by the film they watched
for i in range(1,87):
    split_subjects_by_films(i)

# Show the resulting mapping of film length -> subject ids
print(all_film_paths_dict)

{'5470': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], '6804': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38], '7715': [81, 82, 83, 84, 85, 86], '6674': [63, 64, 65, 66, 67, 68], '5900': [75, 76, 77, 78, 79, 80], '7515': [57, 58, 59, 60, 61, 62], '8882': [45, 46, 47, 48, 49, 50], '8181': [51, 52, 53, 54, 55, 56], '6739': [69, 70, 71, 72, 73, 74], '6102': [39, 40, 41, 42, 43, 44]}


In [None]:
def concat_target_contextval(
    film_subjects=None,
    context_paths=None,
    output_path='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/supervised_learning/concat_target.csv',
    subject_ids=range(1, 87),
):
    '''
    Concatenate context values to build the target variable for the supervised learning algorithm

    Subjects are visited in the order given by ``subject_ids``; for each one, the context
    values of the film they watched are appended to the target list.

    Parameters
    ----------
    film_subjects (dict or None): maps a film-length key (e.g. '5470') to the list of subject ids
        that watched that film. When None, every id in ``subject_ids`` is passed to
        ``split_subjects_by_films`` and the module-level ``all_film_paths_dict`` is used.
    context_paths (dict or None): maps the same film-length keys to the JSON file holding that
        film's context values. When None, the project's default files are used.
    output_path (str): CSV file the concatenated targets are written to
    subject_ids (iterable of int): subject ids to include, in output order (defaults to 1 to 86)

    Returns
    ---------
    targets_list (list): the context values of the film viewed by each subject, subject after subject
    '''
    subject_ids = list(subject_ids)

    if film_subjects is None:
        # Use the shared module-level mapping. Creating a fresh empty dict here would
        # shadow it, leave every film without subjects and produce an empty target list.
        for i in subject_ids:
            split_subjects_by_films(i)
        film_subjects = all_film_paths_dict

    if context_paths is None:
        context_dir = '/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/context_values'
        film_names = {
            '5470': '500_days_of_summer',
            '6804': 'citizenfour',
            '7715': '12_years_a_slave',
            '6674': 'back_to_the_future',
            '5900': 'little_miss_sunshine',
            '7515': 'the_prestige',
            '8882': 'pulp_fiction',
            '8181': 'the_shawshank_redemption',
            '6739': 'split',
            '6102': 'the_usual_suspects',
        }
        context_paths = {
            film_length: f'{context_dir}/att_405overlap_per_timepoint_405_overlap_word_level_attention_{name}_words_new.json.json.json'
            for film_length, name in film_names.items()
        }

    # Each film's values are read from disk once and reused for every subject who watched it
    values_by_film = {}
    targets_list = []
    for subject_id in subject_ids:
        for film_length, subjects in film_subjects.items():
            if subject_id not in subjects:
                continue
            if film_length not in values_by_film:
                with open(context_paths[film_length], 'r') as attention_file:
                    values_by_film[film_length] = list(json.load(attention_file).values())
            targets_list.extend(values_by_film[film_length])

    np.savetxt(output_path, targets_list, delimiter=',')
    return targets_list



In [2]:
# Load the raw surprisal text file
with open('/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/surprisal_split.txt', 'r') as file:
    # Read the contents of the file
    surprisals = file.read()
# Capture the text inside every "(...)" group, giving one string per tuple
tuple_strings = re.findall(r'\(([^)]+)\)', surprisals)


def str_to_tuple(s):
    '''
    Parse a comma-separated string of numbers into a tuple of floats
    '''
    return tuple(float(part) for part in s.split(','))

# Convert each tuple string to a tuple of floats
tuples = [str_to_tuple(ts) for ts in tuple_strings]


In [6]:
def surprisals_per_timepoint(surprisal_path):
    '''
    Average the surprisal of the words falling in each timepoint

    Parameters
    ----------
    surprisal_path : str
        local file path of a text file containing "(surprisal, time)" tuples

    Returns
    -------
    target_surprisals_with_time : dict
        dictionary where the keys are timepoints (word times rounded up to the next integer) and the values
        are the average surprisal of all words assigned to that timepoint

    '''
    with open(surprisal_path, 'r') as handle:
        raw_text = handle.read()

    # Group the surprisal values by the timepoint their word time rounds up to
    grouped = {}
    for tuple_text in re.findall(r'\(([^)]+)\)', raw_text):
        surp, time = str_to_tuple(tuple_text)
        grouped.setdefault(math.ceil(float(time)), []).append(surp)

    # One mean surprisal per timepoint, in order of first appearance
    averaged = {}
    for timepoint, values in grouped.items():
        averaged[timepoint] = sum(values) / len(values)
    return averaged

In [None]:
def language_regions(
    regions_csv='/rds/general/user/ab5621/home/Masters-Dissertation/Helper Files/extended_schaefer_200 (1).csv',
    predictor_csv='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/predictor_fMRI_concatenated_NEW.csv',
    output_path='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/predictor_fMRI_language_regions.csv',
):
    """
    Extract only brain regions involved in language processing

    Parameters
    ----------
    regions_csv : str
        CSV file listing all regions; it must have a 'region' column (region name) and a 'label' column
    predictor_csv : str
        header-less CSV file with the concatenated fMRI predictor, one row per region
    output_path : str
        CSV file the filtered predictor is written to

    Returns
    -------
    all_fMRI_only_spoken_timepoints : np.ndarray
        the rows of the predictor that belong to language-related regions, in the order they appear in ``regions_csv``
    """
    #load the names of all regions
    extended_schaefer_200_data = pd.read_csv(regions_csv)

    #list of the subset of regions that are involved in language
    language_related_regions = ["Aud 1", "Aud 2", "Aud 3", "FrOper 1", "FrOper 2", "TempPole 1", "TempPole 2", "TempPole 3", "TempPole 4", "Temp 1", "Temp 2", "Temp 3", "Temp 4", "IPL 1", "IPL 2", "TempPar 1", "TempPar 2", "TempPar 3", "TempPar 4", "IPS 1", "IPS 2", "ParOper 1", "ParMed 1", "ParMed 2", "PrC 1", "Cent 1", "Cent 2"]

    #keeping only the rows of the table that describe language regions
    is_language = extended_schaefer_200_data['region'].isin(language_related_regions)

    #row position of each language region: label minus one (labels presumably count from 1 - TODO confirm)
    language_indexes = [label - 1 for label in extended_schaefer_200_data.loc[is_language, 'label']]
    # NOTE(review): this function previously also built copies of these indexes shifted by
    # 232 and 464 but never used them; only the unshifted indexes select rows, as before.

    #taking only the brain regions which are involved in language as features
    all_fMRI_only_spoken_timepoints = np.array(pd.read_csv(predictor_csv, header=None))
    all_fMRI_only_spoken_timepoints = all_fMRI_only_spoken_timepoints[language_indexes, :]
    np.savetxt(output_path, all_fMRI_only_spoken_timepoints, delimiter =',')

    return all_fMRI_only_spoken_timepoints


In [8]:
#Getting surprisal target values

def concat_target_surprisal(
    film_subjects=None,
    surprisal_paths=None,
    output_path='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/target_surprisals.csv',
    subject_ids=range(1, 87),
):
    """
    Concatenates surprisal values for all subjects

    Subjects are visited in the order given by ``subject_ids``; for each one, the
    per-timepoint surprisal values of the film they watched are appended to the target list.

    Parameters
    ----------
    film_subjects : dict or None
        maps a film-length key (e.g. '5470') to the list of subject ids that watched that film.
        When None, every id in ``subject_ids`` is passed to ``split_subjects_by_films`` and the
        module-level ``all_film_paths_dict`` is used.
    surprisal_paths : dict or None
        maps the same film-length keys to the pickle file holding that film's surprisal values.
        When None, the project's default files are used.
    output_path : str
        CSV file the concatenated targets are written to
    subject_ids : iterable of int
        subject ids to include, in output order (defaults to subjects 1 to 86)

    Returns
    -------
    targets_list : list of float
        list containing all concatenated surprisal values from the films watched by all subjects
    """
    subject_ids = list(subject_ids)

    if film_subjects is None:
        # Use the shared module-level mapping. Creating a fresh empty dict here would
        # shadow it, leave every film without subjects and produce an empty target list.
        for i in subject_ids:
            split_subjects_by_films(i)
        film_subjects = all_film_paths_dict

    if surprisal_paths is None:
        surprisal_dir = '/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal'
        film_names = {
            '5470': '500_days_of_summer',
            '6804': 'citizenfour',
            '7715': '12_years_of_slave',
            '6674': 'back_to_the_future',
            '5900': 'little_miss_sunshine',
            '7515': 'the_prestige',
            '8882': 'pulp_fiction',
            '8181': 'the_shawshank_redemption',
            '6739': 'split',
            '6102': 'the_usual_suspects',
        }
        surprisal_paths = {
            film_length: f'{surprisal_dir}/surp_per_timepoint_{name}.pkl'
            for film_length, name in film_names.items()
        }

    # Each film's values are read from disk once and reused for every subject who watched it
    values_by_film = {}
    targets_list = []
    for subject_id in subject_ids:
        for film_length, subjects in film_subjects.items():
            if subject_id not in subjects:
                continue
            if film_length not in values_by_film:
                # NOTE: unpickling can execute arbitrary code; only load pickle files created by this project
                with open(surprisal_paths[film_length], 'rb') as surprisal_file:
                    values_by_film[film_length] = list(pickle.load(surprisal_file).values())
            targets_list.extend(values_by_film[film_length])

    np.savetxt(output_path, targets_list, delimiter=',')
    return targets_list

In [54]:
# Getting synergy, redundancy and unique information at the times when a word was spoken

def spoken_times(
    film_subjects=None,
    surprisal_paths=None,
    info_dir='/rds/general/user/ab5621/home/Masters-Dissertation/Results/Dynamic_Information',
    output_path='/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal/allinfo_only_spoken_timepoints.csv',
    subject_ids=range(1, 87),
):
    '''
    Extract synergy, redundancy, and unique information at the timepoints where a word was spoken, for all subjects

    For every subject, the columns at the spoken timepoints of their film are taken from the
    synergy, redundancy and unique arrays, the three selections are stacked on top of each other
    (axis 0), and the per-subject stacks are then placed side by side (axis 1).

    Parameters
    ----------
    film_subjects : dict or None
        maps a film-length key (e.g. '5470') to the list of subject ids that watched that film.
        When None, every id in ``subject_ids`` is passed to ``split_subjects_by_films`` and the
        module-level ``all_film_paths_dict`` is used.
    surprisal_paths : dict or None
        maps the same film-length keys to the pickle file whose keys are the spoken timepoints of
        that film. When None, the project's default files are used.
    info_dir : str
        directory holding ``<id>_dynamic_synergy.csv``, ``<id>_dynamic_redundancy.csv`` and
        ``<id>_dynamic_unique.csv`` for every subject
    output_path : str
        CSV file the final array is written to
    subject_ids : iterable of int
        subject ids to include, in output order (defaults to subjects 1 to 86)

    Returns
    -------
    final_list : np.ndarray
        2D array containing concatenated synergy, redundancy, and unique information for all subjects at the timepoints where a word was spoken

    Raises
    ------
    ValueError
        if none of the subjects belongs to a film, so there is nothing to concatenate

    '''
    subject_ids = list(subject_ids)

    if film_subjects is None:
        # Use the shared module-level mapping. Creating a fresh empty dict here would
        # shadow it, leave every film without subjects and produce nothing to concatenate.
        for i in subject_ids:
            split_subjects_by_films(i)
        film_subjects = all_film_paths_dict

    if surprisal_paths is None:
        surprisal_dir = '/rds/general/user/ab5621/home/Masters-Dissertation/movie_subtitles/surprisal'
        film_names = {
            '5470': '500_days_of_summer',
            '6804': 'citizenfour',
            '7715': '12_years_of_slave',
            '6674': 'back_to_the_future',
            '5900': 'little_miss_sunshine',
            '7515': 'the_prestige',
            '8882': 'pulp_fiction',
            '8181': 'the_shawshank_redemption',
            '6739': 'split',
            '6102': 'the_usual_suspects',
        }
        surprisal_paths = {
            film_length: f'{surprisal_dir}/surp_per_timepoint_{name}.pkl'
            for film_length, name in film_names.items()
        }

    # Column indexes of the spoken timepoints, computed once per film
    columns_by_film = {}
    targets_list = []
    for subject_id in subject_ids:
        syn_data = np.array(pd.read_csv(f'{info_dir}/{subject_id}_dynamic_synergy.csv', header=None))
        red_data = np.array(pd.read_csv(f'{info_dir}/{subject_id}_dynamic_redundancy.csv', header=None))
        uni_data = np.array(pd.read_csv(f'{info_dir}/{subject_id}_dynamic_unique.csv', header=None))

        for film_length, subjects in film_subjects.items():
            if subject_id not in subjects:
                continue
            if film_length not in columns_by_film:
                # NOTE: unpickling can execute arbitrary code; only load pickle files created by this project
                with open(surprisal_paths[film_length], 'rb') as surprisal_file:
                    surprisal = pickle.load(surprisal_file)
                # Timepoint t is read from column t - 1
                # (NOTE(review): a timepoint of 0 would wrap around to the last column - confirm it cannot occur)
                columns_by_film[film_length] = [time - 1 for time in surprisal]
            timings_list = columns_by_film[film_length]

            print(syn_data.shape)
            # Slice into new arrays so the subject's full data is never overwritten
            concat_all_info_one_subj = np.concatenate(
                [syn_data[:, timings_list], red_data[:, timings_list], uni_data[:, timings_list]],
                axis=0,
            )
            targets_list.append(concat_all_info_one_subj)

    if not targets_list:
        raise ValueError('no subject matched any film: there is nothing to concatenate')

    final_list = np.concatenate(targets_list, axis=1)
    np.savetxt(output_path, final_list, delimiter=',')
    return final_list