# Individual Feature Determination

## Preparation

### Import

In [None]:
import pandas as pd
import os
import statistics
from google.colab import drive
# Mount Google Drive at /content/drive; the input and output paths used
# further down in this notebook all live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Raise pandas' 'display.max_rows' option to 100 for DataFrame output.
pd.set_option('display.max_rows', 100)

## Function Definition

### Prepare DataFrame

In [None]:
def preparation_filter_data(df):
    """Clean a raw transcript and merge back-to-back lines of the same speaker.

    Steps:
      1. Rename the ``Speaker`` column to ``ID``.
      2. Add ``Duration`` = ``End time (s)`` - ``Start time (s)``, truncated
         to whole seconds.
      3. Drop lines whose ``Duration`` is below 1.
      4. Collapse each run of consecutive lines by the same ``ID`` into one
         row (one "turn").

    Args:
        df: Transcript with the columns ``Speaker``, ``Start time``,
            ``End time``, ``Start time (s)``, ``End time (s)``, ``Subtitle``
            and the eight emotion score columns aggregated below.

    Returns:
        A new DataFrame with one row per turn and a fresh 0..n-1 index.
        The input frame is left unmodified.
    """
    # rename() without inplace returns a new frame, so neither the rename
    # nor the Duration column below leaks into the caller's DataFrame.
    df = df.rename(columns={'Speaker': 'ID'})

    # Vectorised equivalent of the row-wise int(end - start); unlike
    # DataFrame.apply(axis=1) it also works on a frame with zero rows.
    df["Duration"] = (df["End time (s)"] - df["Start time (s)"]).astype(int)

    df = df[df["Duration"] >= 1]

    # A new turn starts whenever the speaker differs from the previous row;
    # the running count of those changes labels each turn.
    turn_key = (df["ID"] != df["ID"].shift()).cumsum().rename("turn")

    df = df.groupby(turn_key).agg({
        "ID": "first",
        "Start time": "first",
        "End time": "last",
        "Start time (s)": "first",
        "End time (s)": "last",
        # Skip missing subtitles instead of failing on a non-string value.
        "Subtitle": lambda x: ' '.join(x.dropna().astype(str)),
        "speech_neu": "mean",
        "speech_ang": "mean",
        "speech_hap": "mean",
        "speech_sad": "mean",
        "text_joy": "mean",
        "text_anger": "mean",
        "text_fear": "mean",
        "text_sadness": "mean",
        "Duration": "sum",
    }).reset_index(drop=True)

    return df

### Individual Time Spoken

In [None]:
def calc_individual_time_spoken(df, df_features):
    """Store each speaker's total speaking time in ``indiv_spoken_time``.

    Args:
        df: Turn-level transcript with ``ID``, ``Start time (s)`` and
            ``End time (s)`` columns. Its ``Duration`` column is
            (re)written in place.
        df_features: Feature table with an ``ID`` column; rows whose ``ID``
            appears in ``df`` are updated in place, other rows are untouched.

    NOTE(review): ``Duration`` is recomputed here as end minus start of each
    row, which replaces any ``Duration`` already present on ``df`` (for
    example a summed value from an earlier merge step) — confirm that the
    span, not the sum, is the intended measure.
    """
    # Whole seconds per row, truncated like int(); vectorised so that a
    # frame with zero rows is handled as well.
    df["Duration"] = (df["End time (s)"] - df["Start time (s)"]).astype(int)

    # Total duration per speaker.
    total_by_speaker = df.groupby("ID")["Duration"].sum()

    # Write the total into the matching rows of the feature table.
    for speaker_id, duration in total_by_speaker.items():
        df_features.loc[df_features['ID'] == speaker_id, 'indiv_spoken_time'] = duration

### Average Turn Duration

In [None]:
def avg_turn_duration(df, df_features):
    """Store each speaker's mean turn duration in ``average_turn_duration``.

    Args:
        df: Turn-level transcript with ``ID``, ``Start time (s)`` and
            ``End time (s)`` columns. Its ``Duration`` column is
            (re)written in place.
        df_features: Feature table with an ``ID`` column; rows whose ``ID``
            appears in ``df`` are updated in place, other rows are untouched.
    """
    # Whole seconds per row, truncated like int(); vectorised so that a
    # frame with zero rows is handled as well.
    df["Duration"] = (df["End time (s)"] - df["Start time (s)"]).astype(int)

    # Mean (not sum) of the per-row durations for every speaker.
    mean_by_speaker = df.groupby("ID")["Duration"].mean()

    # Write the mean into the matching rows of the feature table.
    for speaker_id, avg_duration in mean_by_speaker.items():
        df_features.loc[df_features['ID'] == speaker_id, 'average_turn_duration'] = avg_duration

### Max and Avg Time without Speaking

In [None]:
def max_and_avg_time_without_speaking(df, df_features):
    """Store the longest and mean silence between a speaker's own turns.

    For every speaker the gap is measured from the end of one of their turns
    to the start of their *next* turn. A turn that is never followed by
    another turn of the same speaker contributes no gap (the previous
    implementation measured it against the last row of the transcript, even
    when that row belonged to someone else).

    Args:
        df: Turn-level transcript in chronological row order with ``ID``,
            ``Start time (s)`` and ``End time (s)`` columns. Any index is
            accepted; rows are processed by position.
        df_features: Feature table with an ``ID`` column. The columns
            ``max_time_without_speaking`` and ``avg_time_without_speaking``
            are updated in place for the speakers found in ``df``.

    Speakers with no gap at all (a single turn) get 0 for both values; the
    maximum never drops below 0, as before.
    """
    last_end = {}   # speaker -> end time of their most recent turn
    gaps = {}       # speaker -> list of silences between their own turns

    # Single pass in row order: each turn closes the gap opened by the same
    # speaker's previous turn.
    rows = zip(df['ID'], df['Start time (s)'], df['End time (s)'])
    for speaker, start_time, end_time in rows:
        speaker_gaps = gaps.setdefault(speaker, [])
        if speaker in last_end:
            speaker_gaps.append(start_time - last_end[speaker])
        last_end[speaker] = end_time

    for speaker_id, speaker_gaps in gaps.items():
        mask = df_features['ID'] == speaker_id
        # The leading 0 keeps the historical floor of the maximum.
        df_features.loc[mask, 'max_time_without_speaking'] = max([0, *speaker_gaps])
        # statistics.mean raises on an empty list, so guard the no-gap case.
        df_features.loc[mask, 'avg_time_without_speaking'] = (
            statistics.mean(speaker_gaps) if speaker_gaps else 0
        )


### Number of Turns

In [None]:
def calc_num_turns(df,df_features):
    """Store how many rows (turns) of ``df`` belong to each speaker.

    Args:
        df: Transcript with an ``ID`` column, one row per turn.
        df_features: Feature table with an ``ID`` column; ``num_turns`` is
            updated in place for every speaker that occurs in ``df``.
    """
    turns_per_speaker = df['ID'].value_counts()

    # Copy each count into the feature rows of the matching speaker.
    for speaker_id, count in turns_per_speaker.items():
        is_speaker = df_features['ID'] == speaker_id
        df_features.loc[is_speaker, 'num_turns'] = count

### Max and Avg Turns without Speaking

In [None]:
def max_and_avg_turns_without_speaking(df, df_features):
    """Store the longest and mean number of foreign turns between own turns.

    For every speaker, counts how many rows of other speakers lie between
    one of their turns and their *next* turn. A turn that is never followed
    by another turn of the same speaker contributes nothing (the previous
    implementation still recorded a count for it, and always recorded a 0
    for the final row of the transcript).

    Args:
        df: Turn-level transcript in chronological row order with an ``ID``
            column. Any index is accepted; rows are processed by position.
        df_features: Feature table with an ``ID`` column. The columns
            ``max_turns_without_speaking`` and ``avg_turns_without_speaking``
            are updated in place for the speakers found in ``df``.

    Speakers with a single turn get 0 for both values.
    """
    last_position = {}  # speaker -> row position of their most recent turn
    waits = {}          # speaker -> turns by others between their own turns

    for position, speaker in enumerate(df['ID']):
        speaker_waits = waits.setdefault(speaker, [])
        if speaker in last_position:
            # Rows strictly between the previous and the current own turn.
            speaker_waits.append(position - last_position[speaker] - 1)
        last_position[speaker] = position

    for speaker_id, speaker_waits in waits.items():
        mask = df_features['ID'] == speaker_id
        df_features.loc[mask, 'max_turns_without_speaking'] = max(speaker_waits, default=0)
        # statistics.mean raises on an empty list, so guard the no-wait case.
        df_features.loc[mask, 'avg_turns_without_speaking'] = (
            statistics.mean(speaker_waits) if speaker_waits else 0
        )


### Number of Words

In [None]:
def calc_num_words(df, df_features):
    """Store each speaker's total number of spoken words in ``num_words``.

    Words are the whitespace-separated tokens of the ``Subtitle`` column.

    Args:
        df: Transcript with ``ID`` and ``Subtitle`` columns. It is not
            modified (the previous version left a ``word_count`` column
            behind, because its ``drop`` only rebound the local name).
        df_features: Feature table with an ``ID`` column; ``num_words`` is
            updated in place for every speaker that occurs in ``df``.
    """
    # assign() works on a copy, so the helper column never reaches the caller.
    counted = df.assign(word_count=df['Subtitle'].str.split().str.len())

    # Sum of the per-row word counts for every speaker.
    words_by_speaker = counted.groupby('ID')['word_count'].sum()

    for speaker_id, words in words_by_speaker.items():
        df_features.loc[df_features['ID'] == speaker_id, 'num_words'] = words

### Average Words per Turn

In [None]:
def avg_words_turn(df, df_features):
    """Store each speaker's mean number of words per row in ``avg_words_turn``.

    Words are the whitespace-separated tokens of the ``Subtitle`` column.

    Args:
        df: Transcript with ``ID`` and ``Subtitle`` columns. It is not
            modified (the previous version left a ``word_count`` column
            behind, because its ``drop`` only rebound the local name).
        df_features: Feature table with an ``ID`` column; ``avg_words_turn``
            is updated in place for every speaker that occurs in ``df``.
    """
    # assign() works on a copy, so the helper column never reaches the caller.
    counted = df.assign(word_count=df['Subtitle'].str.split().str.len())

    # Mean (not sum) of the per-row word counts for every speaker.
    avg_by_speaker = counted.groupby('ID')['word_count'].mean()

    for speaker_id, words in avg_by_speaker.items():
        df_features.loc[df_features['ID'] == speaker_id, 'avg_words_turn'] = words

### Max Words per Turn

In [None]:
def max_words_turn(df, df_features):
    """Store each speaker's largest per-row word count in ``max_words_turn``.

    Words are the whitespace-separated tokens of the ``Subtitle`` column.

    Args:
        df: Transcript with ``ID`` and ``Subtitle`` columns. It is not
            modified (the previous version left a ``word_count`` column
            behind, because its ``drop`` only rebound the local name).
        df_features: Feature table with an ``ID`` column; ``max_words_turn``
            is updated in place for every speaker that occurs in ``df``.
    """
    # assign() works on a copy, so the helper column never reaches the caller.
    counted = df.assign(word_count=df['Subtitle'].str.split().str.len())

    # Largest per-row word count for every speaker.
    max_by_speaker = counted.groupby('ID')['word_count'].max()

    for speaker_id, words in max_by_speaker.items():
        df_features.loc[df_features['ID'] == speaker_id, 'max_words_turn'] = words

### Calculate Average of 1 Emotion

In [None]:
def calc_avg_one_emotion(df, df_features, col):
    """Store the per-speaker mean of one column of ``df`` in ``df_features``.

    Args:
        df: Transcript with an ``ID`` column and the column named ``col``.
        df_features: Feature table with an ``ID`` column; its column ``col``
            is updated in place for every speaker that occurs in ``df``.
        col: Name of the column to average; also the name of the target
            column in ``df_features``.
    """
    mean_by_speaker = df.groupby('ID')[col].mean()

    # Copy each speaker's mean into their rows of the feature table.
    for speaker_id, emotion in mean_by_speaker.items():
        is_speaker = df_features['ID'] == speaker_id
        df_features.loc[is_speaker, col] = emotion

### Calculate Average of All Emotions

In [None]:
def calc_avg_emotions(df, df_features,start_col, end_col):
    """Average every column of ``df`` from ``start_col`` through ``end_col``.

    The columns are taken by position in ``df.columns``, both ends included,
    and each one is handed to ``calc_avg_one_emotion``.

    Args:
        df: Transcript holding the emotion columns.
        df_features: Feature table that receives the per-speaker means.
        start_col: Name of the first column of the range.
        end_col: Name of the last column of the range.
    """
    first_pos = df.columns.get_loc(start_col)
    last_pos = df.columns.get_loc(end_col)
    emotion_cols = df.columns[first_pos:last_pos + 1].tolist()

    # One per-speaker average per emotion column.
    for emotion_col in emotion_cols:
        calc_avg_one_emotion(df, df_features, emotion_col)

### Calculate All Individual Features for a Single Group

In [None]:
def calc_individual_features_dataframes(df_features,meeting_file):
    """Compute all individual features for one meeting transcript.

    Reads ``meeting_file`` as CSV, prepares it with
    ``preparation_filter_data`` and then runs every feature function on it;
    each of them writes its result into ``df_features``.

    Args:
        df_features: Feature table that is updated in place.
        meeting_file: Path of the meeting's CSV transcript.

    Returns:
        The same ``df_features`` object that was passed in.
    """
    # Load the transcript and bring it into turn-level form.
    meeting_df = preparation_filter_data(pd.read_csv(meeting_file))

    # Feature functions that share the (transcript, features) signature,
    # listed in the order in which they are applied.
    feature_steps = (
        calc_individual_time_spoken,
        avg_turn_duration,
        max_and_avg_time_without_speaking,
        calc_num_turns,
        max_and_avg_turns_without_speaking,
        calc_num_words,
        avg_words_turn,
        max_words_turn,
    )
    for step in feature_steps:
        step(meeting_df, df_features)

    # The emotion averages additionally need the column range to cover.
    calc_avg_emotions(meeting_df, df_features, 'speech_neu', 'text_sadness')

    return df_features

### Determine Features for Each Group

In [None]:
def determine_features_foreach_group(directory_in, indiv_features_blank_file, indiv_features_file_out):
    """Compute the individual features for every meeting CSV in a directory.

    Args:
        directory_in: Directory that is scanned (non-recursively) for files
            whose name ends in ``.csv``.
        indiv_features_blank_file: Excel file read as the initial feature
            table.
        indiv_features_file_out: Path the final feature table is written to.

    NOTE(review): the result is always written with ``to_csv`` (UTF-8 with
    BOM), whatever the extension of ``indiv_features_file_out``; the call in
    this notebook passes a path ending in ``.xlsx``. Confirm which format the
    consumers of that file expect before changing either side.
    """
    df_features = pd.read_excel(indiv_features_blank_file)

    # Collect the regular files of the directory that carry a .csv suffix.
    csv_files = []
    for entry in os.listdir(directory_in):
        if entry.endswith('.csv') and os.path.isfile(os.path.join(directory_in, entry)):
            csv_files.append(entry)

    # Feed one meeting after the other into the shared feature table.
    total = len(csv_files)
    for number, csv_file in enumerate(csv_files, start=1):
        meeting_file = os.path.join(directory_in, csv_file)
        df_features = calc_individual_features_dataframes(df_features, meeting_file)
        print(f"Processed file {number}/{total}: {csv_file}")

    df_features.to_csv(indiv_features_file_out, encoding='utf-8-sig', index=False)


## Use of Function

In [None]:
# Excel workbook that is read as the initial feature table.
indiv_features_blank_file = r'/content/drive/MyDrive/Projects/tps/data/12. features/2_individual_features_byhand_values_filled.xlsx'
# Destination of the completed feature table.
# NOTE(review): determine_features_foreach_group writes this file with
# DataFrame.to_csv, so the content is CSV text despite the .xlsx extension —
# confirm whether the name or the writer should change.
indiv_features_file_out = r'/content/drive/MyDrive/Projects/tps/data/12. features/3_individual_features_final.xlsx'
# Directory that holds one speech/text CSV per meeting.
directory_in = r'/content/drive/MyDrive/Projects/tps/data/11. speech_text/old (may9)'

In [None]:
# Process every meeting CSV in directory_in and write the combined feature table.
determine_features_foreach_group(directory_in, indiv_features_blank_file, indiv_features_file_out)

Processed file 1/12: 6_speech_text.csv
Processed file 2/12: 11_speech_text.csv
Processed file 3/12: 8_speech_text.csv
Processed file 4/12: 10_speech_text.csv
Processed file 5/12: 2_speech_text.csv
Processed file 6/12: 4_speech_text.csv
Processed file 7/12: 9_speech_text.csv
Processed file 8/12: 1_speech_text.csv
Processed file 9/12: 5_speech_text.csv
Processed file 10/12: 7_speech_text.csv
Processed file 11/12: 12_speech_text.csv
Processed file 12/12: 3_speech_text.csv
