In [None]:
import re
import math
import datetime
import csv

In [None]:
def parse_file_content(filename):
    """Parse an SRT-style transcript into a list of speaker segments.

    The file content is split into blocks on blank lines. A block is kept
    only if it has at least three lines and its second line contains exactly
    two timestamps of the form ``H:MM:SS.fff``. The third line is expected
    to look like ``SPEAKER: utterance``; only that line is used as text, any
    further lines in the block are ignored.

    Args:
        filename: Path of the transcript file to read.

    Returns:
        A list of ``(start_time, end_time, speaker, text)`` tuples, all
        strings, in file order. ``text`` is everything after the first colon
        of the third line (not stripped). When that line has no colon the
        speaker is ``''`` and the whole line is returned as ``text``.
    """
    with open(filename, 'r') as file:
        content = file.read()

    segments = []
    for segment in content.split('\n\n'):
        lines = segment.split('\n')
        if len(lines) < 3:
            continue
        # Extract the start and end time from the timing line.
        times = re.findall(r'\d+:\d\d:\d\d\.\d+', lines[1])
        if len(times) != 2:
            continue
        # Split on the FIRST colon only so that colons inside the utterance
        # are preserved, and a missing colon does not raise.
        speaker, colon, text = lines[2].partition(':')
        if not colon:
            speaker, text = '', lines[2]
        segments.append((times[0], times[1], speaker, text))
    return segments


def parse_timestamp(timestamp_str):
    """Convert an ``H:MM:SS.fff`` timestamp string into a ``datetime``.

    Surrounding whitespace is ignored and every ``.`` is swapped for ``,``
    before parsing, so both ``0:00:01.500`` and ``0:00:01,500`` are accepted.
    Only the time-of-day fields are parsed; the date part of the result is
    whatever ``strptime`` fills in by default.
    """
    trimmed = timestamp_str.strip()
    # The format string below expects a comma before the fractional seconds.
    comma_form = ','.join(trimmed.split('.'))
    return datetime.datetime.strptime(comma_form, '%H:%M:%S,%f')

def calculate_features(segments):
    """Compute pause, gap and turn-taking features for one transcript.

    Silences between consecutive segments are classified by length. When the
    speaker changes, a silence of at least 1.5 s is counted as ``LA`` and one
    of at least 0.5 s as ``GA`` (further split by who spoke before the
    silence). When the same speaker continues, the silence is counted as
    that speaker's long pause (>= 1.5 s) or short pause (>= 0.5 s).

    Args:
        segments: Non-empty sequence of ``(start, end, speaker, text)``
            tuples whose timestamps are accepted by ``parse_timestamp``.
            Per-speaker statistics are kept for the labels ``"PAT"`` and
            ``"INT"``; other labels only influence turn switches and gaps.

    Returns:
        A dict mapping feature names (``'LA'``, ``'Dur_LA'``, ...,
        ``'INT_TL'``) to numbers. Ratios whose denominator is zero are
        reported as ``0``.

    Raises:
        IndexError: If ``segments`` is empty.
    """
    short_silence = 0.5  # seconds; lower bound of a short pause / gap
    long_silence = 1.5   # seconds; lower bound of a long pause / lapse
    zero = datetime.timedelta()

    def ratio(numerator, denominator):
        """Return numerator / denominator, or 0 for a zero denominator."""
        return numerator / denominator if denominator else 0

    # Per-speaker accumulators; both roles are tracked the same way.
    stats = {
        role: {
            'words': 0, 'duration': zero,
            'sp_count': 0, 'sp_duration': zero,
            'lp_count': 0, 'lp_duration': zero,
            'turn_count': 0, 'turn_words': 0,
        }
        for role in ("PAT", "INT")
    }
    turn_switches = 0
    la_count = ga_count = ga_pi_count = ga_ip_count = 0
    total_la_duration = total_ga_duration = zero
    total_ga_pi_duration = total_ga_ip_duration = zero

    # Seeded from the first segment itself, so the first iteration sees a
    # non-positive "silence" and is treated as a same-speaker continuation.
    last_end_time = parse_timestamp(segments[0][1])
    last_speaker = segments[0][2]

    for start_str, end_str, speaker, text in segments:
        start_time = parse_timestamp(start_str)
        end_time = parse_timestamp(end_str)
        words = len(text.split())
        silence_duration = start_time - last_end_time
        silence_seconds = silence_duration.total_seconds()

        role = stats.get(speaker)
        if role is not None:
            # Speaking time is end - start (the reverse order would make
            # every duration negative).
            role['duration'] += end_time - start_time
            role['words'] += words

        if last_speaker != speaker:
            turn_switches += 1
            if silence_seconds >= long_silence:
                la_count += 1
                total_la_duration += silence_duration
            elif silence_seconds >= short_silence:
                ga_count += 1
                total_ga_duration += silence_duration
                if last_speaker == "PAT":
                    ga_pi_count += 1
                    total_ga_pi_duration += silence_duration
                else:
                    ga_ip_count += 1
                    total_ga_ip_duration += silence_duration
        elif role is not None:
            # Same speaker continues: within-speaker pause, using the same
            # thresholds for patient and interviewer.
            role['turn_count'] += 1
            role['turn_words'] += words
            if silence_seconds >= long_silence:
                role['lp_count'] += 1
                role['lp_duration'] += silence_duration
            elif silence_seconds >= short_silence:
                role['sp_count'] += 1
                role['sp_duration'] += silence_duration

        last_end_time = end_time
        last_speaker = speaker

    # Normalize lapse/gap durations by the total elapsed time.
    total_duration_seconds = (
        last_end_time - parse_timestamp(segments[0][0])
    ).total_seconds()
    normalized_la_duration = ratio(total_la_duration.total_seconds(), total_duration_seconds)
    normalized_ga_duration = ratio(total_ga_duration.total_seconds(), total_duration_seconds)

    # Turn switches per minute.
    turn_switches_per_minute = ratio(turn_switches, total_duration_seconds / 60)

    # Patient features.
    pat = stats["PAT"]
    pat_seconds = pat['duration'].total_seconds()
    pat_sp_seconds = pat['sp_duration'].total_seconds()
    pat_lp_seconds = pat['lp_duration'].total_seconds()
    pat_sp_rate = ratio(pat['sp_count'], pat['words'])
    pat_lp_rate = ratio(pat['lp_count'], pat['words'])
    normalized_pat_sp_duration = ratio(pat_sp_seconds, pat_seconds)
    normalized_pat_lp_duration = ratio(pat_lp_seconds, pat_seconds)
    gap_pi_count = ratio(ga_pi_count, turn_switches)
    # NOTE(review): this divides a duration by a gap *proportion*, unlike
    # the other Dur_* features -- confirm the intended normalization.
    normalized_ga_pi_duration = ratio(total_ga_pi_duration.total_seconds(), gap_pi_count)
    # Words per pause, using the raw pause counts (not the per-word rates).
    standardized_pause_rate = ratio(pat['words'], pat['sp_count'] + pat['lp_count'])
    standardized_phonation_time = ratio(pat['words'], pat_seconds)

    # NOTE(review): `phonation_rate` holds speaking time plus pauses, and
    # the asin input is words / that time; values above 1 fall back to 0.
    # Confirm this is the intended definition of the TPR feature.
    phonation_rate = pat_seconds + pat_sp_seconds + pat_lp_seconds
    transformed_phonation_rate = 0
    if phonation_rate != 0:
        words_per_second = pat['words'] / phonation_rate
        if words_per_second >= 0:  # sqrt needs a non-negative input
            sqrt_ratio = math.sqrt(words_per_second)
            if sqrt_ratio <= 1:  # asin is only defined on [-1, 1]
                transformed_phonation_rate = math.asin(sqrt_ratio)

    # NOTE(review): computed as total time / patient time -- confirm the
    # direction of this ratio.
    floor_control_ratio = ratio(total_duration_seconds, pat_seconds)
    pat_turn_length = ratio(pat['turn_words'], pat['turn_count'])

    # Interviewer features.
    interviewer = stats["INT"]
    int_seconds = interviewer['duration'].total_seconds()
    int_sp_rate = ratio(interviewer['sp_count'], interviewer['words'])
    int_lp_rate = ratio(interviewer['lp_count'], interviewer['words'])
    normalized_int_sp_duration = ratio(interviewer['sp_duration'].total_seconds(), int_seconds)
    normalized_int_lp_duration = ratio(interviewer['lp_duration'].total_seconds(), int_seconds)
    gap_ip_count = ratio(ga_ip_count, turn_switches)
    normalized_ga_ip_duration = ratio(total_ga_ip_duration.total_seconds(), gap_ip_count)
    int_turn_length = ratio(interviewer['turn_words'], interviewer['turn_count'])

    return {
        'LA': la_count,
        'Dur_LA': normalized_la_duration,
        'GA': ga_count,
        'Dur_GA': normalized_ga_duration,
        'Turn_switches_per_Minute': turn_switches_per_minute,
        'PAT_SP': pat_sp_rate,
        'PAT_LP': pat_lp_rate,
        'Dur_PAT_SP': normalized_pat_sp_duration,
        'Dur_PAT_LP': normalized_pat_lp_duration,
        'GA_PI': ga_pi_count,
        'Dur_GA_PI': normalized_ga_pi_duration,
        'SPR': standardized_pause_rate,
        'SPT': standardized_phonation_time,
        'TPR': transformed_phonation_rate,
        'FCR': floor_control_ratio,
        'PAT_TL': pat_turn_length,
        # Interviewer features
        'INT_SP': int_sp_rate,
        'INT_LP': int_lp_rate,
        'Dur_INT_SP': normalized_int_sp_duration,
        'Dur_INT_LP': normalized_int_lp_duration,
        'GA_IP': ga_ip_count,
        'Dur_GA_IP': normalized_ga_ip_duration,
        'INT_TL': int_turn_length,
    }

def process_srt_to_csv(srt_path, csv_path):
    """Write the features of a single transcript to a one-row CSV file.

    The transcript at ``srt_path`` is parsed with ``parse_file_content`` and
    summarised with ``calculate_features``. ``csv_path`` is then overwritten
    with a header line plus one data row holding the features and an
    ``Audio_File`` column.
    """
    features = calculate_features(parse_file_content(srt_path))

    # Label the row with the last '/'-separated path component, with every
    # '.srt' occurrence removed.
    audio_name = srt_path.rsplit('/', 1)[-1].replace('.srt', '')

    # Feature columns first, audio file name last.
    columns = [*features, 'Audio_File']
    row = {**features, 'Audio_File': audio_name}

    with open(csv_path, 'w', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        writer.writerow(row)

# **Generate CSV File**

In [None]:
# os and csv are used by the CSV-generation cells below.
import os
import csv
# Google Colab helper used to mount Google Drive into the runtime.
from google.colab import drive
# Mount Drive so that the /content/drive/MyDrive/... paths used below resolve.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Location of the aggregate feature table inside the mounted Drive folder.
directory_path = '/content/drive/MyDrive/FYPRepository'
csv_path = os.path.join(directory_path, 'all_interactional_features.csv')

# Create an empty placeholder file if the table does not exist yet;
# an existing file is left untouched.
if not os.path.isfile(csv_path):
    open(csv_path, 'w').close()

In [None]:
# Walk every sub-folder of the transcript directory, compute the features
# of each .srt file and append one row per transcript to the aggregate CSV.
output_dir = "/content/drive/MyDrive/FYPRepository/Transcripts"
csv_path = '/content/drive/MyDrive/FYPRepository/all_interactional_features.csv'
fieldnames = None  # Set from the first feature dictionary that is produced

# The file is opened in append mode, so a header must only be written while
# it is still empty; otherwise re-running this cell would insert a second
# header line in the middle of the data.
needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

# Open the CSV file once, outside the loops
with open(csv_path, 'a', newline='') as csvfile:
    csvwriter = None  # Created lazily once the column names are known

    for subfolder in os.listdir(output_dir):
        subfolder_path = os.path.join(output_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        for filename in os.listdir(subfolder_path):
            print(subfolder, filename)
            if not filename.endswith(".srt"):
                continue
            srt_path = os.path.join(subfolder_path, filename)
            segments = parse_file_content(srt_path)
            if not segments:
                # Nothing parseable in this transcript; skip it.
                continue
            features = calculate_features(segments)
            # Strip only the trailing extension from the file name.
            audio_name = os.path.splitext(filename)[0]
            if csvwriter is None:
                # Use the keys of the first features dictionary as columns
                fieldnames = list(features.keys()) + ['Audio_File']
                csvwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if needs_header:
                    csvwriter.writeheader()
            # Add the audio file name and write the row
            features['Audio_File'] = audio_name
            csvwriter.writerow(features)


English S154.srt
English S156.srt
English S153.srt
English S151.srt
English S150.srt
English S149.srt
English S144.srt
English S139.srt
English S138.srt
English S142.srt
English S143.srt
English S148.srt
English S141.srt
English S145.srt
English S126.srt
English S128.srt
English S132.srt
English S125.srt
English S129.srt
English S136.srt
English S127.srt
English S130.srt
English S137.srt
English S135.srt
English S111.srt
English S110.srt
English S114.srt
English S118.srt
English S108.srt
English S107.srt
English S124.srt
English S122.srt
English S104.srt
English S116.srt
English S093.srt
English S096.srt
English S092.srt
English S094.srt
English S095.srt
English S100.srt
English S103.srt
English S090.srt
English S101.srt
English S097.srt
English S086.srt
English S087.srt
English S079.srt
English S084.srt
English S083.srt
English S081.srt
English S140.srt
English S082.srt
English S089.srt
English S080.srt
English S073.srt
English S072.srt
English S076.srt
English S077.srt
English S058.s