<a href="https://colab.research.google.com/github/ankurvarma7/interview_outcomes_ml/blob/main/feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the CSV inputs under /content/drive/MyDrive are readable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
# Load the prosodic feature table; each row is keyed by a combined
# 'participant&question' label (e.g. 'P1Q1').
prosodic_df = pd.read_csv('/content/drive/MyDrive/prosodic_features.csv')
prosodic_df.head()

Unnamed: 0,participant&question,duration,energy,min_pitch,max_pitch,mean_pitch,pitch_sd,pitch_abs,pitch_quant,pitchUvsVRatio,...,numVoiceBreaks,PercentBreaks,speakRate,numPause,maxDurPause,avgDurPause,TotDurPause:3,iInterval,MaxRising:3,MaxFalling:3
0,P1Q1,51.952125,0.015331,75.232657,396.635613,127.989222,27.821528,217.628692,119.690367,0.77383,...,136,0.42117,0.000342,55,1.290667,0.494,27.176,138,274.562,257.247
1,P1Q2,38.677312,0.015185,75.165527,397.613041,131.06749,26.452853,195.852246,124.820583,0.717333,...,97,0.38674,0.000444,40,1.866667,0.522,20.875,100,268.613,270.193
2,P1Q3,43.593896,0.01468,71.034761,395.930688,127.739086,26.00633,189.441737,120.457848,1.068376,...,112,0.43099,0.000475,44,2.624,0.505,22.229,114,283.912,203.506
3,P1Q4,23.435813,0.00892,74.938673,248.733738,129.563914,19.334327,103.02464,124.473851,1.121487,...,55,0.46003,0.000907,24,2.933333,0.507,12.171,55,100.655,83.172
4,P1Q5,13.274833,0.003432,93.949854,263.669188,130.178482,26.595483,174.934601,120.27248,1.078493,...,32,0.46576,0.00157,12,2.848,0.628,7.541,32,129.591,117.119


In [None]:
# Derive a lowercase participant ID from the combined key: the leading
# letters+digits run of 'participant&question' (e.g. 'P1Q1' -> 'p1').
prosodic_df['Participant'] = (
    prosodic_df['participant&question']
    .str.extract(r'^([a-zA-Z]+[0-9]+)', expand=False)
    .str.lower()
)
prosodic_df.head()

Unnamed: 0,participant&question,duration,energy,min_pitch,max_pitch,mean_pitch,pitch_sd,pitch_abs,pitch_quant,pitchUvsVRatio,...,PercentBreaks,speakRate,numPause,maxDurPause,avgDurPause,TotDurPause:3,iInterval,MaxRising:3,MaxFalling:3,Participant
0,P1Q1,51.952125,0.015331,75.232657,396.635613,127.989222,27.821528,217.628692,119.690367,0.77383,...,0.42117,0.000342,55,1.290667,0.494,27.176,138,274.562,257.247,p1
1,P1Q2,38.677312,0.015185,75.165527,397.613041,131.06749,26.452853,195.852246,124.820583,0.717333,...,0.38674,0.000444,40,1.866667,0.522,20.875,100,268.613,270.193,p1
2,P1Q3,43.593896,0.01468,71.034761,395.930688,127.739086,26.00633,189.441737,120.457848,1.068376,...,0.43099,0.000475,44,2.624,0.505,22.229,114,283.912,203.506,p1
3,P1Q4,23.435813,0.00892,74.938673,248.733738,129.563914,19.334327,103.02464,124.473851,1.121487,...,0.46003,0.000907,24,2.933333,0.507,12.171,55,100.655,83.172,p1
4,P1Q5,13.274833,0.003432,93.949854,263.669188,130.178482,26.595483,174.934601,120.27248,1.078493,...,0.46576,0.00157,12,2.848,0.628,7.541,32,129.591,117.119,p1


In [None]:
# List the available columns, including the derived 'Participant' key.
print(prosodic_df.columns)

Index(['participant&question', 'duration', 'energy', 'min_pitch', 'max_pitch',
       'mean_pitch', 'pitch_sd', 'pitch_abs', 'pitch_quant', 'pitchUvsVRatio',
       'diffPitchMaxMin', 'diffPitchMaxMean', 'diffPitchMaxMode',
       'intensityMin', 'intensityMax', 'intensityMean', 'intensitySD',
       'intensityQuant', 'diffIntMaxMin', 'diffIntMaxMean', 'diffIntMaxMode',
       'avgVal1', 'avgBand2', 'avgBand3', 'jitter', 'shimmer',
       'numVoiceBreaks', 'PercentBreaks', 'speakRate', 'numPause',
       'maxDurPause', 'avgDurPause', 'TotDurPause:3', 'iInterval',
       'MaxRising:3', 'MaxFalling:3', 'Participant'],
      dtype='object')


In [None]:
def create_interpretable_prosodic_features(df):
    """Collapse per-answer prosodic rows into one feature row per participant.

    Rows of ``df`` are grouped by the 'Participant' column. Most output
    features are the group mean of a single source column; the two
    "steadiness" features are the negated mean of jitter / shimmer, and
    'pause_frequency' is total pauses divided by total duration.

    Parameters:
    df: DataFrame containing 'Participant' plus the raw prosodic columns
        referenced below.

    Returns:
    DataFrame with a 'Participant' column followed by the aggregated
    features, one row per group.
    """
    # Ordered (output feature, aggregation over one participant's rows) pairs.
    feature_spec = (
        ('average_pitch', lambda rows: rows['mean_pitch'].mean()),
        ('pitch_expressiveness', lambda rows: rows['pitch_sd'].mean()),
        ('pitch_range', lambda rows: rows['diffPitchMaxMin'].mean()),
        ('rising_intonation', lambda rows: rows['MaxRising:3'].mean()),
        ('falling_intonation', lambda rows: rows['MaxFalling:3'].mean()),
        ('speech_volume', lambda rows: rows['intensityMean'].mean()),
        ('volume_variability', lambda rows: rows['intensitySD'].mean()),
        ('volume_range', lambda rows: rows['diffIntMaxMin'].mean()),
        # Negated so that a larger value corresponds to less jitter / shimmer.
        ('voice_steadiness', lambda rows: -rows['jitter'].mean()),
        ('volume_steadiness', lambda rows: -rows['shimmer'].mean()),
        ('voice_break_frequency', lambda rows: rows['numVoiceBreaks'].mean()),
        ('speaking_pace', lambda rows: rows['speakRate'].mean()),
        # Ratio of totals rather than a mean of per-answer ratios.
        ('pause_frequency', lambda rows: rows['numPause'].sum() / rows['duration'].sum()),
        ('average_pause_length', lambda rows: rows['avgDurPause'].mean()),
        ('longest_pause', lambda rows: rows['maxDurPause'].mean()),
        ('average_response_length', lambda rows: rows['duration'].mean()),
    )

    records = [
        {'Participant': pid, **{name: aggregate(rows) for name, aggregate in feature_spec}}
        for pid, rows in df.groupby('Participant')
    ]
    return pd.DataFrame(records)

# Aggregate the per-answer rows into one row of interpretable features per participant.
interpretable_prosodic_df = create_interpretable_prosodic_features(prosodic_df)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def create_normalized_prosodic_features(df):
    """Min-max scale every column of ``df`` except 'Participant'.

    Parameters:
    df: DataFrame with a 'Participant' column; all remaining columns are
        passed to ``MinMaxScaler`` and must therefore be numeric.

    Returns:
    A new DataFrame with the scaled feature columns (same names, same row
    index as ``df``) followed by the original 'Participant' column.
    """
    numeric_features = df.drop('Participant', axis=1)
    scaler = MinMaxScaler()
    # Keep df's index on the scaled frame: the 'Participant' assignment below
    # aligns on index labels, so a fresh 0..n-1 index would mis-assign (or
    # NaN-fill) IDs whenever df does not carry a default RangeIndex.
    normalized_df = pd.DataFrame(
        scaler.fit_transform(numeric_features),
        columns=numeric_features.columns,
        index=df.index,
    )
    normalized_df['Participant'] = df['Participant']

    return normalized_df

# Rebind interpretable_prosodic_df to its min-max scaled version; from here on
# the name refers to the normalized features.
interpretable_prosodic_df = create_normalized_prosodic_features(interpretable_prosodic_df)
interpretable_prosodic_df.head()

Unnamed: 0,average_pitch,pitch_expressiveness,pitch_range,rising_intonation,falling_intonation,speech_volume,volume_variability,volume_range,voice_steadiness,volume_steadiness,voice_break_frequency,speaking_pace,pause_frequency,average_pause_length,longest_pause,average_response_length,Participant
0,0.207671,0.07774,0.467589,0.428718,0.358501,0.740083,0.740922,0.541901,0.816426,0.613586,0.15873,0.332031,0.959186,0.106757,0.167705,0.119374,p1
1,0.091465,0.086905,0.529998,0.547341,0.545657,0.78405,0.950291,0.59956,0.949159,0.809223,0.35873,0.067748,0.464906,0.322973,0.196354,0.48303,p10
2,0.614287,0.222887,0.811713,0.22346,0.44646,0.511864,0.526251,0.396885,0.955708,0.825564,0.267937,0.106167,0.807969,0.108333,0.271162,0.256146,p11
3,0.575075,0.285231,0.397166,0.086484,0.161349,0.616909,0.630435,0.685929,0.929086,0.934806,0.177143,0.230553,0.318489,0.611712,0.511785,0.133886,p12
4,0.576137,0.293221,0.430098,0.283635,0.257159,0.137373,0.36521,0.236175,0.841775,0.67378,0.245714,0.180086,0.420242,0.291441,0.218749,0.261353,p13


In [None]:
# Install scikit-learn into the runtime.
# NOTE(review): sklearn.preprocessing is already imported by an earlier cell, so
# this cell presumably belongs before that import (or is redundant) - confirm.
!pip install scikit-learn



In [None]:
# Load the score table; the code below joins it on 'Participant' and reads
# its 'Overall' and 'Excited' columns.
scores_df = pd.read_csv('/content/drive/MyDrive/scores.csv')
scores_df.head()

def get_topk_features(df, k):
  """Return the 'Feature' values of the first ``k`` rows of ``df`` as a list.

  ``df`` is taken in its current row order, so the caller is responsible for
  sorting it beforehand. Fewer than ``k`` rows yields a shorter list.
  """
  return df['Feature'].head(k).tolist()


#Applying mutual info for feature selection
def mutual_info_feature_selection(interpretable_df, scores_df, k, random_state=42):
  """Rank features by mutual information with the 'Overall' and 'Excited' scores.

  Parameters:
  interpretable_df: DataFrame with a 'Participant' column; every other column
      is treated as a candidate feature.
  scores_df: DataFrame with 'Participant', 'Overall' and 'Excited' columns.
      Any further columns are ignored.
  k: Number of top-ranked features to return per target.
  random_state: Seed forwarded to ``mutual_info_regression`` so that the
      ranking is reproducible between runs. Pass ``None`` for an unseeded
      estimate.

  Returns:
  ``(top_k_for_overall, top_k_for_excited)`` - two lists of feature names in
  descending mutual-information order - or ``None`` (after printing a
  message) when ``k`` exceeds the column count of ``interpretable_df``.
  """
  from sklearn.feature_selection import mutual_info_regression

  # The column count includes 'Participant', so k may exceed the number of
  # features by one; in that case every feature is returned.
  if len(interpretable_df.columns) < k:
    print("Number of features is less than k")
    return None

  target_columns = ['Overall', 'Excited']
  # Join only the key and the two targets. Other score columns must not end up
  # in X: they are not features of interpretable_df, and callers index
  # interpretable_df with the returned names.
  merged_data = interpretable_df.merge(
      scores_df[['Participant'] + target_columns], on='Participant', how='inner')
  X = merged_data.drop(['Participant'] + target_columns, axis=1)

  def _top_k_for(target):
    # Score every feature against one target and keep the k best names.
    mi_scores = mutual_info_regression(X, merged_data[target], random_state=random_state)
    ranking = pd.DataFrame({'Feature': X.columns, 'MI_Score': mi_scores})
    return get_topk_features(ranking.sort_values('MI_Score', ascending=False), k)

  top_k_features_performance = _top_k_for('Overall')
  top_k_features_excitement = _top_k_for('Excited')
  return top_k_features_performance, top_k_features_excitement

# Top-10 prosodic features for each target (Overall first, Excited second).
top_features_performance_prosodic, top_features_excitement_prosodic = mutual_info_feature_selection(interpretable_prosodic_df, scores_df, 10)

In [None]:
# Load the per-participant transcript sentiment table (raw transcript text
# columns plus neg/neu/pos/compound scores).
sentiment_analysis_df = pd.read_csv('/content/drive/MyDrive/transcripts_sentiment_analysis.csv')
sentiment_analysis_df.head()

Unnamed: 0,Participant,Transcript,InterviewerTranscript,IntervieweeTranscript,OtherTranscript,overall_neg,overall_neu,overall_pos,overall_compound,interviewer_neg,...,interviewer_pos,interviewer_compound,interviewee_neg,interviewee_neu,interviewee_pos,interviewee_compound,other_neg,other_neu,other_pos,other_compound
0,p1,Interviewer: So how are you doing?|Interviewee...,So how are you doing? Ok well so please te...,Im pretty good. ok uhm so have you looked...,,0.013,0.859,0.128,0.9955,0.055,...,0.243,0.9277,0.015,0.868,0.117,0.9927,0.0,0.0,0.0,0.0
1,p10,Interviewer: So how you doing?|Interviewee: G...,So how you doing? I'm okay. I understand ...,Great how about you? I'm a little [???] by...,[laughter],0.026,0.852,0.122,0.9988,0.059,...,0.156,0.8418,0.026,0.842,0.132,0.9989,0.0,0.0,1.0,0.4939
2,p11,Interviewer: So tell me about yourself. |Inte...,So tell me about yourself. Great okay. C...,Uhh I’m a junior at MIT uhh I’m double maj...,,0.02,0.874,0.105,0.9966,0.029,...,0.236,0.9621,0.018,0.892,0.089,0.9936,0.0,0.0,0.0,0.0
3,p12,Interviewer: So how are you doing today?|Inter...,So how are you doing today? Good. So why d...,I'm good how are you? Ok so I'm a Junior...,(both laugh),0.027,0.842,0.131,0.9978,0.038,...,0.214,0.9654,0.023,0.852,0.125,0.9968,0.0,0.217,0.783,0.5574
4,p13,Interviewer: How are you doing today?|Intervie...,How are you doing today? Good. So why don'...,Good. Ok umm I'm currently a junior at M....,,0.038,0.838,0.124,0.9945,0.05,...,0.272,0.9714,0.051,0.841,0.107,0.9771,0.0,0.0,0.0,0.0


In [None]:
# Drop the raw transcript text so only 'Participant' and the numeric sentiment
# scores remain. errors='ignore' keeps this cell re-runnable: it rebinds the
# same name it reads, so on a second run the columns are already gone.
sentiment_analysis_df = sentiment_analysis_df.drop(
    columns=['Transcript', 'InterviewerTranscript', 'IntervieweeTranscript', 'OtherTranscript'],
    errors='ignore',
)

In [None]:
# Top-5 sentiment features for each target (Overall first, Excited second).
top_features_sentiment_performance, top_features_sentiment_excitement = mutual_info_feature_selection(sentiment_analysis_df, scores_df, 5)

In [None]:
# Show the selected sentiment feature names for the Overall and Excited targets.
print(top_features_sentiment_performance)
print(top_features_sentiment_excitement)

['interviewee_pos', 'interviewer_pos', 'interviewer_compound', 'overall_neu', 'interviewee_neg']
['overall_neg', 'interviewee_compound', 'interviewee_pos', 'interviewer_pos', 'overall_compound']


In [None]:
# Column counts of both feature tables; each count includes the 'Participant' column.
print(len(interpretable_prosodic_df.columns))
print(len(sentiment_analysis_df.columns))

17
17


In [None]:
# Row counts of both feature tables (one row per participant).
print(len(interpretable_prosodic_df))
print(len(sentiment_analysis_df))

138
138


In [None]:
import torch
import os

def save_participant_tensors(dataframe, selected_features, output_dir, target_column=None):
    """
    Write one float32 feature tensor per row of ``dataframe`` to ``output_dir``.

    Each row's ``selected_features`` values are stored, in the given order, as
    ``<Participant>.pt``. When ``target_column`` is truthy, that column's value
    is additionally stored as a one-element float32 tensor named
    ``participant_<Participant>_<target_column>.pt``.

    Parameters:
    dataframe: DataFrame with a 'Participant' column and one row per participant
    selected_features: Iterable of column names to include in the feature tensor
    output_dir: Directory to save tensors (created if missing)
    target_column: Optional column name of a target value to save alongside

    Returns:
    List of the 'Participant' values, in row order.
    """
    os.makedirs(output_dir, exist_ok=True)
    feature_names = list(selected_features)
    participant_ids = dataframe['Participant'].values

    for row_number, pid in enumerate(participant_ids):
        row = dataframe.iloc[row_number]

        # Go through float64 first so rows holding mixed/object values become numeric.
        as_float = row[feature_names].values.astype(np.float64)
        torch.save(
            torch.tensor(as_float, dtype=torch.float32),
            os.path.join(output_dir, f"{pid}.pt"),
        )

        if not target_column:
            continue
        torch.save(
            torch.tensor([row[target_column]], dtype=torch.float32),
            os.path.join(output_dir, f"participant_{pid}_{target_column}.pt"),
        )

    print(f"Saved tensors for {len(participant_ids)} participants to {output_dir}")
    return participant_ids.tolist()

In [None]:
# Rank all features once per modality and target, then save the top-k prefix
# for every k. Ranking once, rather than re-estimating mutual information for
# each k, keeps the top-k sets nested: the k+1 set always extends the k set.
DATA_ROOT = '/content/drive/MyDrive/data'
for modality, feature_df in (('prosodic', interpretable_prosodic_df),
                             ('sentiment', sentiment_analysis_df)):
  # Every column except 'Participant' is a candidate feature, so k stops at
  # this count; a larger k could only duplicate the full feature set.
  n_features = len(feature_df.columns) - 1
  ranked_overall, ranked_excited = mutual_info_feature_selection(feature_df, scores_df, n_features)
  for k in range(1, n_features + 1):
    save_participant_tensors(feature_df, ranked_overall[:k], f'{DATA_ROOT}/{modality}/overall/{k}')
    save_participant_tensors(feature_df, ranked_excited[:k], f'{DATA_ROOT}/{modality}/excited/{k}')



Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/overall/1
Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/excited/1
Saved tensors for 138 participants to /content/drive/MyDrive/data/sentiment/overall/1
Saved tensors for 138 participants to /content/drive/MyDrive/data/sentiment/excited/1
Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/overall/2
Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/excited/2
Saved tensors for 138 participants to /content/drive/MyDrive/data/sentiment/overall/2
Saved tensors for 138 participants to /content/drive/MyDrive/data/sentiment/excited/2
Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/overall/3
Saved tensors for 138 participants to /content/drive/MyDrive/data/prosodic/excited/3
Saved tensors for 138 participants to /content/drive/MyDrive/data/sentiment/overall/3
Saved tensors for 138 participants to /content/drive/MyDrive

In [None]:
import os
import torch
import re

def get_embeddings(participant, feature_type, performance_type, k, root_dir_path:str ='/content/drive/MyDrive'):
  """Load the saved top-``k`` feature tensor for one participant.

  The tensor is read from
  ``<root_dir_path>/data/<feature_type>/<performance_type>/<k>/<participant>.pt``.

  Parameters:
  participant: ID made of letters followed by digits, e.g. 'p1' or 'pp2'.
  feature_type: 'prosodic' or 'sentiment'.
  performance_type: 'overall' or 'excited'.
  k: Positive integer naming the top-k directory.
  root_dir_path: Directory that contains the 'data' folder.

  Returns:
  Whatever ``torch.load`` returns for that file.

  Raises:
  ValueError: if any argument fails the validation described above.
  """
  valid_feature_types = ['prosodic', 'sentiment']
  valid_performance_types = ['overall', 'excited']

  # Check participant format (e.g., p1, pp2, etc.). fullmatch rather than match:
  # the ID is spliced into a file path, so trailing characters such as
  # '/../..' must not slip through.
  if not isinstance(participant, str) or not re.fullmatch(r'[a-zA-Z]+[0-9]+', participant):
      raise ValueError(f"Invalid participant ID '{participant}'. Must be a string like 'p1', 'pp2', etc.")

  if feature_type not in valid_feature_types:
      raise ValueError(f"Invalid feature_type '{feature_type}'. Must be one of {valid_feature_types}.")

  if performance_type not in valid_performance_types:
      raise ValueError(f"Invalid performance_type '{performance_type}'. Must be one of {valid_performance_types}.")

  # bool is a subclass of int; without this exclusion k=True would pass and be
  # formatted into the path as "True".
  if isinstance(k, bool) or not isinstance(k, int) or k <= 0:
      raise ValueError(f"Invalid value for k: {k}. Must be a positive integer.")

  # NOTE: torch.load deserializes the file's contents; only point root_dir_path
  # at data you trust.
  return torch.load(os.path.join(root_dir_path, f'data/{feature_type}/{performance_type}/{k}/{participant}.pt'))

# Smoke check: load one prosodic and one sentiment tensor written by the
# saving loop (relies on those files already existing under the default root).
print(get_embeddings('p1', 'prosodic', 'overall', 10))
print(get_embeddings('pp1', 'sentiment', 'excited', 5))

tensor([0.8164, 0.1068, 0.9592, 0.3585, 0.7401, 0.6136, 0.5419, 0.3320, 0.1587,
        0.4676])
tensor([0.0160, 0.9892, 0.1190, 0.1990, 0.9947])
