In [2]:
import pandas as pd

# Load the full feature set
df = pd.read_csv("../EDA/final_features_with_labels.csv")

print("Shape of dataset:", df.shape)
df.head()

Shape of dataset: (78, 402)


Unnamed: 0,participant_id,sadness,anger,joy,fear,nervousness,affection,negative_emotion,positive_emotion,confusion,...,embed_376,embed_377,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,phq8_score,phq8_binary
0,301_TRANSCRIPT,0.0,0.0,0.000678,0.000678,0.004746,0.000678,0.018983,0.010847,0.00339,...,-0.077552,0.048985,-0.061061,0.039652,0.019754,0.05087,-0.02516,0.033991,3,0
1,302_TRANSCRIPT,0.001629,0.0,0.0,0.0,0.0,0.006515,0.011401,0.013029,0.003257,...,-0.052673,0.065626,0.007221,-0.016275,0.060827,0.122291,-0.075341,-0.024642,4,0
2,303_TRANSCRIPT,0.000509,0.0,0.0,0.0,0.00458,0.000509,0.00916,0.011705,0.000509,...,-0.04504,-0.010693,-0.06434,0.095692,-0.01272,0.009505,-0.009843,0.043166,0,0
3,304_TRANSCRIPT,0.002026,0.0,0.001013,0.001013,0.00304,0.00304,0.016211,0.013171,0.0,...,-0.001942,0.075825,-0.045645,0.021254,-0.002397,0.064429,-0.025121,0.029481,6,0
4,305_TRANSCRIPT,0.001804,0.000301,0.0,0.000301,0.000902,0.001804,0.006915,0.006013,0.000301,...,-0.0227,0.058585,-0.016208,0.00801,0.013218,0.035188,-0.065164,-0.005411,7,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Columns: 402 entries, participant_id to phq8_binary
dtypes: float64(399), int64(2), object(1)
memory usage: 245.1+ KB


In [4]:
df[["phq8_score", "phq8_binary"]].describe()

Unnamed: 0,phq8_score,phq8_binary
count,78.0,78.0
mean,9.294872,0.512821
std,5.926189,0.503071
min,0.0,0.0
25%,4.0,0.0
50%,10.0,1.0
75%,13.0,1.0
max,23.0,1.0


In [5]:
# Count missing values per column
missing_values = df.isnull().sum()
missing_cols = missing_values[missing_values > 0]
print("Columns with missing values:\n", missing_cols)

# Total missing values
print("\nTotal missing values in dataset:", df.isnull().sum().sum())

Columns with missing values:
 Series([], dtype: int64)

Total missing values in dataset: 0


In [6]:
# Check distribution of target (binary and score)
print("PHQ8 Score distribution:\n", df["phq8_score"].value_counts().sort_index())
print("\nPHQ8 Binary distribution:\n", df["phq8_binary"].value_counts())

PHQ8 Score distribution:
 phq8_score
0     4
1     4
2     6
3     3
4     4
5     4
6     2
7     3
8     2
9     6
10    6
11    8
12    4
13    4
14    1
15    5
16    4
18    3
20    2
21    1
22    1
23    1
Name: count, dtype: int64

PHQ8 Binary distribution:
 phq8_binary
1    40
0    38
Name: count, dtype: int64


In [7]:
# Check for duplicate rows
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 4


In [8]:
# Check that embedding columns exist and have valid values
embed_cols = [col for col in df.columns if col.startswith("embed_")]
print("Total embedding columns:", len(embed_cols))

# Check for any NaNs or infinite values in embeddings
nan_embed = df[embed_cols].isnull().sum().sum()
inf_embed = ((df[embed_cols] == float("inf")) | (df[embed_cols] == float("-inf"))).sum().sum()

print("Missing in embeddings:", nan_embed)
print("Infinite in embeddings:", inf_embed)

Total embedding columns: 384
Missing in embeddings: 0
Infinite in embeddings: 0


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[embed_cols] = scaler.fit_transform(df[embed_cols])

In [10]:
df.head()

Unnamed: 0,participant_id,sadness,anger,joy,fear,nervousness,affection,negative_emotion,positive_emotion,confusion,...,embed_376,embed_377,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,phq8_score,phq8_binary
0,301_TRANSCRIPT,0.0,0.0,0.000678,0.000678,0.004746,0.000678,0.018983,0.010847,0.00339,...,-1.481519,-0.478516,-1.117195,0.581946,-0.527242,0.081209,1.208887,1.517671,3,0
1,302_TRANSCRIPT,0.001629,0.0,0.0,0.0,0.0,0.006515,0.011401,0.013029,0.003257,...,-0.722066,-0.020477,0.557991,-0.939162,0.676664,1.890535,-0.103128,-0.653181,4,0
2,303_TRANSCRIPT,0.000509,0.0,0.0,0.0,0.00458,0.000509,0.00916,0.011705,0.000509,...,-0.489043,-2.121102,-1.197633,2.106124,-1.47911,-0.966672,1.609357,1.857349,0,0
3,304_TRANSCRIPT,0.002026,0.0,0.001013,0.001013,0.00304,0.00304,0.016211,0.013171,0.0,...,0.826564,0.260233,-0.738992,0.081534,-1.176542,0.424712,1.209907,1.350702,6,0
4,305_TRANSCRIPT,0.001804,0.000301,0.0,0.000301,0.000902,0.001804,0.006915,0.006013,0.000301,...,0.192898,-0.214281,-0.016802,-0.278667,-0.718833,-0.316047,0.162953,0.058836,7,0


In [11]:
df.columns

Index(['participant_id', 'sadness', 'anger', 'joy', 'fear', 'nervousness',
       'affection', 'negative_emotion', 'positive_emotion', 'confusion',
       ...
       'embed_376', 'embed_377', 'embed_378', 'embed_379', 'embed_380',
       'embed_381', 'embed_382', 'embed_383', 'phq8_score', 'phq8_binary'],
      dtype='object', length=402)