In [None]:
import pandas as pd

# Read the combined feature/label table from the ../EDA directory into `df`.
df = pd.read_csv(filepath_or_buffer="../EDA/final_features_with_labels.csv")

# Report (rows, columns), then preview the first five rows.
print("Shape of dataset:", df.shape)
df.head(n=5)

Shape of dataset: (8, 402)


Unnamed: 0,participant_id,sadness,anger,joy,fear,nervousness,affection,negative_emotion,positive_emotion,confusion,...,embed_376,embed_377,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,phq8_score,phq8_binary
0,301,0.0,0.0,0.000678,0.000678,0.004746,0.000678,0.018983,0.010847,0.00339,...,-0.077552,0.048985,-0.061061,0.039652,0.019754,0.05087,-0.02516,0.033991,3,0
1,303,0.000509,0.0,0.0,0.0,0.00458,0.000509,0.00916,0.011705,0.000509,...,-0.04504,-0.010693,-0.06434,0.095692,-0.01272,0.009505,-0.009843,0.043166,0,0
2,304,0.002026,0.0,0.001013,0.001013,0.00304,0.00304,0.016211,0.013171,0.0,...,-0.001942,0.075825,-0.045645,0.021254,-0.002397,0.064429,-0.025121,0.029481,6,0
3,305,0.001804,0.000301,0.0,0.000301,0.000902,0.001804,0.006915,0.006013,0.000301,...,-0.0227,0.058585,-0.016208,0.00801,0.013218,0.035188,-0.065164,-0.005411,7,0
4,308,0.005624,0.001125,0.001125,0.001125,0.005624,0.00225,0.016873,0.006749,0.001125,...,-0.012394,0.051637,-0.020888,-0.005703,-0.004676,0.019968,-0.066287,-0.020669,22,1


In [3]:
# Print a structural summary of df: index range, column count, dtype breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Columns: 402 entries, participant_id to phq8_binary
dtypes: float64(399), int64(3)
memory usage: 25.3 KB


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the two PHQ-8 target columns.
target_cols = ["phq8_score", "phq8_binary"]
df[target_cols].describe()

Unnamed: 0,phq8_score,phq8_binary
count,8.0,8.0
mean,9.75,0.375
std,8.447316,0.517549
min,0.0,0.0
25%,3.75,0.0
50%,6.5,0.0
75%,16.5,1.0
max,22.0,1.0


In [7]:
# Per-column null counts; keep only the columns that actually contain nulls.
missing_values = df.isnull().sum()
has_missing = missing_values > 0
missing_cols = missing_values.loc[has_missing]
print("Columns with missing values:\n", missing_cols)

# Grand total of null cells, reusing the per-column counts computed above.
print("\nTotal missing values in dataset:", missing_values.sum())

Columns with missing values:
 Series([], dtype: int64)

Total missing values in dataset: 0


In [8]:
# Frequency of each raw PHQ-8 score, ordered by score value.
score_counts = df["phq8_score"].value_counts().sort_index()
print("PHQ8 Score distribution:\n", score_counts)

# Frequency of each binary label, in value_counts' default (most frequent first) order.
label_counts = df["phq8_binary"].value_counts()
print("\nPHQ8 Binary distribution:\n", label_counts)

PHQ8 Score distribution:
 phq8_score
0     1
3     1
4     1
6     1
7     1
15    1
21    1
22    1
Name: count, dtype: int64

PHQ8 Binary distribution:
 phq8_binary
0    5
1    3
Name: count, dtype: int64


In [9]:
# Flag rows that exactly repeat an earlier row (all columns compared), then count them.
duplicate_mask = df.duplicated()
print("Number of duplicate rows:", duplicate_mask.sum())

Number of duplicate rows: 0


In [10]:
# Collect every column whose name carries the "embed_" prefix.
embed_cols = [name for name in df.columns if name.startswith("embed_")]
print("Total embedding columns:", len(embed_cols))

# Tally NaN and +/-infinity cells across the embedding columns.
embed_values = df[embed_cols]
nan_embed = embed_values.isnull().sum().sum()
# |x| == inf matches both +inf and -inf; NaN compares unequal, so it is not counted.
inf_embed = (embed_values.abs() == float("inf")).sum().sum()

print("Missing in embeddings:", nan_embed)
print("Infinite in embeddings:", inf_embed)

Total embedding columns: 384
Missing in embeddings: 0
Infinite in embeddings: 0


In [11]:
from sklearn.preprocessing import StandardScaler

# Standardise each embedding column to zero mean / unit variance.
#
# The scaled values overwrite df[embed_cols] in place, so re-running this cell
# unguarded would refit on already-standardised data: the values stay the same,
# but `scaler` ends up with mean_ ~ 0 and scale_ ~ 1 and can no longer be used
# to transform raw data. A marker in df.attrs (absent again once `df` is
# reloaded from the CSV) turns repeat runs of this cell into a no-op.
#
# NOTE(review): the scaler is fitted on every row of df. If a train/test split
# is made later, fit on the training rows only to avoid leakage -- TODO confirm.
if not df.attrs.get("embeddings_scaled", False):
    scaler = StandardScaler()
    df[embed_cols] = scaler.fit_transform(df[embed_cols])
    df.attrs["embeddings_scaled"] = True

In [12]:
# Preview the first rows again; the embed_* columns now hold the standardised values.
df.head()

Unnamed: 0,participant_id,sadness,anger,joy,fear,nervousness,affection,negative_emotion,positive_emotion,confusion,...,embed_376,embed_377,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,phq8_score,phq8_binary
0,301,0.0,0.0,0.000678,0.000678,0.004746,0.000678,0.018983,0.010847,0.00339,...,-1.635961,0.111204,-0.400292,0.586673,0.021751,0.215121,0.38309,1.096271,3,0
1,303,0.000509,0.0,0.0,0.0,0.00458,0.000509,0.00916,0.011705,0.000509,...,-0.625594,-2.200371,-0.495034,2.332442,-1.213782,-0.866383,0.858006,1.416321,0,0
2,304,0.002026,0.0,0.001013,0.001013,0.00304,0.00304,0.016211,0.013171,0.0,...,0.713727,1.15083,0.045167,0.013509,-0.821047,0.569646,0.384299,0.93895,6,0
3,305,0.001804,0.000301,0.0,0.000301,0.000902,0.001804,0.006915,0.006013,0.000301,...,0.068639,0.483057,0.895783,-0.39906,-0.226936,-0.194882,-0.857282,-0.278265,7,0
4,308,0.005624,0.001125,0.001125,0.001125,0.005624,0.00225,0.016873,0.006749,0.001125,...,0.388898,0.21392,0.760535,-0.826242,-0.907745,-0.592816,-0.892106,-0.810561,22,1
