## Connect to drive

In [None]:
from google.colab import drive

# Attach Google Drive at the path that the file locations used later in this
# notebook are rooted under.
drive.mount(mountpoint='/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import packages

In [None]:
import os
import numpy as np
import pandas as pd
import csv

## Global attributes

In [None]:
# Input tables on the mounted Drive.
# NOTE: the "patinets" spelling is kept because the data-loading cell further
# down reads this exact variable name.
patinets_dataroot = "/content/drive/MyDrive/term project/patient_info.csv"
txt_training_dataroot = "/content/drive/MyDrive/term project/NPL result.csv"

# Load the text-feature table and preview its first five rows.
txt_df = pd.read_csv(txt_training_dataroot)
txt_df.head()

Unnamed: 0,name,words_number,SPACE,ADV,VERB,ADP,DET,NOUN,ADJ,PUNCT,...,most_frequent,noun_chunk,person_singular_verbs,misspell,time_spec,spec,sentence,neg_word,contnetn,function
0,Baycrest11633,470.0,1.0,34.0,63.0,42.0,49.0,90.0,10.0,18.0,...,34.0,131.0,11.0,0.0,0.0,9.0,18.0,9.0,207.0,219.0
1,Baycrest11634,2636.0,1.0,200.0,393.0,242.0,243.0,391.0,91.0,34.0,...,194.0,683.0,48.0,9.0,11.0,49.0,13.0,49.0,1157.0,1281.0
2,Baycrest11976,1224.0,1.0,91.0,157.0,95.0,132.0,167.0,37.0,120.0,...,116.0,299.0,15.0,6.0,4.0,21.0,53.0,21.0,484.0,549.0
3,Baycrest12257,2224.0,1.0,181.0,268.0,150.0,138.0,269.0,42.0,368.0,...,310.0,540.0,52.0,20.0,8.0,56.0,50.0,56.0,822.0,879.0
4,Baycrest12813,2602.0,1.0,163.0,354.0,163.0,202.0,316.0,85.0,404.0,...,204.0,655.0,62.0,14.0,3.0,34.0,211.0,34.0,988.0,1064.0


## MFCC data extraction

In [None]:
class data_process:
    """Collect summary statistics of MFCC tables, one entry per table.

    Every call to ``cal_mfcc_features`` appends exactly one item to each of
    the ``*_features`` lists, so index ``k`` in every list refers to the
    ``k``-th table that was processed.
    """

    def __init__(self):
        # Initialised empty; no method of this class writes to it.
        self.labels = []
        # One list per statistic, filled by cal_mfcc_features.
        self.mean_features = []
        self.std_features = []
        self.var_features = []
        self.min_features = []
        self.max_features = []

    def cal_mfcc_features(self, file_name, df):
        """Append the column-wise mean/std/var/min/max of ``df`` to the stores.

        Args:
            file_name: Identifier of the table. Accepted but not used here.
            df: Table to summarise. Each reduction is taken with ``axis=0``,
                i.e. over the rows, giving one value per column.
        """
        for stat in ("mean", "std", "var", "min", "max"):
            per_column = getattr(df, stat)(axis=0)
            getattr(self, f"{stat}_features").append(per_column)


## Read in data files


In [None]:
patient_info = pd.read_csv(patinets_dataroot)

dp = data_process()
print(patient_info.shape[0])

# Folder holding one MFCC table per recording. File names are taken from the
# first column of patient_info.
mfcc_dir = "/content/drive/MyDrive/term project/MFCC_output"
n_mfcc = 12  # number of columns every MFCC table is expected to have

# Column labels per statistic: "<stat>_0" ... "<stat>_11".
mean_names = [f"mean_{k}" for k in range(n_mfcc)]
std_names = [f"std_{k}" for k in range(n_mfcc)]
var_names = [f"var_{k}" for k in range(n_mfcc)]
min_names = [f"min_{k}" for k in range(n_mfcc)]
max_names = [f"max_{k}" for k in range(n_mfcc)]

# Summarise each recording's MFCC table. audio_names and the dp.*_features
# lists grow in lockstep, so row k of every frame built below belongs to
# audio_names[k].
audio_names = []
for recording in patient_info.iloc[:, 0]:
    aud_df = pd.read_csv(f"{mfcc_dir}/{recording}.csv", header=None)
    audio_names.append(recording)
    dp.cal_mfcc_features(recording, aud_df)


def _stat_frame(features, column_names, names):
    """Build one row per recording: the statistic columns plus a ``name`` column."""
    frame = pd.DataFrame(features)
    # Raises on a length mismatch, i.e. when a table does not have n_mfcc columns.
    frame.columns = column_names
    frame["name"] = names
    return frame


# mean_df carries "name" as its first column; the other frames keep it last.
mean_df = _stat_frame(dp.mean_features, mean_names, audio_names).reindex(["name"] + mean_names, axis=1)
std_df = _stat_frame(dp.std_features, std_names, audio_names)
var_df = _stat_frame(dp.var_features, var_names, audio_names)
min_df = _stat_frame(dp.min_features, min_names, audio_names)
max_df = _stat_frame(dp.max_features, max_names, audio_names)

# The five frames are already row-aligned, so place them side by side by
# position. Joining them on "name" instead would multiply rows whenever a
# name occurs more than once in patient_info.
mfcc_df = pd.concat(
    [mean_df] + [frame.drop(columns="name") for frame in (std_df, var_df, min_df, max_df)],
    axis=1,
)
assert len(mfcc_df) == len(audio_names), "expected exactly one feature row per recording"

# Preview only; the full table is available as mfcc_df.
mfcc_df.head(10)


44


Unnamed: 0,name,mean_0,mean_1,mean_2,mean_3,mean_4,mean_5,mean_6,mean_7,mean_8,...,max_2,max_3,max_4,max_5,max_6,max_7,max_8,max_9,max_10,max_11
0,Baycrest11633,8.944184,-0.850197,0.948049,-1.998355,0.405676,-1.704454,0.908588,-2.11187,-0.609051,...,14.605125,6.813264,7.626088,4.219126,7.554993,3.555373,3.736752,4.596072,5.766482,4.197093
1,Baycrest11634,5.381672,-0.987148,3.31733,-2.465693,1.49111,-2.275351,1.254539,-1.619691,0.447528,...,18.087254,7.308491,9.137783,5.054695,8.340829,6.178095,6.114627,6.051663,6.463558,5.979562
2,Baycrest11976,4.540546,-2.101511,3.310822,-2.543396,1.94552,-2.013437,0.62149,-2.157566,-0.286386,...,14.846447,10.379416,9.796219,4.411665,7.607616,4.761656,6.275181,5.163419,6.454729,5.59299
3,Baycrest12257,6.837283,-0.529812,3.293784,-3.039978,1.341361,-2.148198,1.240532,-2.261506,0.059547,...,16.885852,5.193842,8.253646,4.406335,8.788866,4.327914,5.987578,5.588652,5.657975,4.573609
4,Baycrest12813,6.851045,-0.289273,2.855179,-3.296195,0.926552,-2.023755,1.035427,-3.048999,-0.467958,...,15.375857,5.243584,8.578858,5.328758,9.013362,4.415557,5.682638,5.307573,5.953049,4.48135
5,Baycrest12814,-0.224471,1.171967,2.887007,-1.241251,1.583782,-1.16886,1.656095,-2.060679,0.373887,...,15.451215,5.159746,7.711581,6.258129,7.873854,3.489952,4.697511,4.136739,5.614412,3.155303
6,Baycrest2103,5.936657,0.843193,2.174996,-3.497517,1.479591,-1.637668,1.456314,-1.977728,0.59098,...,16.605669,5.292241,7.221253,4.003773,7.241574,5.375266,5.221023,4.902013,5.641404,5.806933
7,Baycrest7352,5.230747,-1.50533,3.222062,-3.763847,1.419545,-2.334814,0.737271,-2.107579,0.463728,...,15.286777,4.727043,8.504117,4.93148,7.328781,4.431506,5.969218,4.472966,6.260807,5.396881
8,Baycrest8538,4.972934,-0.756911,2.774839,-2.395799,2.201676,-1.529462,1.561596,-1.987091,-0.127483,...,17.373645,6.802458,11.151,5.165459,7.529091,4.013816,5.641974,6.307668,5.467226,5.964157
9,Baycrest8961,4.119939,0.551614,4.183852,-2.515194,2.534332,-1.385665,1.66406,-2.180009,0.120439,...,16.886715,7.138473,10.060106,5.387462,8.520269,4.330364,6.225546,5.504334,6.219096,4.397002


## Merge MFCC & text data

In [None]:
# Reload the text-feature table so this cell starts from the file on disk.
txt_df = pd.read_csv(txt_training_dataroot)

# Right join: every row of txt_df is kept; MFCC columns are NaN for any name
# that has no match in mfcc_df.
merged_df = pd.merge(mfcc_df, txt_df, how="right", on="name")

# Every row receives the label 1. The column is sized from merged_df itself,
# so the assignment cannot fail with a length mismatch when the join yields a
# different number of rows than patient_info has.
# NOTE(review): presumably 1 marks a positive AD diagnosis - confirm.
merged_df["AD_diagnose"] = np.ones(len(merged_df), dtype=int)

merged_df.shape

(44, 93)

## Write to file

In [None]:
# Destination of the combined feature table on the mounted Drive.
csv_file_path = '/content/drive/MyDrive/term project/merged_data.csv'

# index=False leaves the row index out of the written file.
merged_df.to_csv(path_or_buf=csv_file_path, index=False)