Import used libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io.wavfile import read 
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from algorithms import get_features, plot_durations, get_audio, get_trim, pad_audio
import encode_dict_data 
import os
import math
import time

Set the dataset paths and read the CSV files for both the development and evaluation datasets (features are extracted from them in later cells)

In [3]:
# Locations of the CSV files for the development and evaluation sets
data_path, evaluation_path = "dsl_data/development.csv", "dsl_data/evaluation.csv"

# Read both CSV files into DataFrames (development -> df, evaluation -> evaluation_df)
df, evaluation_df = (
    pd.read_csv(csv_path) for csv_path in (data_path, evaluation_path)
)

Analyze Data

In [4]:
# Inspect the speaker-profile columns of the evaluation data; the recorded output shows
# only 'English (United States)' for both language columns and 'native' for fluency.
for label, column in (
    ("current language", 'Current language used for work/school'),
    ("first language", 'First Language spoken'),
    ("fluency level", 'Self-reported fluency level '),  # trailing space is part of the column name
):
    print(f"Unique values of {label}:", evaluation_df[column].unique())


Unique values of current language: ['English (United States)']
Unique values of first language: ['English (United States)']
Unique values of fluency level: ['native']


Remove data considered as "noise": keep only the rows where both the first and the current language are English (United States) and the fluency level is native, matching the evaluation set

In [5]:
# Keep only development rows whose speaker profile matches the one observed in the
# evaluation data: first and current language 'English (United States)', fluency 'native'.
is_native_us_english = (
    (df['First Language spoken'] == 'English (United States)')
    & (df['Current language used for work/school'] == 'English (United States)')
    & (df['Self-reported fluency level '] == 'native')
)

# .copy() makes the filtered frame independent of the original, so the column
# assignments done by later cells do not raise SettingWithCopyWarning.
modified_df = df[is_native_us_english].copy()

df = modified_df

Label Encoding (Development & Evaluation)
 
transform categorical data into numerical values since sklearn doesn't accept strings

In [6]:
# Label Encoding
# Replace the categorical speaker columns with integer codes and remember, per column,
# the {original label: code} mapping so the evaluation data can be encoded the same way.

le_mapping = {}
encoding_columns = [
    'Self-reported fluency level ',
    'First Language spoken',
    'Current language used for work/school',
    'ageRange',
    'gender',
]

# Encode DEVELOPMENT DATA: fit one encoder per column, then overwrite the column with its codes
for col in encoding_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    le_mapping[col] = {
        label: code for label, code in zip(le.classes_, le.transform(le.classes_))
    }

# Encode EVALUATION DATA with the mappings learned on the development data.
# NOTE(review): a category that is absent from the development mapping becomes NaN here;
# confirm the evaluation set has no such categories.
for col in le_mapping:
    evaluation_df[col] = evaluation_df[col].map(le_mapping[col])

Label Decoding (if needed)

In [None]:
# Decoding 

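# NOTE: le_mapping stores {original label: code}, so decoding needs the inverted mapping.
# for col, mapping in le_mapping.items():
#     inverse_mapping = {code: label for label, code in mapping.items()}
#     df[col] = df[col].map(inverse_mapping)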

Combine "action" & "object" in one dataframe column 

In [7]:
# Build the combined target label "<action>-<object>" in a new column
action_labels = df["action"].astype(str)
df["action-object"] = action_labels + "-" + df["object"]


Load audio files (.wav)

used to load audio files at specific path and return the waveform as a numpy array data, along with the sample rate

In [None]:
# Run get_audio on every file path and attach what it returns as new columns.
# Later cells read 'data' and 'sample_rate', so get_audio presumably returns at least
# those fields per path -- TODO confirm against algorithms.get_audio.
development_audio = df['path'].apply(get_audio)
df = df.join(development_audio)

evaluation_audio = evaluation_df['path'].apply(get_audio)
evaluation_df = evaluation_df.join(evaluation_audio)

Trim audio files

reduce the duration of audio files by removing part of each audio signal that contains silence or "noise"

In [None]:
# Apply get_trim row by row to the waveform / sample-rate pair and attach its output.
# Later code reads 'data_trim' and 'duration_trim', so get_trim presumably returns
# those fields -- TODO confirm against algorithms.get_trim.
trim_inputs = ['data', 'sample_rate']
df = df.join(df[trim_inputs].apply(get_trim, axis=1))
evaluation_df = evaluation_df.join(evaluation_df[trim_inputs].apply(get_trim, axis=1))

# Extract the maximum trimmed duration over both sets to use in padding later
max_development = df['duration_trim'].max()
max_evaluation = evaluation_df['duration_trim'].max()
longest_duration = np.max([max_development, max_evaluation])
print(f"Maximum duration in both sets: {longest_duration}s")

# Round up to a whole number of seconds
maximum_duration = math.ceil(longest_duration)
print(f"Ceil maximum duration in both sets: {maximum_duration}s")

Pad audio files to max audio length in both sets

Adds silence (zero values) to the audio signals so that they all have the same fixed length

In [None]:
def _pad_row(row):
    """Pad one row's trimmed waveform to the shared maximum duration via pad_audio."""
    return pad_audio(row['data_trim'], row['sample_rate'], maximum_duration)


# Attach the padding results to both frames (row-wise apply)
df = df.join(df.apply(_pad_row, axis=1))
evaluation_df = evaluation_df.join(evaluation_df.apply(_pad_row, axis=1))

Extract Features

Apply the get_features function to extract features from the audio files and trim silence from the beginning and end of each audio file
(Done together to reduce complexity)

In [None]:
def _features_from_row(row):
    """Extract the feature columns for one row from its trimmed waveform via get_features."""
    # NOTE(review): features are computed from 'data_trim'; the padded signal produced by
    # the padding cell is not used here -- confirm this is intentional.
    return get_features(row['data_trim'], row['sample_rate'])


# Attach the extracted features to both frames (row-wise apply)
df = df.join(df.apply(_features_from_row, axis=1))
evaluation_df = evaluation_df.join(evaluation_df.apply(_features_from_row, axis=1))

# Save extracted features in csv files to avoid repeating steps.
# to_csv does not create missing folders, so make sure the cache directory exists first.
os.makedirs('save_csv', exist_ok=True)
df.to_csv('save_csv/training2.csv')
evaluation_df.to_csv('save_csv/evaluation2.csv')

OR get features from previously saved csv files to prevent loading and extracting features again

In [38]:
# Reload the frames cached by the feature-extraction cell so the audio loading and
# feature extraction do not have to be repeated.
cached_development = pd.read_csv(r'save_csv/training2.csv')
cached_evaluation = pd.read_csv(r'save_csv/evaluation2.csv')

# Drop the first column of each file (the cache was written with the default index column)
df = cached_development.iloc[:, 1:]
evaluation_df = cached_evaluation.iloc[:, 1:]


In [39]:
# Display the reloaded development frame
df

Unnamed: 0,Id,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange,...,968,969,970,971,972,973,974,975,976,977
0,106,dsl_data/audio/speakers/2ojo7YRL7Gck83Z3/3e7e4...,2ojo7YRL7Gck83Z3,deactivate,lights,0,0,0,0,0,...,6.396494e-08,1.014894e-07,1.193709e-07,2.390989e-07,3.414445e-07,2.453299e-07,4.460065e-08,1.093370e-08,6.335365e-10,5.685474e-12
1,107,dsl_data/audio/speakers/2ojo7YRL7Gck83Z3/4136f...,2ojo7YRL7Gck83Z3,change language,none,0,0,0,0,0,...,5.256315e-07,4.872966e-07,3.301417e-07,6.877973e-07,4.322525e-07,2.319994e-07,3.368491e-07,1.818309e-07,8.241035e-09,3.468823e-11
2,108,dsl_data/audio/speakers/2ojo7YRL7Gck83Z3/4694e...,2ojo7YRL7Gck83Z3,activate,music,0,0,0,0,0,...,6.418015e-08,8.978116e-08,1.000982e-07,1.687780e-07,1.550420e-07,6.748530e-08,3.102568e-08,1.195675e-08,4.580976e-09,3.438152e-09
3,109,dsl_data/audio/speakers/2ojo7YRL7Gck83Z3/494ea...,2ojo7YRL7Gck83Z3,decrease,volume,0,0,0,0,0,...,4.125046e-08,6.356045e-08,7.704278e-08,5.048867e-08,7.592725e-08,3.924195e-08,1.624490e-08,3.819438e-09,3.304342e-10,3.342751e-12
4,110,dsl_data/audio/speakers/2ojo7YRL7Gck83Z3/540ee...,2ojo7YRL7Gck83Z3,deactivate,lights,0,0,0,0,0,...,6.044498e-08,6.944633e-08,7.345793e-08,1.963227e-07,4.626480e-07,2.389892e-07,7.018110e-08,1.642731e-08,1.330104e-09,1.173896e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9374,9849,dsl_data/audio/speakers/vnljypgejkINbBAY/4fb3d...,vnljypgejkINbBAY,decrease,volume,0,0,0,1,0,...,1.466784e-06,4.727239e-07,1.090226e-06,7.522668e-07,6.793515e-07,8.201328e-07,9.526733e-07,2.238469e-07,9.224985e-09,5.584047e-11
9375,9850,dsl_data/audio/speakers/vnljypgejkINbBAY/59e6a...,vnljypgejkINbBAY,deactivate,lights,0,0,0,1,0,...,3.520913e-07,2.115596e-07,2.834343e-07,3.136267e-07,3.330108e-07,3.659563e-07,1.031203e-07,1.667427e-08,3.532347e-10,3.345999e-12
9376,9851,dsl_data/audio/speakers/vnljypgejkINbBAY/5c81c...,vnljypgejkINbBAY,deactivate,lights,0,0,0,1,0,...,3.005407e-06,1.894453e-06,1.120879e-06,4.509063e-07,2.050255e-07,2.727756e-07,1.988128e-07,3.190007e-08,2.907395e-09,3.573724e-11
9377,9852,dsl_data/audio/speakers/vnljypgejkINbBAY/5ef42...,vnljypgejkINbBAY,deactivate,lights,0,0,0,1,0,...,2.919442e-06,2.089495e-06,1.364334e-06,4.710524e-07,1.293256e-06,1.746767e-06,1.370511e-06,2.736100e-07,1.279671e-08,4.697547e-11


Select features & labels

In [40]:
# Identifier columns and raw / intermediate audio columns that are not model features
non_feature_columns = [
    'Id', 'path', 'speakerId',
    'data', 'sample_rate', 'duration',
    'data_trim', 'duration_trim',
    'data_pad', 'duration_data',
]
# Target-related columns, present only in the development frame
target_columns = ['action', 'object', 'action-object']

# Disabled alternative: additionally drop 'Self-reported fluency level ',
# 'First Language spoken' and 'Current language used for work/school' from both frames.

x = df.drop(columns=non_feature_columns + target_columns)
x_evaluation = evaluation_df.drop(columns=non_feature_columns)

# Labels: the combined "action-object" column, kept as a one-column DataFrame
y = df[['action-object']].copy()

# Change column names from Int to Str to avoid error by SKLEARN
x.columns = x.columns.astype(str)
x_evaluation.columns = x_evaluation.columns.astype(str)


Drop columns (features) if needed

In [21]:
# column_names = [str(i) for i in range(210, 978)]
# x = x.drop(columns=column_names)
# x_evaluation= x_evaluation.drop(columns=column_names)

plot PCA graph

In [None]:
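# NOTE(review): this disabled cell uses `data_rescaled`, which no visible cell defines, and it
# reassigns `y` (the label frame) to the cumulative variance -- rename both before re-enabling.
# pca = PCA().fit(data_rescaled)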

# %matplotlib inline
# import matplotlib.pyplot as plt
# plt.rcParams["figure.figsize"] = (50,50)

# fig, ax = plt.subplots()
# xi = np.arange(1, 213, step=1)
# y = np.cumsum(pca.explained_variance_ratio_)

# plt.ylim(0.0,1.1)
# plt.plot(xi, y, marker='o', linestyle='--', color='b')

# plt.xlabel('Number of Components')
# plt.xticks(np.arange(0, 212, step=1)) #change from 0-based array index to 1-based human-readable label
# plt.ylabel('Cumulative variance (%)')
# plt.title('The number of components needed to explain variance')

# plt.axhline(y=0.95, color='r', linestyle='-')
# plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

# ax.grid(axis='x')
# plt.show()

Scale the data (the PCA dimensionality-reduction step is currently commented out)

In [42]:
# Disabled alternative: Z-score scaling followed by PCA (PCA is sensitive to the scale of features).

# scaler = StandardScaler()
# scaler.fit(x)
# X_scaled = scaler.transform(x)
# X_evaluation_scaled = scaler.transform(x_evaluation)  # apply same transformation to test data
# pca = PCA(n_components=.95).fit(X_scaled)
# X_pca = pca.transform(X_scaled)
# X_evaluation_pca = pca.transform(X_evaluation_scaled)
# print(sum(pca.explained_variance_ratio_)) 


from sklearn.preprocessing import MinMaxScaler

# Active scaling: min-max scaling fitted on the development features.
# NOTE(review): the scaler is fitted on all development rows before the train/test split
# done in a later cell, so the held-out rows influence the scaling.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(x)
# Reuse the development-set fit for the evaluation features
X_evaluation_scaled = scaler.transform(x_evaluation)

# Disabled alternative: manual standardization with the training mean / std.
# def standardize(x_train, x_test):
#     mean = np.mean(x_train, axis=0)
#     std = np.std(x_train, axis=0)
#     x_train_standardized = (x_train - mean) / std
#     x_test_standardized = (x_test - mean) / std
#     return x_train_standardized, x_test_standardized
# X_scaled, X_evaluation_scaled = standardize(x, x_evaluation)

Split data to training and test

In [43]:
# Hold out 20% of the scaled development data for testing; the remaining 80% is for training.
# The fixed random_state makes the shuffled split repeatable.
split_options = dict(test_size=.2, random_state=42, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)
# Unseeded alternative:
# x_train, x_test, y_train, y_test = train_test_split(X_scaled, y,test_size = .2)



Grid search for Random Forest model

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid to search over
# param_grid = {'n_estimators': [10, 100, 1000],
#               'max_depth': [3, 5, 7],
#               'min_samples_leaf': [1, 2, 3],
#               'criterion': ['gini', 'entropy']}

# # Initialize the random forest classifier
# rf = RandomForestClassifier()

# # Use GridSearchCV to find the best parameters
# grid_search = GridSearchCV(rf, param_grid, cv=2, scoring='accuracy')
# grid_search.fit(X_scaled, y)

# # Print the best parameters and the best score
# print("Best parameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)


Run Random Forest Classifier after obtaining best hyperparams

In [None]:
# Create the Random Forest classifier with the hyperparameters chosen for this notebook.
# random_state is fixed so the reported metrics are reproducible from run to run
# (the train/test split is seeded with the same value).
rf_clf = RandomForestClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    n_estimators=1000,
    random_state=42,
)

# Train on the training split; np.ravel flattens the one-column label frame to 1-D
rf_clf.fit(x_train, np.ravel(y_train))
y_pred_rf = rf_clf.predict(x_test)

# Model quality on the held-out test data (20%); class-weighted averages for the
# per-class metrics
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_rf, average='weighted'))
print("F1-Score:", f1_score(y_test, y_pred_rf, average='weighted'))

Grid search for SVM model

In [None]:
# # Define the parameter grid to search over

# param_grid = {'C': [0.1, 1,4,8,10,50,100,1000], 'gamma': [0.01, 0.1, 1]}

# # Create a SVM with an RBF kernel
# svm = SVC(kernel='rbf')

# # Perform the grid search (no cv argument is passed, so GridSearchCV uses its default 5-fold cross-validation)
# grid_search = GridSearchCV(svm, param_grid)
# grid_search.fit(x_train, np.ravel(y_train))

# # Print the best parameters and the corresponding mean test score
# print("Best parameters: ",grid_search.best_params_)
# print("Best score: ",grid_search.best_score_)

Run SVM Classifier after obtaining best hyperparams

In [44]:
# RBF-kernel SVM with the hyperparameters chosen for this notebook.
# SVC is the same class as svm.SVC (both are imported in the first cell).
svm_clf = SVC(kernel='rbf', C=4, gamma=0.1)

# Train on the training split; np.ravel flattens the one-column label frame to 1-D
svm_clf.fit(x_train, np.ravel(y_train))
y_pred_svm = svm_clf.predict(x_test)

# Model Accuracy using test data (20%)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_name}:", metric_fn(y_test, y_pred_svm, average='weighted'))

# 0.646  (value noted by the author; presumably a score from an earlier run -- TODO confirm)


Accuracy: 0.662046908315565
Precision: 0.6667460436011269
Recall: 0.662046908315565
F1-Score: 0.6629969451230404


Predict labels of evaluation data using SVM

In [33]:
# Disabled alternative: predict with the model fitted on the 80% training split only.
# evaluation_svm =svm_clf.predict(X_evaluation_scaled)

# Refit the SVM on the whole scaled development set before predicting the evaluation set
svm_clf = SVC(kernel='rbf', C=4, gamma=0.1)
svm_clf.fit(X_scaled, np.ravel(y))
evaluation_svm = svm_clf.predict(X_evaluation_scaled)

# Strip the "-" separator that was inserted when the "action-object" labels were built
evaluation_svm = [label.replace("-", "") for label in evaluation_svm]

# Index the predictions by the actual evaluation Ids instead of row positions, so the
# exported 'Id' column is right even if the Ids are not 0..n-1 in file order.
# Rows of X_evaluation_scaled are in the same order as evaluation_df.
svm_df = pd.DataFrame(
    evaluation_svm,
    columns=['Predicted'],
    index=pd.Index(evaluation_df['Id'], name='Id'),
)


Predict labels of evaluation data using Random Forest

In [None]:
# Predict the evaluation labels with the Random Forest fitted in the training cell.
# NOTE(review): unlike the SVM prediction cell, rf_clf is not refitted on the whole
# development set here, so it only saw the 80% training split -- confirm this is intended.
evaluation_rf = rf_clf.predict(X_evaluation_scaled)

# Strip the "-" separator that was inserted when the "action-object" labels were built
evaluation_rf = [label.replace("-", "") for label in evaluation_rf]

# Index the predictions by the actual evaluation Ids instead of row positions, so the
# exported 'Id' column is right even if the Ids are not 0..n-1 in file order.
# Rows of X_evaluation_scaled are in the same order as evaluation_df.
rf_df = pd.DataFrame(
    evaluation_rf,
    columns=['Predicted'],
    index=pd.Index(evaluation_df['Id'], name='Id'),
)


Save the predictions to a CSV file (only the SVM export is active; the Random Forest export is commented out)

In [34]:
# Timestamp (whole seconds since the epoch) used to give each export its own file name
now = int(time.time())

# to_csv does not create missing folders, so make sure the output directory exists
output_dir = 'evaluation'
os.makedirs(output_dir, exist_ok=True)

# Build the path once so the saved file and the printed path cannot drift apart
svm_path = f'{output_dir}/svm_predictions{now}.csv'
svm_df.to_csv(svm_path, index=True, header=True)
# rf_df.to_csv(f'evaluation/rf_predictions{now}.csv',index=True,header=True)

# Print paths of saved csv
print(svm_path)
# print(f'evaluation/rf_predictions{now}.csv')


evaluation/svm_predictions1675264390.csv
