In [None]:
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io.wavfile import read
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.svm import SVC

import encode_dict_data
from algorithms import get_features, plot_durations, get_audio, get_trim, pad_audio

In [None]:
# Storing the addresses in corresponding variables
# Locations of the development (labelled) and evaluation (unlabelled) metadata
data_path = "dsl_data/development.csv"
evaluation_path = "dsl_data/evaluation.csv"

# Read both CSV files into DataFrames in one pass
df, evaluation_df = (pd.read_csv(csv_file) for csv_file in (data_path, evaluation_path))

Analyze Data

In [None]:
# Show which language / fluency values occur in the evaluation set, so the
# development set can be restricted to the same population afterwards.
for message, column in (
    ("Unique values of current language:", 'Current language used for work/school'),
    ("Unique values of first language:", 'First Language spoken'),
    ("Unique values of fluency level:", 'Self-reported fluency level '),
):
    print(message, evaluation_df[column].unique())


In [None]:
# KEEP only rows whose first language AND current language are
# English (United States) AND whose self-reported fluency is native;
# every other row is removed.
native_us_mask = (
    (df['First Language spoken'] == 'English (United States)')
    & (df['Current language used for work/school'] == 'English (United States)')
    & (df['Self-reported fluency level '] == 'native')
)

# .copy() makes the filtered frame independent of the original, so the
# column assignments performed later do not trigger SettingWithCopyWarning.
modified_df = df[native_us_mask].copy()

df = modified_df

 Label Encoding (Development & Evaluation)

In [None]:
# Label Encoding
# Categorical columns that are converted to integer codes
encoding_columns = ['Self-reported fluency level ','First Language spoken','Current language used for work/school','ageRange','gender']

# Per column: {original category -> integer code}, learned on the development data
le_mapping = {}

# Encode DEVELOPMENT DATA
for col in encoding_columns:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])
    codes = le.transform(le.classes_)
    le_mapping[col] = {category: code for category, code in zip(le.classes_, codes)}

# Encode EVALUATION DATA with the mappings learned above.
# NOTE(review): a category that never occurs in the development data has no
# entry in the mapping, so .map() turns it into NaN -- confirm none exist.
for col in le_mapping:
    evaluation_df[col] = evaluation_df[col].map(le_mapping[col])

Label Decoding (if needed)

In [None]:
# Decoding 

# for col, mapping in le_mapping.items():
#     df[col] = df[col].map(mapping)

Combine "action" & "object" 

In [None]:
# Optional row limits for a quick smoke test of the pipeline.
# None keeps every row (slicing with [:None] selects the whole frame).
# The notebook previously hard-coded 10 development rows and 1 evaluation row,
# which meant the models were trained on 10 samples and the saved prediction
# files contained a single row; set e.g. DEV_ROW_LIMIT = 10 / EVAL_ROW_LIMIT = 1
# to get that fast debug run back.
DEV_ROW_LIMIT = None
EVAL_ROW_LIMIT = None

# .copy() detaches the result from the source frame so that later column
# assignments do not raise SettingWithCopyWarning.
df = df.iloc[:DEV_ROW_LIMIT].copy()
evaluation_df = evaluation_df.iloc[:EVAL_ROW_LIMIT].copy()


In [None]:
# Build the combined target label "<action>-<object>" used for classification
action_text = df['action'].astype(str)
df["action-object"] = action_text + "-" + df["object"]


Load audio files (.wav)

In [None]:
# Run get_audio on every file path and attach its result to the frame.
# Later cells read the 'data' and 'sample_rate' columns, so get_audio
# presumably returns those per row -- TODO confirm against algorithms.py.
dev_audio = df['path'].apply(get_audio)
df = df.join(dev_audio)

eval_audio = evaluation_df['path'].apply(get_audio)
evaluation_df = evaluation_df.join(eval_audio)

Trim audio files

In [None]:
# Columns handed to get_trim for every row
trim_inputs = ['data', 'sample_rate']

# Apply get_trim row by row and attach its output to both frames
dev_trimmed = df[trim_inputs].apply(get_trim, axis=1)
df = df.join(dev_trimmed)

eval_trimmed = evaluation_df[trim_inputs].apply(get_trim, axis=1)
evaluation_df = evaluation_df.join(eval_trimmed)

# Extract the longest trimmed duration across both sets; it is used as the
# common padding length later on.
max_development = df['duration_trim'].max()
max_evaluation = evaluation_df['duration_trim'].max()
longest_duration = np.maximum(max_development, max_evaluation)
print(f"Maximum duration in both sets: {longest_duration}s")

# Round up to a whole number for the padding target
maximum_duration = math.ceil(longest_duration)
print(f"Ceil maximum duration in both sets: {maximum_duration}s")





In [None]:
def _pad_row(row):
    """Call pad_audio on one row's trimmed signal, targeting maximum_duration."""
    return pad_audio(row['data_trim'], row['sample_rate'], maximum_duration)


# Pad every recording in both sets to the same target duration and attach
# whatever pad_audio returns to the frames.
df = df.join(df.apply(_pad_row, axis=1))
evaluation_df = evaluation_df.join(evaluation_df.apply(_pad_row, axis=1))

In [None]:
# Inspect the columns accumulated on the development frame so far
# (as the last expression of the cell, the result is displayed).
df.columns


Extract Features

Apply the get_features function to extract features from the audio files and trim silence from the beginning and end of each audio file
(done together to reduce complexity).

In [None]:
def _features_for_row(row):
    """Call get_features on one row's padded signal and its sample rate."""
    return get_features(row['data_pad'], row['sample_rate'])


# Compute the features row by row and attach them to each frame
dev_features = df.apply(_features_for_row, axis=1)
df = df.join(dev_features)

eval_features = evaluation_df.apply(_features_for_row, axis=1)
evaluation_df = evaluation_df.join(eval_features)

# # Optionally persist the extracted features so this step need not be repeated
# df.to_csv('save_csv/training.csv')
# evaluation_df.to_csv('save_csv/evaluation.csv')

OR get features from previously saved csv files

In [None]:
# # Import dataframes from previously saved csv files so we don't need to get features again

# df = pd.read_csv(r'save_csv/training.csv').iloc[:,1:]
# evaluation_df = pd.read_csv(r'save_csv/evaluation.csv').iloc[:,1:]


In [None]:
# Count how many development rows fall under each combined action-object label
df['action-object'].value_counts()

Select features & labels

In [None]:
# Non-feature columns present in BOTH frames: identifiers, the language and
# fluency fields, and the raw / intermediate audio columns.
shared_non_features = [
    'Id', 'path', 'speakerId',
    'Self-reported fluency level ', 'First Language spoken',
    'Current language used for work/school',
    'data', 'sample_rate', 'duration', 'data_trim', 'duration_trim',
    'data_pad', 'duration_data',
]
# Label columns exist only in the development frame
label_columns = ['action', 'object', 'action-object']

x = df.drop(columns=shared_non_features + label_columns)
x_evaluation = evaluation_df.drop(columns=shared_non_features)

# Target: the combined action-object label, kept as a one-column DataFrame
y = df[['action-object']].copy()

# Column names must be strings, otherwise scikit-learn raises an error
x.columns = x.columns.astype(str)
x_evaluation.columns = x_evaluation.columns.astype(str)


In [None]:
# pca = PCA().fit(data_rescaled)

# %matplotlib inline
# import matplotlib.pyplot as plt
# plt.rcParams["figure.figsize"] = (50,50)

# fig, ax = plt.subplots()
# xi = np.arange(1, 213, step=1)
# y = np.cumsum(pca.explained_variance_ratio_)

# plt.ylim(0.0,1.1)
# plt.plot(xi, y, marker='o', linestyle='--', color='b')

# plt.xlabel('Number of Components')
# plt.xticks(np.arange(0, 212, step=1)) #change from 0-based array index to 1-based human-readable label
# plt.ylabel('Cumulative variance (%)')
# plt.title('The number of components needed to explain variance')

# plt.axhline(y=0.95, color='r', linestyle='-')
# plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

# ax.grid(axis='x')
# plt.show()

In [None]:
# Apply scaling (Z-score) for PCA since pca is sensitive to the scale of features.
# scaler = StandardScaler()

# scaler.fit(x)
# X_scaled = scaler.transform(x)
# X_evaluation_scaled = scaler.transform(x_evaluation)  # apply same transformation to test data


# pca = PCA(n_components=.95).fit(X_scaled)
# X_pca = pca.transform(X_scaled)
# X_evaluation_pca = pca.transform(X_evaluation_scaled)
# print(sum(pca.explained_variance_ratio_)) 


# Rescale the features with MinMaxScaler (imported in the top import cell).
# The scaler is fitted on the development features only and the very same
# transformation is then applied to the evaluation features.
# NOTE(review): the scaler is fitted on ALL development rows before the
# train/test split below, so the held-out 20% influences the scaling and the
# reported test metrics may be slightly optimistic.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(x)
X_evaluation_scaled = scaler.transform(x_evaluation)  # apply same transformation to evaluation data

# def standardize(x_train, x_test):
#     mean = np.mean(x_train, axis=0)
#     std = np.std(x_train, axis=0)
#     x_train_standardized = (x_train - mean) / std
#     x_test_standardized = (x_test - mean) / std
#     return x_train_standardized, x_test_standardized
# X_scaled, X_evaluation_scaled = standardize(x, x_evaluation)


# pca = PCA(n_components=.95).fit(X_scaled)
# X_pca = pca.transform(X_scaled)
# X_evaluation_pca = pca.transform(X_evaluation_scaled)
# print(sum(pca.explained_variance_ratio_)) 



Split data to training and test

In [None]:
# Hold out 20% of the development data for testing; the remaining 80% is
# used for training. The seed is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=.2,
    random_state=42,
    shuffle=True,
)



In [None]:

# Create the Random Forest classifier.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest -- and therefore the metrics and the saved predictions -- are
# reproducible across kernel restarts.
rf_clf = RandomForestClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=2,
    n_estimators=1000,
    random_state=42,
)

# Train on the training split; np.ravel flattens the one-column label frame
# into the 1-D array expected by fit().
rf_clf.fit(x_train, np.ravel(y_train))
y_pred_rf = rf_clf.predict(x_test)

# Model quality on the held-out test split (20%)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf, average='weighted'))
print("Recall:", recall_score(y_test, y_pred_rf, average='weighted'))
print("F1-Score:", f1_score(y_test, y_pred_rf, average='weighted'))

In [None]:

# Support Vector Machine with an RBF kernel and regularisation strength C=8
svm_clf = SVC(kernel='rbf', C=8)

# Fit on the training split (labels flattened to 1-D) and predict the test split
train_labels = np.ravel(y_train)
svm_clf.fit(x_train, train_labels)
y_pred_svm = svm_clf.predict(x_test)

# Model quality on the held-out test split (20%)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
for title, scorer in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-Score:", f1_score),
):
    print(title, scorer(y_test, y_pred_svm, average='weighted'))

# Previously noted result: 0.646 (the note does not say which metric or run)


Grid search for Random Forest model

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid to search over
# param_grid = {'n_estimators': [10, 100, 1000],
#               'max_depth': [3, 5, 7],
#               'min_samples_leaf': [1, 2, 3],
#               'criterion': ['gini', 'entropy']}

# # Initialize the random forest classifier
# rf = RandomForestClassifier()

# # Use GridSearchCV to find the best parameters
# grid_search = GridSearchCV(rf, param_grid, cv=2, scoring='accuracy')
# grid_search.fit(X_scaled, y)

# # Print the best parameters and the best score
# print("Best parameters: ", grid_search.best_params_)
# print("Best score: ", grid_search.best_score_)


Grid search for SVM model

In [None]:
# # Define the parameter grid to search over

# param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

# # Create a SVM with an RBF kernel
# svm = SVC(kernel='rbf')

# # Perform the grid search using GridSearchCV's default cross-validation (5-fold when cv is not set)
# grid_search = GridSearchCV(svm, param_grid)
# grid_search.fit(x_train, np.ravel(y_train))

# # Print the best parameters and the corresponding mean test score
# print("Best parameters: ",grid_search.best_params_)
# print("Best score: ",grid_search.best_score_)

Predict labels of evaluation data using SVM

In [None]:
# Predict the combined label for every evaluation sample with the SVM
evaluation_svm = svm_clf.predict(X_evaluation_scaled)

# Remove the "-" separator that was inserted when action and object were combined
evaluation_svm = [label.replace("-", "") for label in evaluation_svm]

# Index the predictions by the real 'Id' values of the evaluation rows instead
# of a positional 0..n-1 counter that is merely *named* 'Id'; the two only
# coincide when the evaluation Ids happen to be a gap-free range from 0.
svm_df = pd.DataFrame(
    {'Predicted': evaluation_svm},
    index=pd.Index(evaluation_df['Id'].to_numpy(), name='Id'),
)


Predict labels of evaluation data using Random Forest

In [None]:
# Predict the combined label for every evaluation sample with the Random Forest
evaluation_rf = rf_clf.predict(X_evaluation_scaled)

# Remove the "-" separator that was inserted when action and object were combined
evaluation_rf = [label.replace("-", "") for label in evaluation_rf]

# Index the predictions by the real 'Id' values of the evaluation rows instead
# of a positional 0..n-1 counter that is merely *named* 'Id'; the two only
# coincide when the evaluation Ids happen to be a gap-free range from 0.
rf_df = pd.DataFrame(
    {'Predicted': evaluation_rf},
    index=pd.Index(evaluation_df['Id'].to_numpy(), name='Id'),
)


Save both sets of predictions to CSV files

In [None]:
# Unix timestamp (whole seconds) so each run writes to fresh file names
now = int(time.time())

# Make sure the output folder exists; to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
output_dir = 'evaluation'
os.makedirs(output_dir, exist_ok=True)

# Build each path once so the saved file and the printed path cannot drift apart
svm_path = f'{output_dir}/svm_predictions{now}.csv'
rf_path = f'{output_dir}/rf_predictions{now}.csv'

svm_df.to_csv(svm_path, index=True, header=True)
rf_df.to_csv(rf_path, index=True, header=True)

# Print paths of saved csv
print(svm_path)
print(rf_path)
