In [1]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read 
import os

In [2]:
# Storing the address in 'url' variable
data_path = "dsl_data/development.csv"
# Importing the file from the address contained in 'url' into 'df' 
df = pd.read_csv(data_path)

In [3]:
# Check null values in Training dataset (pandas dataframe)
df.isnull().sum().sum()

0

In [4]:
# To get unique values in order to map values to objects imported from src/data.py

print(df['object'].unique())
print(df['action'].unique())


['none' 'music' 'lights' 'volume' 'heat']
['change language' 'activate' 'deactivate' 'increase' 'decrease']


In [5]:
df.shape

(9854, 10)

In [6]:
import dict_data 

df['gender'] = df['gender'].map(dict_data.gender_map)
df['Self-reported fluency level '] = df['Self-reported fluency level '].map(dict_data.language_fluency_map)
df['ageRange'] = df['ageRange'].map(dict_data.age_range_map)
df['Current language used for work/school'] = df['Current language used for work/school'].map(dict_data.current_language_map)
df['First Language spoken'] = df['First Language spoken'].map(dict_data.first_language_map)


# Try to combine action & object in 1 column
df["action-object"] = df['action'].astype(str) +"-"+ df["object"]


In [None]:
import librosa

duration_array = []
zcr_mean_array = []
zcr_std_array = []
mfcc_array = []
chroma_cens_array = []


for audio in df['path']:
    # rate, data = read(audio, mmap=False) 

    # Load audio files (wav) as amplitude and rate using 
    # Default rate: 22050
    data ,rate = librosa.load(audio)

    # Calculate zero crossing rate
    zcr = librosa.zero_crossings(data)

    # Calculate mean of zero crossing rate
    zcr_mean = np.mean(zcr)
    zcr_mean_array.append(zcr_mean) 

    # Calculate standard deviation of zero crossing rate
    zcr_std = np.std(zcr)
    zcr_std_array.append(zcr_std) 

    #  extract Mel-Frequency Cepstral Coefficients (MFCCs) from audio signal and compute mean on rows
    mfcc = librosa.feature.mfcc(y=data, sr=rate, n_mfcc=50)
    mfcc_mean = np.mean(mfcc,axis=1)
    mfcc_array.append(mfcc_mean) 
   
    # Extract Chroma cens feature
    chroma_cens = librosa.feature.chroma_cens(y=data, sr=rate)
    chroma_cens_mean = np.mean(chroma_cens,axis=1)
    chroma_cens_array.append(chroma_cens_mean)
   
    # Get duration by dividing number of columns in data by rate (Number of channels)
    duration_array.append(data.shape[0] / rate)





df['zcr_mean'] = zcr_mean_array
df['zcr_std'] = zcr_std_array
df['duration'] = duration_array 
df['mfcc'] = mfcc_array 
df['chroma_cens'] = chroma_cens_array 




In [8]:


# extract the array column
array_column = df['mfcc']

# reshape the array to several columns
array_column = array_column.apply(pd.Series)

# rename the columns

array_column.columns = [f'mfcc_{i}' for i in range(array_column.shape[1])]

# join the new DataFrame with the original one
df = pd.concat([df, array_column], axis=1)

# drop the array column
df = df.drop('mfcc', axis=1)

In [9]:


# extract the array column
array_column = df['chroma_cens']

# reshape the array to several columns
array_column = array_column.apply(pd.Series)

# rename the columns

array_column.columns = [f'chroma_cens_{i}' for i in range(array_column.shape[1])]

# join the new DataFrame with the original one
df = pd.concat([df, array_column], axis=1)

# drop the array column
df = df.drop('chroma_cens', axis=1)


In [10]:
df.to_csv('file_name.csv')

In [11]:
from sklearn.model_selection import train_test_split


static_features_array = ['Self-reported fluency level ','First Language spoken', 'Current language used for work/school', 'gender', 'ageRange','zcr_mean','zcr_std','duration']
dynamic_features_array1 = [f'mfcc_{x}' for x in range(50)]
dynamic_features_array2 = [f'chroma_cens_{x}' for x in range(12)]


dynamic_features_array = np.concatenate((dynamic_features_array1, dynamic_features_array2))

all_features_array = np.concatenate((static_features_array, dynamic_features_array))

# x = df_copy[all_features_array]
x = df[all_features_array].copy()
y = df[['action-object']].copy()


#train_test_split splits the data into 70% training data and 30% test data
x_train, x_test, y_train, y_test = train_test_split(x, y,test_size = .3)

In [12]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Classifier
rf_clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
rf_clf.fit(x_train,y_train)
y_pred_rf=rf_clf.predict(x_test)


  rf_clf.fit(x_train,y_train)


In [13]:
from sklearn import svm

svm_clf = svm.SVC()
svm_clf.fit(x_train,y_train)
y_pred_svm=svm_clf.predict(x_test)


  y = column_or_1d(y, warn=True)


In [14]:

# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(max_depth = 10, min_impurity_decrease=0.01)
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# y_pred

In [15]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy using test data (25%)
print("Test set accuracy Random Forest:",metrics.accuracy_score(y_test, y_pred_rf))
print("Test set accuracy using SVM:",metrics.accuracy_score(y_test, y_pred_svm))



Test set accuracy Random Forest: 0.41494758200879267
Test set accuracy using SVM: 0.3043625295908015


In [18]:
# Storing the address in 'url' variable
evaluation_path =   "dsl_data/evaluation.csv"

# Importing the csv file from the address contained in 'url' into 'evaluation_df' 
evaluation_df = pd.read_csv(evaluation_path)



# Check null values in Training dataset (pandas dataframe)
evaluation_df.isnull().sum().sum()

import dict_data 

# map features in evaluation_df to predefined dictionaries
evaluation_df['gender'] = evaluation_df['gender'].map(dict_data.gender_map)
evaluation_df['Self-reported fluency level '] = evaluation_df['Self-reported fluency level '].map(dict_data.language_fluency_map)
evaluation_df['ageRange'] = evaluation_df['ageRange'].map(dict_data.age_range_map)
evaluation_df['Current language used for work/school'] = evaluation_df['Current language used for work/school'].map(dict_data.current_language_map)
evaluation_df['First Language spoken'] = evaluation_df['First Language spoken'].map(dict_data.first_language_map)


import librosa

chroma_cens_array = []
duration_array = []
zcr_mean_array = []
zcr_std_array = []
mfcc_array = []


for audio in evaluation_df['path']:
    # Load audio files (wav) as amplitude and rate using 
    # Default rate: 22050
    data ,rate = librosa.load(audio)

    # Calculate zero crossing rate
    zcr = librosa.zero_crossings(data)

    # Calculate mean of zero crossing rate
    zcr_mean = np.mean(zcr)
    zcr_mean_array.append(zcr_mean) 

    # Calculate standard deviation of zero crossing rate
    zcr_std = np.std(zcr)
    zcr_std_array.append(zcr_std) 


    #  Extract Mel-Frequency Cepstral Coefficients (MFCCs) from audio signal and compute mean on rows
    mfcc = librosa.feature.mfcc(y=data, sr=rate, n_mfcc=50)
    mfcc_mean = np.mean(mfcc,axis=1)
    mfcc_array.append(mfcc_mean) 

    # Extract Chroma cens feature
    chroma_cens = librosa.feature.chroma_cens(y=data, sr=rate)
    chroma_cens_mean = np.mean(chroma_cens,axis=1)
    chroma_cens_array.append(chroma_cens_mean)

    # Get duration by dividing number of columns in data by rate (Number of channels)
    duration_array.append(data.shape[0] / rate)


evaluation_df['zcr_mean'] = zcr_mean_array
evaluation_df['zcr_std'] = zcr_std_array
evaluation_df['duration'] = duration_array 
evaluation_df['mfcc'] = mfcc_array 
evaluation_df['chroma_cens'] = chroma_cens_array 




# x_evaluation = evaluation_df[['Self-reported fluency level ','First Language spoken', 'Current language used for work/school', 'gender', 'ageRange', 'zero_crossing','duration','mfcc']].copy()



  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [19]:
# extract the array column
array_column = evaluation_df['mfcc']

# reshape the array to several columns
array_column = array_column.apply(pd.Series)

# rename the columns

array_column.columns = [f'mfcc_{i}' for i in range(array_column.shape[1])]

# join the new DataFrame with the original one
evaluation_df = pd.concat([evaluation_df, array_column], axis=1)

# drop the array column
evaluation_df = evaluation_df.drop('mfcc', axis=1)


In [20]:

# extract the array column
array_column = evaluation_df['chroma_cens']

# reshape the array to several columns
array_column = array_column.apply(pd.Series)

# rename the columns

array_column.columns = [f'chroma_cens_{i}' for i in range(array_column.shape[1])]

# join the new DataFrame with the original one
evaluation_df = pd.concat([evaluation_df, array_column], axis=1)

# drop the array column
evaluation_df = evaluation_df.drop('chroma_cens', axis=1)


In [21]:
evaluation_df.to_csv('file_name2.csv')


In [22]:

static_features_array = ['Self-reported fluency level ','First Language spoken', 'Current language used for work/school', 'gender', 'ageRange','zcr_mean','zcr_std','duration']
dynamic_features_array1 = [f'mfcc_{x}' for x in range(50)]
dynamic_features_array2 = [f'chroma_cens_{x}' for x in range(12)]


dynamic_features_array = np.concatenate((dynamic_features_array1, dynamic_features_array2))


all_features_array = np.concatenate((static_features_array, dynamic_features_array))

# x = df_copy[all_features_array]
x_evaluation = evaluation_df[all_features_array].copy()

In [23]:
y_evaluation=rf_clf.predict(x_evaluation)

y_evaluation = list(map(lambda s: s.replace("-", ""), y_evaluation))

y_evaluation_df = pd.DataFrame(y_evaluation, columns = ['Predicted'])
y_evaluation_df.index.name = 'Id'


In [24]:
import time

now = int(time.time())

y_evaluation_df.to_csv(f'evaluation/predictions{now}.csv',index=True,header=True)
