In [8]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import svm
from sklearn import metrics
from algorithms import get_features
import encode_dict_data 
import os
import time

In [9]:
# Storing the address in 'url' variable
data_path = "dsl_data/development.csv"
evaluation_path = "dsl_data/evaluation.csv"

# Importing the file from the address contained in 'url' into 'df' 
df = pd.read_csv(data_path)
evaluation_df = pd.read_csv(evaluation_path)

In [11]:
# Discover that evaluation data only contains English (United States) and Native speakers
evaluation_df['Current language used for work/school'].unique()
evaluation_df['First Language spoken'].unique()
evaluation_df['Self-reported fluency level '].unique()


array(['native'], dtype=object)

In [12]:
# REMOVE all rows where language is not English and fluency is not native
modified_df = df[df['First Language spoken'] == 'English (United States)']
modified_df = modified_df[modified_df['Current language used for work/school'] == 'English (United States)']
modified_df = modified_df[modified_df['Self-reported fluency level '] == 'native']

df = modified_df



In [14]:
# Check null values in both datasets (pandas dataframe)
print("Null values in TRAINING dataset:",df.isnull().sum().sum())
print("Null values in EVALUATION dataset:",evaluation_df.isnull().sum().sum())



Null values in TRAINING dataset: 0
Null values in EVALUATION dataset: 0


In [15]:
df['gender'] = df['gender'].map(encode_dict_data.gender_map)
df['Self-reported fluency level '] = df['Self-reported fluency level '].map(encode_dict_data.language_fluency_map)
df['ageRange'] = df['ageRange'].map(encode_dict_data.age_range_map)
df['Current language used for work/school'] = df['Current language used for work/school'].map(encode_dict_data.current_language_map)
df['First Language spoken'] = df['First Language spoken'].map(encode_dict_data.first_language_map)


# Try to combine action & object in 1 column
df["action-object"] = df['action'].astype(str) +"-"+ df["object"]

In [16]:
# map features in evaluation_df to predefined dictionaries
evaluation_df['gender'] = evaluation_df['gender'].map(encode_dict_data.gender_map)
evaluation_df['Self-reported fluency level '] = evaluation_df['Self-reported fluency level '].map(encode_dict_data.language_fluency_map)
evaluation_df['ageRange'] = evaluation_df['ageRange'].map(encode_dict_data.age_range_map)
evaluation_df['Current language used for work/school'] = evaluation_df['Current language used for work/school'].map(encode_dict_data.current_language_map)
evaluation_df['First Language spoken'] = evaluation_df['First Language spoken'].map(encode_dict_data.first_language_map)

In [17]:

df = df.join(df['path'].apply(get_features))
evaluation_df = evaluation_df.join(evaluation_df['path'].apply(get_features))


  mfcc = librosa.feature.mfcc(data, sr = sample_rate, n_mfcc=30)
  rolloff = librosa.feature.spectral_rolloff(data)[0]
  spectral_centroid = librosa.feature.spectral_centroid(data)[0]
  spectral_contrast = librosa.feature.spectral_contrast(data)[0]
  spectral_bandwidth = librosa.feature.spectral_bandwidth(data)[0]
 -7.0664035e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(data, sr = sample_rate, n_mfcc=30)
 -7.0664035e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  rolloff = librosa.feature.spectral_rolloff(data)[0]
 -7.0664035e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  spectral_centroid = librosa.feature.spectral_centroid(data)[0]
 -7.0664035e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will res

In [18]:
# Import dataframes from previously saved csv files so we don't need to get features again

# df = pd.read_csv(r'save_csv/training2.csv').iloc[:,1:]
# evaluation_df = pd.read_csv(r'save_csv/evaluation2.csv').iloc[:,1:]


In [19]:
x = df.drop(['Id','path','speakerId','action','object','action-object','Self-reported fluency level ','First Language spoken','Current language used for work/school'],axis=1)
y = df[['action-object']].copy()
x_evaluation = evaluation_df.drop(['Id','path','speakerId','Self-reported fluency level ','First Language spoken','Current language used for work/school'],axis=1)

# Change column names from Int to Str to avoid error by SKLEARN
x.columns = x.columns.astype(str)
x_evaluation.columns = x_evaluation.columns.astype(str)


In [20]:
df.to_csv('save_csv/training2.csv')
evaluation_df.to_csv('save_csv/evaluation2.csv')

In [21]:

# x = df.drop(['Id','path','speakerId','action','object','action-object','Self-reported fluency level ','First Language spoken','gender','ageRange'],axis=1)



In [22]:
x

Unnamed: 0,gender,ageRange,0,1,2,3,4,5,6,7,...,200,201,202,203,204,205,206,207,208,209
9010,1,1,-472.125153,27.46936,-4.033377,58.852196,-15.32099,-11.096684,-24.349939,2.503482,...,0.625339,29.681489,14.797038,6.795048,2019.713124,296.428304,-0.245218,2452.725765,2030.348001,2452.725765
9011,1,1,-455.439331,24.492832,-13.73378,60.796482,-12.364906,-15.249555,-29.53157,-1.985768,...,0.381687,26.648305,15.300624,5.63046,1966.11486,301.816662,-0.307614,2390.32727,1940.078524,2390.32727
9012,1,1,-479.745331,20.624519,-15.172877,56.297703,-15.982334,-11.285934,-32.52647,2.319024,...,0.262787,27.473404,14.946695,6.785409,1980.747756,282.318874,-0.288538,2705.974911,2004.33723,2705.974911


In [23]:
# Apply scaling for PCA since pca is sensitive to the scale of features.
scaler = StandardScaler()

scaler.fit(x)  # Don't cheat - fit only on training data
X_scaled = scaler.transform(x)
# X_evaluation_scaled = scaler.transform(x_evaluation)  # apply same transformation to test data

# X_scaled = scaler.fit_transform(x)
# X_evaluation_scaled = scaler.fit_transform(x_evaluation)


# pca = PCA(n_components=150).fit(X_scaled)
# X_pca = pca.transform(X_scaled)
# X_evaluation_pca = pca.transform(X_evaluation_scaled)
# print(sum(pca.explained_variance_ratio_)) 


In [26]:
#train_test_split splits the data into 80% training data and 20% test data
x_train, x_test, y_train, y_test = train_test_split(X_scaled, y,test_size = .2,random_state = 42, shuffle = True)



In [27]:

#Create a Classifier
rf_clf=RandomForestClassifier(n_estimators=100)


#Train the model using the training sets 
rf_clf.fit(x_train,np.ravel(y_train))
y_pred_rf=rf_clf.predict(x_test)

# Model Accuracy using test data (20%)
print("Test set accuracy Random Forest:",metrics.accuracy_score(y_test, y_pred_rf))

Test set accuracy Random Forest: 1.0


Unnamed: 0,action-object
9010,decrease-heat
9011,decrease-heat
9012,decrease-heat


In [28]:

svm_clf = svm.SVC(kernel = 'rbf', C=10, gamma=0.01)
svm_clf.fit(x_train,np.ravel(y_train))
y_pred_svm=svm_clf.predict(x_test)


print("Test set accuracy using SVM:",metrics.accuracy_score(y_test, y_pred_svm))

# 0.646


ValueError: The number of classes has to be greater than one; got 1 class

In [None]:
# Define the parameter grid for the SVM
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

# Create a SVM with an RBF kernel
svm = SVC(kernel='rbf')

# Perform the grid search using 10-fold cross-validation
grid_search = GridSearchCV(svm, param_grid, cv=2)
grid_search.fit(x_train, np.ravel(y_train))

# Print the best parameters and the corresponding mean test score
print("Best parameters: ",grid_search.best_params_)
print("Best score: ",grid_search.best_score_)

In [None]:
y_evaluation=svm_clf.predict(X_evaluation_scaled)

y_evaluation = list(map(lambda s: s.replace("-", ""), y_evaluation))

y_evaluation_df = pd.DataFrame(y_evaluation, columns = ['Predicted'])
y_evaluation_df.index.name = 'Id'


In [None]:
now = int(time.time())

y_evaluation_df.to_csv(f'evaluation/copy_predictions{now}.csv',index=True,header=True)
