In [2]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read 
import os

In [3]:
# Relative locations of the development and evaluation CSV files.
data_path = "dsl_data/development.csv"
evaluation_path = "dsl_data/evaluation.csv"

# Read both files into DataFrames: `df` (development) and `evaluation_df` (evaluation).
df, evaluation_df = (
    pd.read_csv(csv_path) for csv_path in (data_path, evaluation_path)
)

In [4]:
# Total number of missing cells across every column of the development DataFrame.
df.isna().sum().sum()

0

In [5]:
# Total number of missing cells across every column of the evaluation DataFrame.
evaluation_df.isna().sum().sum()

0

In [6]:
import dict_data 

# Lookup table from dict_data for each categorical column of the development set.
# Note that the fluency column name really does end with a trailing space.
development_encoders = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}

# Replace every categorical column, in place, by its mapped values.
for column_name, encoder in development_encoders.items():
    df[column_name] = df[column_name].map(encoder)

# Build the combined target label "<action>-<object>" in a single column.
action_text = df['action'].astype(str)
df["action-object"] = action_text + "-" + df["object"]

In [7]:
# Lookup table from dict_data for each categorical column of the evaluation set.
# Note that the fluency column name really does end with a trailing space.
evaluation_encoders = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}

# Replace every categorical column, in place, by its mapped values.
for column_name, encoder in evaluation_encoders.items():
    evaluation_df[column_name] = evaluation_df[column_name].map(encoder)

In [8]:
from algorithms import get_features

# Run get_features on every recording path and attach the result to each frame.
# NOTE(review): presumably get_features returns a Series per path, so that
# .apply() yields one column per feature -- confirm against algorithms.py.
development_features = df['path'].apply(get_features)
evaluation_features = evaluation_df['path'].apply(get_features)

df = df.join(development_features)
evaluation_df = evaluation_df.join(evaluation_features)


  3.8942535e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mfcc = librosa.feature.mfcc(data, sr = sample_rate, n_mfcc=30)
  3.8942535e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  rolloff = librosa.feature.spectral_rolloff(data)[0]
  3.8942535e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  spectral_centroid = librosa.feature.spectral_centroid(data)[0]
  3.8942535e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  spectral_contrast = librosa.feature.spectral_contrast(data)[0]
  3.8942535e-06  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  spectral_bandwidth = librosa.feature.spectral_bandwidth(data)[0]
  1.0317751e-06  0.0000000e+00] as keyw

In [None]:

# Columns of the evaluation frame that are not used as model inputs.
evaluation_non_feature_columns = ['Id', 'path', 'speakerId']

# Feature matrix for the evaluation set, with every column label cast to str.
x_evaluation = evaluation_df.drop(columns=evaluation_non_feature_columns)
x_evaluation.columns = x_evaluation.columns.astype(str)


In [None]:
# Write CSV snapshots of the development and evaluation frames
# into the current working directory.
df.to_csv(path_or_buf='copy_file_name.csv')
evaluation_df.to_csv(path_or_buf='copy_file_name2.csv')


In [None]:
# Preview only the first rows instead of dumping the whole frame as cell output.
df.head()

NameError: name 'df' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Identifier and label columns that must not be fed to the model as features.
development_non_feature_columns = [
    'Id', 'path', 'speakerId', 'action', 'object', 'action-object',
]

# Feature matrix, and the combined label kept as a one-column DataFrame.
x = df.drop(columns=development_non_feature_columns)
y = df[['action-object']].copy()

# Cast every column label to str so scikit-learn does not reject mixed int/str names.
x.columns = x.columns.astype(str)


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardise the features before PCA, because PCA is sensitive to feature scale.
# The scaler is fitted on the development features only and then *applied* to the
# evaluation features. Re-fitting it on the evaluation set (fit_transform) would
# standardise that set with different means/variances than the ones the PCA and
# the classifiers were trained on.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)
X_evaluation_scaled = scaler.transform(x_evaluation)

# Project both sets onto the 80 principal components learned from the development set.
# NOTE(review): scaler and PCA are fitted on all of `x` before the train/test split,
# so the held-out accuracy reported later is slightly optimistic.
pca = PCA(n_components=80).fit(X_scaled)
X_pca = pca.transform(X_scaled)
X_evaluation_pca = pca.transform(X_evaluation_scaled)


In [None]:
# Fraction of the total variance retained by the fitted PCA components.
retained_variance = sum(pca.explained_variance_ratio_)
print(retained_variance)

In [None]:
# Hold out 15% of the PCA-projected samples as a test set (85% remain for training).
# Rows are shuffled with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.15,
    shuffle=True,
    random_state=42,
)



In [None]:
# Random forest classifier and the metrics module used to score it.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# 100 trees; the seed is fixed so the reported accuracy is reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# y_train is a one-column DataFrame; flatten it to 1-D so scikit-learn does not
# emit a DataConversionWarning about a column-vector target.
rf_clf.fit(x_train, y_train.to_numpy().ravel())
y_pred_rf = rf_clf.predict(x_test)

# Accuracy on the held-out test split.
print("Test set accuracy Random Forest:", metrics.accuracy_score(y_test, y_pred_rf))

In [None]:
from sklearn import metrics, svm

# RBF-kernel support vector classifier, also asked to fit probability estimates.
svm_clf = svm.SVC(probability=True, kernel='rbf')

# Fit on the training split, then predict the held-out test split.
svm_clf.fit(x_train, y_train)
y_pred_svm = svm_clf.predict(x_test)

# Accuracy on the held-out test split.
svm_accuracy = metrics.accuracy_score(y_test, y_pred_svm)
print("Test set accuracy using SVM:", svm_accuracy)



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate values for the SVC regularisation parameter C and RBF kernel coefficient gamma.
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

# Base estimator for the search. It is deliberately NOT named `svm`: that name is
# bound to the sklearn.svm module by the SVM cell, and rebinding it to an
# estimator would break that cell on re-run.
svm_rbf = SVC(kernel='rbf')

# Grid search with 3-fold cross-validation, fitted on the training split only.
# The labels must be y_train (same number of rows as x_train); passing the full
# `y` raises an inconsistent-sample-count error. The one-column label frame is
# flattened to 1-D to avoid a column-vector warning on every fit.
grid_search = GridSearchCV(svm_rbf, param_grid, cv=3)
grid_search.fit(x_train, y_train.to_numpy().ravel())

# Best hyper-parameters and their mean cross-validated score.
print("Best parameters: ",grid_search.best_params_)
print("Best score: ",grid_search.best_score_)

In [None]:
# Predict a label for every evaluation sample with the fitted SVM and strip the
# "-" separator from each predicted "<action>-<object>" string.
y_evaluation = [
    label.replace("-", "") for label in svm_clf.predict(X_evaluation_pca)
]

# Single-column frame of predictions; the positional index is exported as 'Id'.
y_evaluation_df = pd.DataFrame({'Predicted': y_evaluation}).rename_axis('Id')


In [None]:
import time

# Unix timestamp (whole seconds) used to give each predictions file a distinct name.
now = int(time.time())

# to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs('evaluation', exist_ok=True)

# Write the predictions with the 'Id' index and the header row included.
y_evaluation_df.to_csv(f'evaluation/copy_predictions{now}.csv', index=True, header=True)
