In [2]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read 
import os

In [3]:
# Relative path of the development CSV, kept in 'data_path'
data_path =   "dsl_data/development.csv"
# Load the CSV found at 'data_path' into the DataFrame 'df'
df = pd.read_csv(data_path)
# Bare last expression: the notebook renders the loaded frame
df

Unnamed: 0,Id,path,speakerId,action,object,Self-reported fluency level,First Language spoken,Current language used for work/school,gender,ageRange
0,0,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0a312...,2BqVo8kVB2Skwgyb,change language,none,advanced,English (United States),English (United States),female,22-40
1,1,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/0ee42...,2BqVo8kVB2Skwgyb,activate,music,advanced,English (United States),English (United States),female,22-40
2,2,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/1d9f3...,2BqVo8kVB2Skwgyb,deactivate,lights,advanced,English (United States),English (United States),female,22-40
3,3,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/269fc...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40
4,4,dsl_data/audio/speakers/2BqVo8kVB2Skwgyb/5bbda...,2BqVo8kVB2Skwgyb,increase,volume,advanced,English (United States),English (United States),female,22-40
...,...,...,...,...,...,...,...,...,...,...
9849,9849,dsl_data/audio/speakers/vnljypgejkINbBAY/4fb3d...,vnljypgejkINbBAY,decrease,volume,native,English (United States),English (United States),male,22-40
9850,9850,dsl_data/audio/speakers/vnljypgejkINbBAY/59e6a...,vnljypgejkINbBAY,deactivate,lights,native,English (United States),English (United States),male,22-40
9851,9851,dsl_data/audio/speakers/vnljypgejkINbBAY/5c81c...,vnljypgejkINbBAY,deactivate,lights,native,English (United States),English (United States),male,22-40
9852,9852,dsl_data/audio/speakers/vnljypgejkINbBAY/5ef42...,vnljypgejkINbBAY,deactivate,lights,native,English (United States),English (United States),male,22-40


In [4]:
# Total number of missing cells across the whole development DataFrame
# (per-column counts first, then summed into one grand total).
df.isna().sum().sum()

0

In [5]:
# df = df.iloc[:100] 

In [6]:
# Inspect the distinct values of the two target columns, 'object' and 'action'.
# A notebook cell only renders its final expression, so the 'object' values are
# printed explicitly; as a bare first expression their result was discarded.
print(df['object'].unique())
df['action'].unique()


array(['change language', 'activate', 'deactivate', 'increase',
       'decrease'], dtype=object)

In [7]:
import dict_data

# Column name -> lookup table (defined in dict_data) used to re-encode that column.
# Note that 'Self-reported fluency level ' really does end with a trailing space.
categorical_encodings = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}

# Each column is overwritten in place with its mapped values.
# NOTE(review): because the columns are overwritten, re-running this cell maps
# the already-encoded values a second time - reload df before re-running.
for column_name, lookup in categorical_encodings.items():
    df[column_name] = df[column_name].map(lookup)

# Single combined target label of the form "<action>-<object>"
df["action-object"] = df['action'].astype(str) + "-" + df["object"]


In [8]:
# Current working directory with a trailing slash (not used by the code below).
path = os.getcwd() + "/"

# For every row, read the WAV file named in 'path' and reduce its samples to a
# single number: the arithmetic mean over all sample values. read() returns
# (sample_rate, samples); only the samples are used.
# NOTE(review): the mean of a waveform carries very little information about
# what was said - consider richer audio features.
audios_array = [
    np.mean(read(wav_path, mmap=False)[1])
    for wav_path in df['path']
]

# One scalar per row, stored as the 'audio' feature column.
df['audio'] = audios_array

In [9]:
from sklearn.model_selection import train_test_split

# Feature matrix: the encoded speaker metadata plus the per-file 'audio' value.
# ('Self-reported fluency level ' ends with a trailing space in the data.)
x = df[['Self-reported fluency level ','First Language spoken', 'Current language used for work/school', 'gender', 'ageRange','audio']].copy()
# Target: the combined "action-object" label, kept as a one-column DataFrame.
y = df[['action-object']].copy()


# Split into 75% training data and 25% test data. The test size is spelled out
# instead of relying on the library default, and random_state is pinned so that
# a Restart & Run All reproduces the same split (and the same metrics).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [10]:
# Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Create a classifier with 100 trees. random_state is pinned so that the
# bootstrap samples and feature sub-sampling - and therefore the fitted model
# and its predictions - are the same on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training split. y_train is a one-column DataFrame, so
# it is flattened to a 1-D array of labels before fitting.
clf.fit(x_train, y_train.values.ravel())
print("FIT")

# Predict the labels of the held-out test split
y_pred = clf.predict(x_test)

y_pred

FIT


array(['change language-none', 'increase-volume', 'decrease-volume', ...,
       'deactivate-lights', 'increase-volume', 'increase-heat'],
      dtype=object)

In [11]:
# from sklearn import svm

# clf = svm.SVC()
# clf.fit(x_train,y_train)
# y_pred=clf.predict(x_test)
# y_pred


In [12]:

# from sklearn.tree import DecisionTreeClassifier
# clf = DecisionTreeClassifier(max_depth = 10, min_impurity_decrease=0.01)
# clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# y_pred

In [13]:
# scikit-learn's metrics module provides the accuracy computation
from sklearn import metrics

# Fraction of held-out test rows whose predicted label equals the true label
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.18506493506493507


In [14]:
# Relative path of the evaluation CSV
evaluation_path = "dsl_data/evaluation.csv"

# Load the evaluation CSV into 'evaluation_df'
evaluation_df = pd.read_csv(evaluation_path)

# Report the number of missing cells in the evaluation set. As a bare expression
# in the middle of the cell this value was computed and then thrown away.
print("Missing values in evaluation set:", evaluation_df.isnull().sum().sum())

import dict_data

# Re-encode the categorical columns through the same dict_data lookup tables
# that are applied to the development set.
# ('Self-reported fluency level ' ends with a trailing space in the data.)
evaluation_encodings = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}
for column_name, lookup in evaluation_encodings.items():
    evaluation_df[column_name] = evaluation_df[column_name].map(lookup)

# For every row, read the WAV file named in 'path' and reduce its samples to
# their arithmetic mean. read() returns (sample_rate, samples); only the
# samples are used.
audios_array = [
    np.mean(read(wav_path, mmap=False)[1])
    for wav_path in evaluation_df['path']
]

# Save the per-file values as a new column in the evaluation dataframe
evaluation_df['audio'] = audios_array

# Feature matrix for prediction: same columns, in the same order, as the
# training features.
x_evaluation = evaluation_df[['Self-reported fluency level ','First Language spoken', 'Current language used for work/school', 'gender', 'ageRange', 'audio']].copy()



In [15]:
# Predict the combined "action-object" label for every evaluation row
y_evaluation = clf.predict(x_evaluation)

# Convert each label to the submission format: drop the "-" separator first,
# then drop the placeholder text "none" (same order as before, so e.g.
# "change language-none" becomes "change language").
# A previous for-loop that reassigned its own loop variable was removed: Python
# strings are immutable, so it never modified y_evaluation.
modified_y_evaluation = [
    label.replace("-", "").replace("none", "") for label in y_evaluation
]

# One 'Predicted' column; the default 0..n-1 index is exported as the 'Id' column.
# NOTE(review): this assumes the row order of evaluation_df matches the expected Ids - confirm.
y_evaluation_df = pd.DataFrame(modified_y_evaluation, columns = ['Predicted'])
y_evaluation_df.index.name = 'Id'

# Make sure the output directory exists; to_csv does not create missing folders.
output_path = 'evaluation/predictions.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
y_evaluation_df.to_csv(output_path, index=True, header=True)