In [54]:
import numpy as np
import pandas as pd
from scipy.io.wavfile import read 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn import svm
from sklearn import metrics
from algorithms import get_features
import dict_data 
import os
import time

In [None]:
# Locations of the development and evaluation CSV files
data_path = "dsl_data/development.csv"
evaluation_path = "dsl_data/evaluation.csv"

# Read each CSV file into its own pandas dataframe
df, evaluation_df = pd.read_csv(data_path), pd.read_csv(evaluation_path)

In [None]:
# Total number of missing cells in the training dataframe (summed over every column)
df.isna().sum().sum()

In [None]:
# Total number of missing cells in the evaluation dataframe (summed over every column)
evaluation_df.isna().sum().sum()

In [None]:
# Pass each metadata column of the training dataframe through the matching
# mapping defined in dict_data (column name -> mapping).
# NOTE(review): if these mappings are plain dicts, values absent from them become
# NaN, so re-running this cell on an already-mapped frame would blank the
# columns — confirm before re-executing.
train_column_maps = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}
for column_name, value_map in train_column_maps.items():
    df[column_name] = df[column_name].map(value_map)

# Build one combined label column of the form "<action>-<object>"
df["action-object"] = df['action'].astype(str) + "-" + df["object"]

In [None]:
# Pass each metadata column of evaluation_df through the matching predefined
# mapping from dict_data (column name -> mapping).
eval_column_maps = {
    'gender': dict_data.gender_map,
    'Self-reported fluency level ': dict_data.language_fluency_map,
    'ageRange': dict_data.age_range_map,
    'Current language used for work/school': dict_data.current_language_map,
    'First Language spoken': dict_data.first_language_map,
}
for column_name, value_map in eval_column_maps.items():
    evaluation_df[column_name] = evaluation_df[column_name].map(value_map)

In [None]:

# Run get_features on every entry of the 'path' column and append the result
# to the dataframe as additional columns.
# presumably get_features returns one row of features per path — TODO confirm
train_features = df['path'].apply(get_features)
df = df.join(train_features)

evaluation_features = evaluation_df['path'].apply(get_features)
evaluation_df = evaluation_df.join(evaluation_features)


In [20]:
# Reload both dataframes from previously saved CSV files so the feature
# extraction does not have to be repeated.
df = pd.read_csv(r'save_csv/training2.csv')
df = df.iloc[:, 1:]  # skip the first column (presumably the index written by to_csv)

evaluation_df = pd.read_csv(r'save_csv/evaluation2.csv')
evaluation_df = evaluation_df.iloc[:, 1:]  # same: skip the first column


In [21]:
# Evaluation feature matrix: every column except the identifier/path columns.
# Alternative tried: additionally drop 'Self-reported fluency level ',
# 'First Language spoken', 'gender' and 'ageRange'.
x_evaluation = evaluation_df.drop(columns=['Id', 'path', 'speakerId'])

# Cast all column labels to str (the training features get the same treatment
# to avoid an sklearn error).
x_evaluation.columns = x_evaluation.columns.astype(str)


In [None]:
# Persist both dataframes so they can be reloaded later without recomputing
# the features.
# Create the target directory first: to_csv does not create missing parent
# directories and would raise on a fresh checkout.
os.makedirs('save_csv', exist_ok=True)
df.to_csv('save_csv/training2.csv')
evaluation_df.to_csv('save_csv/evaluation2.csv')

In [None]:
# Show the training dataframe (the bare last expression of a cell is rendered by the notebook)
df

In [22]:

# Training feature matrix: everything except identifiers and the label columns.
# Alternative tried: additionally drop 'Self-reported fluency level ',
# 'First Language spoken', 'gender' and 'ageRange'.
x = df.drop(
    columns=['Id', 'path', 'speakerId', 'action', 'object', 'action-object']
)

# Target: the combined "action-object" label, kept as a one-column dataframe
y = df[['action-object']].copy()

# Column labels are converted from int to str to avoid an sklearn error
x.columns = x.columns.astype(str)


In [39]:
# Standardise the features first, because PCA is sensitive to feature scale.
# The scaler is fitted on the development features only and the same fitted
# transform is then applied to the evaluation features (never re-fit it on
# the evaluation data).
# NOTE(review): the fit uses all of `x`, i.e. before the later hold-out split,
# so the hold-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(x)
X_scaled = scaler.transform(x)
X_evaluation_scaled = scaler.transform(x_evaluation)

# Project onto 150 principal components, fitted on the scaled development data
pca = PCA(n_components=150)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
X_evaluation_pca = pca.transform(X_evaluation_scaled)


In [None]:
# Fraction of the total variance retained by the fitted principal components
total_explained_variance = sum(pca.explained_variance_ratio_)
print(total_explained_variance)

In [40]:
# Shuffle and split the scaled development data (not the PCA projection):
# 80% for training, 20% held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=.2,
    shuffle=True,
    random_state=42,
)



In [41]:

# Random forest classifier with 100 trees.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the reported accuracy, is reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train on the training split; y_train is a one-column dataframe, so flatten
# it to the 1-D label array that fit() expects.
rf_clf.fit(x_train, np.ravel(y_train))
y_pred_rf = rf_clf.predict(x_test)

# Model accuracy on the held-out test data (20%)
print("Test set accuracy Random Forest:", metrics.accuracy_score(y_test, y_pred_rf))

Test set accuracy Random Forest: 0.47234906139015725


In [52]:

# RBF-kernel SVM with C=10 and gamma=0.01.
# Instantiate through the directly imported SVC class so this cell does not
# depend on the name `svm` still referring to the sklearn module (a name that
# is easy to rebind in a notebook, which would break re-running this cell).
svm_clf = SVC(kernel='rbf', C=10, gamma=0.01)

# y_train is a one-column dataframe; flatten it to a 1-D label array for fit()
svm_clf.fit(x_train, np.ravel(y_train))
y_pred_svm = svm_clf.predict(x_test)

# Accuracy on the held-out test split
print("Test set accuracy using SVM:", metrics.accuracy_score(y_test, y_pred_svm))

# 0.646


Test set accuracy using SVM: 0.6463723997970573


In [53]:
# Define the parameter grid for the SVM
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}

# Create an SVM with an RBF kernel.
# Deliberately NOT named `svm`, so the `svm` module imported from sklearn at
# the top of the notebook is not shadowed by an estimator instance.
svc_estimator = SVC(kernel='rbf')

# Perform the grid search using 2-fold cross-validation (cv=2)
grid_search = GridSearchCV(svc_estimator, param_grid, cv=2)
grid_search.fit(x_train, np.ravel(y_train))

# Print the best parameters and the corresponding mean cross-validated score
print("Best parameters: ",grid_search.best_params_)
print("Best score: ",grid_search.best_score_)

Best parameters:  {'C': 10, 'gamma': 0.01}
Best score:  0.5388801153904927


In [43]:
# Predict the combined "action-object" label for every evaluation row with the
# fitted SVM, using the scaled evaluation features.
y_evaluation = svm_clf.predict(X_evaluation_scaled)

# Strip the "-" separator so each prediction becomes the concatenated label
y_evaluation = [label.replace("-", "") for label in y_evaluation]

# One 'Predicted' column; the default positional index is exported as 'Id'.
# NOTE(review): this assumes the evaluation Ids equal the row positions — confirm.
y_evaluation_df = pd.DataFrame(y_evaluation, columns=['Predicted'])
y_evaluation_df.index.name = 'Id'


In [44]:
# Unix timestamp (whole seconds) used to give every export a distinct file name
now = int(time.time())

# Make sure the output directory exists: to_csv does not create missing
# parent directories.
os.makedirs('evaluation', exist_ok=True)

# Write the predictions with the 'Id' index and a header row
y_evaluation_df.to_csv(f'evaluation/copy_predictions{now}.csv',index=True,header=True)
