# Starter Code for Exploring Accent and Identity

## The first and most important step when using machine learning, and particularly in an unsupervised way, is establishing patterns in your data that don't overfit or underfit.

## Here, we want the machine to learn to reduce dimensions by clustering similar speakers according to the ratings the participants provided.

In [None]:
# modules
import sklearn
import numpy as np
import pandas as pd
import os
import sys
import time
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, get_scorer, accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance

In [None]:
# Set up the directory and file locations, then read in the ratings file.
# `data_orig` is an independent copy of the table, so later edits to `data`
# cannot alter the original values.

# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept here because other cells may refer to it.
dir = 'your dir'
ratings_features_fname = os.path.join(dir, 'queer_data.csv')
data = pd.read_csv(ratings_features_fname)
data_orig = data.copy()

# Folder the plotting cells save their figures into (they call
# os.path.join(fig_dir, ...)). Defaults to the data folder; point it elsewhere
# if you want the figures kept separately.
fig_dir = dir

### Choose from one of the three cells below to either plot the distortions, plot the silhouette scores, or simply run and plot the k-means model according to a number of clusters. The code may need to be broken up into a few different cells, but you shouldn't need more than what's shown below.

### For this cell, you may want to just set up the variables and explore the data a little bit to get a sense of the ratings, number of participants, conditions, and original WAV files before then clustering based on ratings.

In [None]:
### K-means. Set up your training variables and values.
print("K-Means clustering...")
x = data[['Participant','Rating_z_score','Condition','WAV']]
# Reshape to one row per WAV file and one column per Condition; each cell holds
# the mean z-scored rating. The string 'mean' is used instead of np.mean because
# recent pandas versions warn when a NumPy callable is passed as aggfunc.
x = x.pivot_table(index=['WAV'], columns='Condition', values='Rating_z_score', aggfunc='mean')
# Feature matrix for clustering: the first three Condition columns as a NumPy array.
y = x.iloc[:, [0, 1, 2]].values


# Collect the distortion (inertia) for each candidate k to make the elbow plot.
# n_init and random_state are fixed so the curve is reproducible from run to run
# and uses the same settings as the other K-means cells.
distortions = []
K = range(1, 10)
for k in K:
    kmeanModel = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeanModel.fit(y)
    distortions.append(kmeanModel.inertia_)

# Plotting the distortions for the elbow plot
plt.figure(figsize=(16, 8))
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal clusters')
plt.savefig(os.path.join(fig_dir, 'kmeans_cluster_elbow.png'), bbox_inches='tight', dpi=300)
plt.close()

### For this cell, it may be helpful to just plot one cluster number rather than looping through multiple clusters to create a franken-graph.

In [None]:
## Silhouette to establish cluster number
from sklearn import datasets
from sklearn.cluster import KMeans
import yellowbrick
import matplotlib.pyplot as plt
from yellowbrick.cluster import SilhouetteVisualizer

### plotting the silhouettes for multiple sizes of cluster numbers
fig, ax = plt.subplots(3, 2, figsize=(15,8))
# Pair each candidate cluster count with one panel; ax.flat walks the 3x2 grid
# row by row, so 2 and 3 share the top row, 4 and 5 the middle, 6 and 7 the bottom.
for i, panel in zip([2, 3, 4, 5, 6, 7], ax.flat):
    # Create a KMeans instance for this number of clusters
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    # Create a SilhouetteVisualizer around the KMeans instance, drawing on this panel
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
    # Fit on the feature matrix `y` rather than the DataFrame `x`: `x` gains a
    # 'kmeans_3_cluster' label column once the 3-cluster cell has run, and that
    # column must not be used as a clustering feature.
    visualizer.fit(y)
    # Add the title and axis labels to this panel
    visualizer.finalize()

### For this cell, you don't need to specify color or labels at first. You can just plot your favorite number of clusters and then adjust later.

In [None]:
# Define the model for 3 clusters. n_init is set explicitly so the result does
# not depend on the scikit-learn version's default and matches the silhouette cell.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=42)
# Fit the model on the feature matrix and get one cluster index per row of y
kmeans_predict = kmeans_model.fit_predict(y)
# Store the assignment next to the ratings so each WAV row carries its cluster
x['kmeans_3_cluster'] = kmeans_predict

# Visualising the clusters: one colour and legend label per cluster index,
# plotted on the first two columns of y.
# NOTE(review): K-means numbers its clusters arbitrarily - confirm that clusters
# 0/1/2 really correspond to the 'SW'/'QE'/'SM' labels before interpreting the plot.
cluster_styles = [('#785EF0', 'SW'), ('#DC267F', 'QE'), ('#648FFF', 'SM')]
for cluster_id, (colour, label) in enumerate(cluster_styles):
    members = y[kmeans_predict == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=label)

# Plotting the centroids of the clusters
plt.scatter(kmeans_model.cluster_centers_[:, 0], kmeans_model.cluster_centers_[:, 1], s=100, c='black', label='Centroids')
# Label the axes with the two Condition columns being plotted
plt.xlabel(str(x.columns[0]))
plt.ylabel(str(x.columns[1]))
plt.legend()
plt.title('K-Means: 3 Clusters from Participant Ratings')
plt.savefig(os.path.join(fig_dir, 'kmeans_cluster_3.png'), bbox_inches='tight', dpi=300)
plt.close()

## Now that you have your clusters, you can start asking questions about who the clusters of speakers are, and what acoustic information in the signal contributed to listeners grouping the speakers this way.