-
Notifications
You must be signed in to change notification settings - Fork 4
/
analysis_tools.py
397 lines (347 loc) · 16.3 KB
/
analysis_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib
matplotlib.use('tkagg')
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.preprocessing import StandardScaler
from matplotlib.patches import Ellipse
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from keras.layers import Input, Dense
from keras.models import Model
from keras import losses
from sklearn.cluster import DBSCAN as dbizzle
from sklearn.neighbors import NearestNeighbors
def draw_ellipse(position, covariance, ax=None, **kwargs):
"""
Draws an ellipse with a given position and covariance (for clustering)
"""
ax = ax or plt.gca()
if covariance.shape == (2, 2):
U, s, Vt = np.linalg.svd(covariance)
angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
width, height = 2*np.sqrt(s)
else:
angle = 0
width, height = 2*np.sqrt(covariance)
for nsig in range(1, 4):
ax.add_patch(Ellipse(position, nsig*width, nsig*height, angle, **kwargs))
def plot_gmm(gmm, x, label=True, ax=None):
"""
Ex:
gmm = GaussianMixture(n_components=3).fit(X)
plot_gmm(gmm, X)
"""
ax = ax or plt.gca()
labels = gmm.fit(x).predict(x)
if label:
ax.scatter(x[0], x[1], c=labels, s=40, cmap='viridis', zorder=2)
else:
ax.scatter(x[0], x[1], s=40, zorder=2)
ax.axis('equal')
w_factor = 0.2/gmm.weights_.max()
for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
draw_ellipse(pos, covar, alpha=w*w_factor)
def get_pca_components(x, num_components=2, incremental=False, batch_size=100):
"""
Converts input data into its reduced PCA components. Can be used to visualize in 2D.
Set incremental to true if memory is an issue (i.e., large dataset)
:param x: n-dimensional input data to be reduced to d-dimensional
:param num_components: default=2, d-dimensions
:param incremental: true for large datasets, memory issues
:param batch_size: refers to the number of rows processed per iteration
:return: Dataframe of the resulting components.
"""
x_standardized = StandardScaler().fit_transform(x)
if incremental:
pca = IncrementalPCA(n_components=num_components, batch_size=batch_size)
else:
pca = PCA(n_components=num_components)
x_pca = pca.fit_transform(x_standardized)
x_pca = pd.DataFrame(data=x_pca)
return x_pca
def get_cluster_assignments(x, num_clusters):
"""
Quick way to retrieve cluster labels directly using k_means, passing value k
:param x: type dataframe
:param num_clusters: can be based on the visualization of first 2 principal components
:return: labels/cluster assignments
"""
km = KMeans(n_clusters=num_clusters).fit(x)
return km.labels_
def scale(df, scale_type="standardize"):
"""
:param df: contains only the columns of numerical data, non identifier data
:param scale_type: (default) 'standardize' = standard scaler (i.e., subtract by mean, divide by std deviation OR
'normalize' = min max scaler (i.e., on a scale from 0 to 1)
:return: standardized columns as a dataframe
"""
scaler = None
if scale_type == "normalize":
scaler = preprocessing.MinMaxScaler()
elif scale_type == "standardize":
scaler = preprocessing.StandardScaler()
scaler.fit_transform(df)
return scaler.transform(df)
def variance_threshold_fs(x, threshold=0.0):
"""
Removes all features below certain variance level (default is 0 variance)
:param x: all numerical data features
:param threshold: variance level
:return: remaining features after removing those under threshold
"""
fs = VarianceThreshold(threshold=threshold)
fs.fit(x)
return x[x.columns[fs.get_support(indices=True)]]
def extra_trees_fs(x, y, n_estimators=50):
"""
Ensemble tree-based method to choose most useful features (feature importance)
evaluated by predicting a given label
:param x: all numerical data features
:param y: label wished to predict (will be encoded)
:param n_estimators: number of ensemble trees to use
:return: remaining features after removing irrelevant features (using SelectFromModel)
"""
le = preprocessing.LabelEncoder()
le.fit(y)
labels = le.transform(y)
clf = ExtraTreesClassifier(n_estimators=n_estimators)
clf = clf.fit(x, labels)
model = SelectFromModel(clf, prefit=True)
return x[x.columns[model.get_support(indices=True)]]
def autoencoder(data, dim_enc1, dim_enc2):
"""
Trains a stacked autoencoder with two hidden layers. Reduces data
to the size dim_enc2 and saves the reduced dataset to a csv
:param data: a pandas dataframe
:param dim_enc1: dimension of first hidden layer of autoencoder. First dimensionality reduction
:param dim_enc2: dimension of last hidden layer of autoencoder. Last dimensionality reduction and output data size
:return: reduced, encoded dataset
"""
in_size = data.shape[1]
print('Number of attributes in dataset:', in_size)
# placeholder for input vector
input_vect = Input(shape=(in_size, ))
# placeholder for encoding
encoded = Dense(dim_enc1, activation='relu')(input_vect)
# encoded = Dense(dim_enc2, activation='relu')(encoded)
# placeholder for decoding
# decoded = Dense(dim_enc1, activation='relu')(encoded)
decoded = Dense(in_size, activation='sigmoid')(encoded)
# full autoencoder placeholder
auto_model = Model(input_vect, decoded)
# encoder portion of autoencoder
encoder = Model(input_vect, encoded)
# the reconstructed representation at the end of the autoencoder
auto_model.compile(optimizer='adadelta', loss=losses.mean_squared_error)
# Training
auto_model.fit(data, data, epochs=15, batch_size=256, shuffle=True)
# Extract encoded data
enc_data = pd.DataFrame(encoder.predict(data))
print('Saving encoded data...')
enc_data.to_csv('recipes_encoded1000.csv')
print('Encoded data was successfully saved to a csv')
return enc_data
def dbscan(data, eps, min_samps):
"""
Perform DBSCAN density-based clustering on the given dataset
:param data: a pandas dataframe to be clustered
:param eps: hyperparameter of DBSCAN --- distance between points to be considered members of same neighborhood
:param min_samps: hyperparameter of DBSCAN --- number of neighbors for a observation to be considered core point
:return: the same pandas dataframe with clusters attached to each row
"""
print('Clustering input data with DBSCAN...')
clustering = dbizzle(eps=eps, min_samples=min_samps).fit(data)
clusters = clustering.labels_
print('DBSCAN complete.')
print(clusters)
print("The unique clusters:", np.unique(clusters))
# count the number of noise points in the clustering
noise_count = 0
for i in range(len(clusters)):
if clusters[i] == -1:
noise_count += 1
print("Number of noise points:", noise_count)
# append the clustering to the end of the input dataframe
data['clusters'] = clusters
return data
def cluster_sampling(data, c_type, w=[], c={}):
"""
Based on the clustering type that is input, a uniform distribution among
the clustering is returned.
:param c_type: an integer representing the clustering type. For simplicity,
0: 'e100_gmm54'
1: 'e100_gmm25'
2: 't186_gmm4'
:return c: a list of 10 cluster numbers based on the choice of a uniform distribution
:return clusters: a dictionary of clusters where each key is the cluster number and each value is a dataframe
containing all datapoints from only that cluster
"""
choices = ['e100_gmm54', 'e100_gmm25', 't186_gmm4']
# retrieve the corresponding column of the dataset
col = data[choices[c_type]]
num_clusters = np.max(col)
if len(w) == 0:
weights = []
# calculate the weights and separate the clusters
for j in range(num_clusters + 1):
weights.append(1 / (num_clusters + 1))
else:
weights = w
if len(c) == 0:
clusters = {}
for j in range(num_clusters + 1):
# keys correspond to cluster number, values correspond to dataframe with only observations of that cluster
clusters[str(j)] = data.loc[data[choices[c_type]] == j]
else:
clusters = c
# Now we can sample a cluster number based on the initial weights
print("Keys:", list(clusters.keys()))
print("Weights:", weights)
sampled_clusters = np.random.choice(list(clusters.keys()), 10, p=weights)
print("Clusters to sample from:", sampled_clusters)
return weights, sampled_clusters, clusters
def sample_from_cluster(choices, clusters):
"""
Performs random sampling for each given cluster
:param choices: the cluster numbers (chosen by a weighted sampling) on which to perform a random sampling
:param clusters: a dictionary where each key is the cluster number and each value is a dataframe with
observations from only that cluster
:return: len(choices) number of observations
"""
size = len(choices)
rlist = []
index_list = []
i = 0
while len(rlist) < size:
cluster = clusters[str(choices[i])] # the dataframe of that cluster only
sample = cluster.sample(1)
recipe = sample.name.values
index = cluster.loc[cluster['name'] == recipe[0]].index[0]
# make sure that there aren't any repeats in that sample
if recipe[0] not in rlist:
rlist.append(recipe[0])
index_list.append(index)
print(recipe[0], ':', choices[i])
i += 1
return rlist, index_list
def reweight(w, rate_dict, choices, clusters):
"""
Helps the system converge to a preference by the user
:param w: the vector of weights corresponding to each cluster
:param rate_dict: a dictionary with 10 recipes as keys and 10 ratings as values
:param choices: a list of 10 values each corresponding to a cluster. Recipe i is from the cluster corresponding to
the ith index of this list
:param clusters: a dictionary where each key is the cluster number and each value is the dataframe of only that
cluster's observations
:return: a new vector of weights based upon the ratings given by the user
"""
ratings = np.array(list(rate_dict.values()))
print("Ratings:", ratings)
print("Clusters corresponding to recipe:", choices)
choices = np.array([int(i) for i in choices]) # convert entries to integers
tot_clusts = len(clusters)
print('Total Number of Clusters:', tot_clusts)
divs = np.zeros(tot_clusts) # dividends used to average scores of clusters that have more than one recipe present
tot_scores = np.zeros(tot_clusts) # the total scores before averaging
for i in range(tot_clusts):
divs[i] = np.sum(choices == i) # allows us to average the ratings corresponding to the recipes' clusters
indices = choices == i # a 1 will be at the index where the recipe belongs to the ith cluster
tot_scores[i] = np.dot(indices, ratings)
# Perform an element-wise division
avg_scores = np.divide(tot_scores, divs, out=np.zeros_like(tot_scores), where=divs != 0)
print("Dividends:", divs)
print("Total Scores:", tot_scores)
print("Average Scores:", avg_scores)
# This will need to be re-normalized
new_w = avg_scores * w # element-wise product of average_scores and the old weight vector will give us the new w
# Establish a vector based on presence of clusters in the recipe sample
presence = np.zeros(tot_clusts)
for i in range(tot_clusts):
if i in choices:
presence[i] = 1
new_w *= presence # insures that clusters that are not present will not be re-weighted
norm_to = np.dot(w, presence) # normalize only with respect to the present clusters' weights
print("Clusters Present:", presence)
new_w = (new_w / np.sum(new_w)) * norm_to
new_w += w * (presence == 0) # add back in the old weights of absent clusters (these are unaffected)
print(new_w)
print(np.sum(new_w))
return new_w
def find_info(rec_rate, ind_list, clustering, recipe_info, clusters):
"""
Get the info corresponding the the highest-rated recipe. Cluster number and index of recipe
:param rec_rate: a dictionary where each key is a recipe and each value is the rating. The final pass.
:param ind_list: the list of indices of each recipe in the original dataset
:param clustering: the clustering that is being used --- again indexed by integer for simplicity
0: e100_gmm54
1: e100_gmm25
2: t186_gmm4
:param recipe_info: the recipe_info.csv dataframe with recipe names, types, subtypes, and corresponding clusters
:param clusters: dictionary of subsetted clusters. Key corresponding to cluster number
:return: the subset of data that contains FULL observations corresponding to only that cluster number
:return index: the index in the original dataset of the highest rated recipe
"""
# Find the highest rated recipe
max_rating = 0
max_recipe = None
max_index = None
size = len(rec_rate)
ratings = list(rec_rate.values())
recipes = list(rec_rate.keys())
for i in range(size):
if ratings[i] > max_rating:
max_rating = ratings[i]
max_recipe = recipes[i]
max_index = ind_list[i]
# Now, we have the highest rated recipe and its index in the original dataset
# Find its corresponding cluster
c_list = ['e100_gmm54', 'e100_gmm25', 't186_gmm4']
clustering = c_list[clustering] # convert from integer interpretation of clustering to string
print("Clustering being used:", clustering)
print('Max Index:', max_index)
cluster = recipe_info.loc[max_index, clustering]
print("The highest-rated recipe:", max_recipe)
print("The recipe's cluster:", cluster)
print("The recipe's index in the dataset:", max_index)
if clustering.startswith('e'):
data = pd.read_csv('Data/recipes_encoded100.csv').iloc[:, 1:]
else:
data = pd.read_csv('Data/recipes_normalized_varFS_extraTrees186.csv').iloc[:, 1:]
# subset the data
# get relevant rows in list format
row_list = list(clusters[str(cluster)].index)
print(recipe_info.loc[row_list, ['name', clustering]])
# return a dataframe of only those rows
subset = data.loc[row_list, :]
return subset, max_index
def get_closest_neighbors(recipe_index, recipe_cluster, n_neighbors=10):
"""
Will find the closest neighbors to the given recipe as a final recommendation
:param recipe_index: the single highest rated recipe in the last round of ratings
:param recipe_cluster: all recipes belonging to the same cluster as the highest rated recipe
:param n_neighbors: number of closest recipes desired
:return: the list of unique recipes closest to the highest rated recipe
"""
recipe_info = pd.read_csv("Data/recipe_info.csv").iloc[:, 1:]
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='correlation')
if len(recipe_cluster) < n_neighbors:
return get_closest_neighbors(recipe_index, recipe_cluster, n_neighbors-1)
cluster_recipe_info = recipe_info.iloc[list(recipe_cluster.index)]
cluster_recipe_info = cluster_recipe_info.reset_index()
print("Cluster subset: ", cluster_recipe_info)
print("Fitting KNN")
knn.fit(recipe_cluster)
print("Fitting complete")
print("Index: ", recipe_index)
neighbors = knn.kneighbors(recipe_cluster.loc[recipe_index].values.reshape(1, -1))
neighbor_distances = neighbors[0][0]
neighbor_indices = neighbors[1][0]
print("Neighbor indices: ", neighbor_indices)
print("Neighbor distances: ", neighbor_distances)
recommendations = cluster_recipe_info.name[neighbor_indices].values
return recommendations, neighbor_distances