In [1]:
import pandas as pd
import time
import numpy as np
import scipy.sparse as sparse
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import pairwise_distances_argmin_min
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

In [3]:
# Sparse training matrix on disk; its last column is treated as the class label below.
path_data = "../hufa_train_wiki_w2v/full_centroids_train.npz"

# Read the matrix into memory.
data = sparse.load_npz(path_data)

# Rows with a non-zero last column form the positive class; every remaining
# row index (in ascending order) forms the negative class.
p_row_index = data[:, -1].nonzero()[0]
n_row_index = np.setdiff1d(np.arange(data.shape[0]), p_row_index)
positivos = data[p_row_index, :]
negativos = data[n_row_index, :]

# Report the shape of the full matrix and of each class subset.
print("Loaded train_centroids into data:", data.shape, positivos.shape, negativos.shape)

Loaded train_centroids into data: (164926, 100066) (1509, 100066) (163417, 100066)


In [6]:
# Under-sample the negative class: cluster it with mini-batch k-means and keep,
# for every centroid, the real negative row that lies closest to it.
start = time.time()  # Start time

# Feature columns of the negative rows (the trailing label column is dropped).
MClass = negativos[:, :-1]

# random_state is pinned so the clustering -- and therefore the rows selected and
# written to disk below -- is the same on every run of this cell.
kmedias = MiniBatchKMeans(n_clusters=10000, batch_size=10000, random_state=42).fit(MClass)
# Previously tried: batch_size=3334 (took roughly 2100 seconds).

# For each centroid, the index of its nearest negative row ([0] selects the
# argmin array; the distances in [1] are not needed).
# NOTE(review): nothing here prevents two centroids from sharing the same nearest
# row, so the selection may contain repeated rows -- confirm whether that matters.
new_Xtrain_MClass = MClass[pairwise_distances_argmin_min(kmedias.cluster_centers_, MClass)[0]]

# Re-attach an all-zero label column (negative class), one entry per selected row.
US_MClass = sparse.hstack(
    [new_Xtrain_MClass, np.zeros((new_Xtrain_MClass.shape[0], 1), dtype=np.uint16)]
).tocsr()

# Stack the reduced negatives on top of the untouched positives and cache the
# result on disk so later cells can reload it without re-clustering.
data_temp = sparse.vstack([US_MClass, positivos])
sparse.save_npz("../hufa_train_wiki_w2v/data_temp.npz", data_temp)

end = time.time()
elapsed = end - start
print("Finished, Time taken: ", elapsed, "seconds.")

In [4]:
def _label_counts(labeled):
    """Return ``(positives, negatives)`` for a sparse matrix whose last column is the label.

    A row counts as positive when its last column is non-zero; all other rows
    count as negative.
    """
    positives = labeled[:, -1].nonzero()[0].size
    return positives, labeled.shape[0] - positives


def _resample(sampler, labeled):
    """Apply ``sampler`` to ``labeled`` and return the result as CSR with the label last.

    Parameters
    ----------
    sampler : object exposing ``fit_sample(X, y)`` and returning ``(X_res, y_res)``.
    labeled : sparse matrix; every column but the last is a feature, the last is the label.

    Returns
    -------
    scipy.sparse CSR matrix: resampled features with the resampled labels
    appended as the final column.
    """
    features = labeled[:, :-1]
    # Dense 1-D label vector, as expected for the ``y`` argument.
    labels = labeled[:, -1].toarray().ravel()
    X_res, y_res = sampler.fit_sample(features, labels)
    return sparse.hstack([X_res, y_res.reshape(-1, 1)]).tocsr()


# Reuse data_temp when it is already in the kernel; otherwise reload the copy
# cached on disk so this cell can run without repeating the clustering step.
try:
    data_temp
except NameError:
    data_temp = sparse.load_npz("../hufa_train_wiki_w2v/data_temp.npz")

start = time.time()  # Start time

# Work on a float copy so data_temp itself is left untouched.
balancedData = data_temp.astype(float)

# SMOTE draws random synthetic samples; random_state is pinned so the balanced
# set is the same on every run.
smt = SMOTE(n_jobs=-1, random_state=42)
# NOTE(review): `ratio=` and `fit_sample` are kept exactly as the notebook was
# run; newer imbalanced-learn releases presumably spell these differently --
# verify before upgrading the library.
# NOTE(review): with ratio={1:0} the recorded run shows label 1 shrinking from
# 1509 to 1198 rows while label 0 stays at 10000 -- confirm that cleaning
# class 1 (rather than class 0) is intended.
enn = RepeatedEditedNearestNeighbours(ratio={1:0}, n_jobs=-1)

# Positive count before any resampling.
print(_label_counts(balancedData)[0])

# Pipeline: clean with ENN, over-sample with SMOTE, clean with ENN again.
# After each step print "[positives] to [negatives]".
for tag, sampler in (("(-):\t[", enn), ("(+):\t[", smt), ("(-):\t[", enn)):
    balancedData = _resample(sampler, balancedData)
    n_pos, n_neg = _label_counts(balancedData)
    print(tag, n_pos, "] to [", n_neg, "]")

# Value range of the final matrix (label column included).
print("maxvalue: ", balancedData.max(), "minvalue: ", balancedData.min())

end = time.time()
elapsed = end - start
print("Finished, Time taken: ", elapsed, "seconds.") #1333

1509
(-):	[ 1198 ] to [ 10000 ]
(+):	[ 10000 ] to [ 10000 ]
(-):	[ 10000 ] to [ 10000 ]
maxvalue:  5181.0 minvalue:  0.0
Finished, Time taken:  18.05573582649231 seconds.


In [3]:
# Write the balanced matrix (features plus trailing label column) to disk,
# announcing the start and the end of the write.
print("Saving balanced instances...")
path_balanced = "../hufa_train_wiki_w2v/train_9.npz"
sparse.save_npz(file=path_balanced, matrix=balancedData)
print("Finished")

Saving balanced instances...
Finished
