In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn import linear_model, tree, ensemble
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
csv_path = "data_tg_dupl_rem.csv"
dataset = pd.read_csv(csv_path)

In [3]:
# Show the loaded frame as the cell output for a quick visual check.
dataset

Unnamed: 0,Li,Be,B,O,Na,Mg,Al,Si,P,K,...,Hf,Ta,W,Hg,Tl,Pb,Bi,Th,U,Tg
0,0.000000,0.0,0.0,0.631579,0.000000,0.000000,0.035088,0.245614,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,951.15
1,0.000000,0.0,0.0,0.627119,0.000000,0.000000,0.067797,0.220339,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,966.15
2,0.000000,0.0,0.0,0.622951,0.000000,0.000000,0.098361,0.196721,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,983.15
3,0.000000,0.0,0.0,0.619048,0.000000,0.000000,0.126984,0.174603,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1012.15
4,0.000000,0.0,0.0,0.589063,0.134375,0.000000,0.062500,0.214062,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,898.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43636,0.000000,0.0,0.0,0.666667,0.000000,0.055556,0.000000,0.000000,0.222222,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,798.65
43637,0.125000,0.0,0.0,0.625000,0.000000,0.000000,0.000000,0.000000,0.208333,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,610.65
43638,0.190476,0.0,0.0,0.619048,0.000000,0.000000,0.000000,0.000000,0.095238,0.0,...,0.0,0.0,0.095238,0.0,0.0,0.0,0.0,0.0,0.0,709.15
43639,0.000000,0.0,0.0,0.600000,0.000000,0.000000,0.000000,0.266667,0.000000,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,767.15


# Pré-processamento dos dados

In [4]:
# Feature table: every column of `dataset` except the target column 'Tg'.
dados_dim65 = dataset.drop(columns=['Tg'])

# UMAP

In [5]:
# UMAP reducer that projects the feature table down to 10 components.
# UMAP is stochastic: without a fixed seed the embedding (and every score
# computed from it further down) changes on each run. Fixing `random_state`
# makes a "Restart & Run All" reproducible.
# NOTE(review): per the UMAP documentation a fixed seed trades some
# parallelism for determinism — expect a somewhat slower fit.
reducer_umap = umap.UMAP(n_components=10, random_state=42)

In [6]:
# Fit the UMAP reducer on the feature table and keep the embedded coordinates.
# NOTE(review): the embedding is fitted on all rows, before any K-fold split —
# confirm that sharing one embedding across train and test folds is intended.
dim10_umap = reducer_umap.fit_transform(dados_dim65)
# Echo the embedding's shape as the cell output.
dim10_umap.shape

(43641, 10)

In [7]:
# Wrap the UMAP embedding in a DataFrame (default integer row/column labels).
dim10_df = pd.DataFrame(data=dim10_umap)

In [8]:
# Feature matrix: the UMAP embedding as a plain NumPy array.
X = dim10_df.to_numpy()
# Target vector: select 'Tg' by name — the same label that is dropped when the
# feature table is built — instead of relying on it being the last column.
y = dataset['Tg'].to_numpy()

# Quick visual check of both arrays and their shapes.
print("Matrix of features", X, sep='\n')
print("--------------------------------------------------")
print("Target Variable", y, sep='\n')
print(X.shape)
print(y.shape)

Matrix of features
[[ 6.7823405   4.459136    1.5902648  ...  5.7386994   4.0928884
   6.7987814 ]
 [ 6.782222    4.457295    1.5794584  ...  5.7409267   4.0873046
   6.8027577 ]
 [ 6.8643665   4.38929     1.4882108  ...  5.708819    4.013172
   6.8805265 ]
 ...
 [15.409172    4.714055    3.0222237  ...  2.0647292   1.8566066
   3.4831178 ]
 [ 7.6546245   5.61576     0.25902548 ...  3.5280495   6.798537
   4.441728  ]
 [ 7.22741     5.279834    0.51183945 ...  4.272588    6.487992
   5.060028  ]]
--------------------------------------------------
Target Variable
[951.15 966.15 983.15 ... 709.15 767.15 768.15]
(43641, 10)
(43641,)


In [14]:
# 10-fold splitter; rows are shuffled with a fixed seed so the folds are the
# same every time the splitter is iterated.
kf = KFold(n_splits=10, shuffle=True, random_state=1)
# Cell output: number of splitting iterations the splitter will produce.
kf.get_n_splits(X)

10

In [15]:
# List the train/test row indices of every fold produced by the splitter.
for i, (train_index, test_index) in enumerate(kf.split(X)):
    print(
        f"Fold {i}:",
        f" Train: index={train_index}",
        f" Test: index={test_index}",
        sep="\n",
    )

Fold 0:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[   10    40    44 ... 43594 43607 43626]
Fold 1:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[    3     8    23 ... 43596 43598 43599]
Fold 2:
 Train: index=[    0     1     3 ... 43638 43639 43640]
 Test: index=[    2    11    19 ... 43628 43630 43636]
Fold 3:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[   21    26    36 ... 43609 43616 43622]
Fold 4:
 Train: index=[    1     2     3 ... 43638 43639 43640]
 Test: index=[    0    13    29 ... 43615 43619 43620]
Fold 5:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[    4     6     7 ... 43624 43632 43634]
Fold 6:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[    5    12    49 ... 43610 43631 43635]
Fold 7:
 Train: index=[    0     2     3 ... 43636 43639 43640]
 Test: index=[    1    75    80 ... 43618 43637 43638]
Fold 8:
 Train: index=[    0     1     2 ... 436

In [16]:
def rmse(score):
    """Print the square root of the negated ``score`` with two decimals.

    ``score`` is negated before the root is taken, so a negative value
    (such as a negated mean squared error) is expected.

    Nothing is returned; the result is only printed as ``rmse= <value>``.
    """
    root = np.sqrt(-score)
    print(f'rmse= {root:.2f}')

In [17]:
# Cross-validate a random forest regressor on the embedded features using the
# folds from `kf`; each fold is scored with negated mean squared error.
rf_model = ensemble.RandomForestRegressor(random_state=42)
score = cross_val_score(
    rf_model,
    X,
    y,
    cv=kf,
    scoring="neg_mean_squared_error",
)
print(f'Scores for each fold are: {score}')
# Report the root of the (negated) mean fold score.
rmse(score.mean())

Scores for each fold are: [-1365.12945784 -1546.14826948 -1571.57584756 -1517.30256386
 -1574.69891119 -1709.13568606 -1336.08507437 -1442.5615978
 -1659.33101478 -1427.19690866]
rmse= 38.92


# HDBSCAN - DEPOIS DE FAZER O KFOLD

In [None]:
import hdbscan
import math

In [24]:
# Cluster a single cross-validation fold: fit HDBSCAN on that fold's training
# rows, then assign its held-out rows to the learned clusters.
fold_id = 1  # fold under inspection

# Pick the fold directly instead of looping over every fold with an `if`.
# With the loop, `train_index` / `test_index` were left pointing at the LAST
# fold once the loop finished, while `test_labels` / `strengths` belonged to
# fold 1. `kf` shuffles with a fixed seed, so the folds are identical on every
# call to `split`.
train_index, test_index = list(kf.split(X))[fold_id]

print(f"Fold {fold_id}:")
print(f" Train: index={train_index}")
print(f" Test: index={test_index}")

# `prediction_data=True` is what allows `approximate_predict` to be called on
# the fitted clusterer afterwards.
cluster = hdbscan.HDBSCAN(min_cluster_size=160, min_samples=80, prediction_data=True)
cluster.fit(X[train_index])

# Labels and membership strengths for the held-out rows of this fold.
test_labels, strengths = hdbscan.approximate_predict(cluster, X[test_index])

Fold 1:
 Train: index=[    0     1     2 ... 43638 43639 43640]
 Test: index=[    3     8    23 ... 43596 43598 43599]


In [17]:
# Summarise the predicted labels: the labels themselves, how many there are,
# and the largest label value.
print(test_labels, len(test_labels), max(test_labels), sep="\n")

[ 1  1  1 ...  2  2 -1]
4364
2
