In [1]:
import pandas as pd
import numpy as np

# Integrantes

In [2]:
members = {
    'RM': ['552226', '98676', '551905', '550782'],
    'Nome': ['Bruno Francisco Brito de Paula', 'Edward de Lima Silva',
             'Gabriel Barroso de Assis França','Kayque Lima Nunes'],
}
group = pd.DataFrame(members)
group.set_index('RM', inplace=True)
group['Turma'] = '2TDSPW'
group

Unnamed: 0_level_0,Nome,Turma
RM,Unnamed: 1_level_1,Unnamed: 2_level_1
552226,Bruno Francisco Brito de Paula,2TDSPW
98676,Edward de Lima Silva,2TDSPW
551905,Gabriel Barroso de Assis França,2TDSPW
550782,Kayque Lima Nunes,2TDSPW


# Nasa - Nearest Earth Objects Dataset

Disponível em: https://www.kaggle.com/datasets/sameepvani/nasa-nearest-earth-objects/data.

O dataset conta com vários tipos de dados sobre objetos próximos a terra, e foi coletado diretamente de uma API da NASA.

## Colunas

O dataset conta com as colunas ID, nome, diâmetro mínimo estimado em km, diâmetro máximo estimado em km, velocidade relativa a Terra, distância em km não percorridos, corpo orbitado, presente no sentry (sistema automatizado de monitoramente de colisão da NASA), magnitude absoluta e nível de ameaça.

## Análise Estatística Exploratória

In [12]:
neo = pd.read_csv('/content/neo.csv')
neo.head()

Unnamed: 0,id,name,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbiting_body,sentry_object,absolute_magnitude,hazardous
0,2162635,162635 (2000 SS164),1.198271,2.679415,13569.249224,54839740.0,Earth,False,16.73,False
1,2277475,277475 (2005 WK4),0.2658,0.594347,73588.726663,61438130.0,Earth,False,20.0,True
2,2512244,512244 (2015 YE18),0.72203,1.614507,114258.692129,49798720.0,Earth,False,17.83,False
3,3596030,(2012 BV13),0.096506,0.215794,24764.303138,25434970.0,Earth,False,22.2,False
4,3667127,(2014 GE35),0.255009,0.570217,42737.733765,46275570.0,Earth,False,20.09,True


In [13]:
neo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90836 entries, 0 to 90835
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  90836 non-null  int64  
 1   name                90836 non-null  object 
 2   est_diameter_min    90836 non-null  float64
 3   est_diameter_max    90836 non-null  float64
 4   relative_velocity   90836 non-null  float64
 5   miss_distance       90836 non-null  float64
 6   orbiting_body       90836 non-null  object 
 7   sentry_object       90836 non-null  bool   
 8   absolute_magnitude  90836 non-null  float64
 9   hazardous           90836 non-null  bool   
dtypes: bool(2), float64(5), int64(1), object(2)
memory usage: 5.7+ MB


## Preparação dos Dados

In [15]:
X = neo.drop(['name', 'hazardous'], axis=1)

In [30]:
y = neo['hazardous']

In [16]:
# prompt: create label encoders, individually, for each categorial column in the dataset. after that, apply standard scaler to the dataset

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Create label encoders for each categorical column
lb = LabelEncoder()

# Apply label encoders to each categorical column
X['orbiting_body'] = lb.fit_transform(X['orbiting_body'])

# Apply standard scaler to the dataset
scaler = StandardScaler()
X = scaler.fit_transform(X)
X


array([[-0.58548771,  3.58728394,  3.58728394, ...,  0.        ,
         0.        , -2.34863167],
       [-0.57998558,  0.46352912,  0.46352912, ...,  0.        ,
         0.        , -1.21873486],
       [-0.56873749,  1.9918872 ,  1.9918872 , ...,  0.        ,
         0.        , -1.96854406],
       ...,
       [ 1.89893188, -0.31984203, -0.31984203, ...,  0.        ,
         0.        ,  0.37072244],
       [ 1.90365704, -0.4023702 , -0.4023702 , ...,  0.        ,
         0.        ,  1.47643186],
       [ 1.90795099, -0.29335902, -0.29335902, ...,  0.        ,
         0.        ,  0.20486602]])

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

## Aplicando PCA

In [32]:
from sklearn.decomposition import PCA
p = 5
pca = PCA(n_components=p)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

X_train_pca.shape, X_train.shape, X_test_pca.shape, X_test.shape

((72668, 5), (72668, 8), (18168, 5), (18168, 8))

In [21]:
total_variance = pca.explained_variance_ratio_.sum()
print(f'O modelo PCA com {p} variáveis explica{100*(total_variance): .2f} % dos dados de entrada')

O modelo PCA com 5 variáveis explica 100.00 % dos dados de entrada


## Random Forest

Criando modelo.

### Dados iniciais

In [24]:
# prompt: create a random forest algorithm based model using X_train and Y_train and show the models accuracy

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score
print(f'Acurácia do modelo: {accuracy_score(y_test, y_pred): .2f}')


Acurácia do modelo:  0.95


### Dados após aplicação do PCA

In [34]:
rf2 = RandomForestClassifier(n_estimators=40, random_state=0)
rf2.fit(X_train_pca, y_train)

y_pred = rf2.predict(X_test_pca)

print(f'Acurácia do modelo: {accuracy_score(y_test, y_pred): .2f}')

Acurácia do modelo:  0.91


### Avaliação

Após a redução da dimensionalidade dos dados, de oito colunas para cinco, houve uma perca de 0.04 de acurácia. Isso significa que houve perca considerável de informação no processo, que não foi favorável neste caso.