In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score


data = pd.read_csv('train.csv')
print(data.head())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [2]:
def fill_missing_values(df: pd.DataFrame):
    for column in df.columns:
        if column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
            df[column] = df[column].fillna(df[column].mean())
        elif column in ['HomePlanet', 'Cabin', 'Destination', 'Name', 'PassengerId']:
            df[column] = df[column].astype('object')
            df[column] = df[column].fillna(df[column].mode()[0])
        elif column in ['VIP', 'Transported', 'CryoSleep']:
            df[column] = df[column].fillna(df[column].mode()[0])
    return df

In [3]:
#Limpeza para valores null dentre as colunas, tomei a liberdade de 
#definir o tipo para cada coluna no método anterior para facilitar
data = fill_missing_values(df=data)

# Inicializar o LabelEncoder
le = LabelEncoder()
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
for col in categorical_cols:
    data[col] = le.fit_transform(data[col].astype(str))

  df[column] = df[column].fillna(df[column].mode()[0])
  df[column] = df[column].fillna(df[column].mode()[0])


In [4]:
# Separar características e variável-alvo nos dados de treino
X_train = data.drop(['PassengerId', 'Transported', 'Name'], axis=1)
y_train = data['Transported'].astype(bool)

X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_part, y_train_part)

y_val_pred = clf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.2f}')

Validation Accuracy: 0.78


In [5]:
# Se quiser visualizar melhor a precisão do modelo
# Tem esse método que estruturei aqui

# def evaluate_results(y_test, y_predict):
#     print('Resultados da Classificação:')
#     f1 = f1_score(y_test, y_predict, average='weighted', zero_division=1)
#     print("f1: %.2f%%" % (f1 * 100.0)) 
#     roc = roc_auc_score(y_test, y_predict, average='weighted')
#     print("roc: %.2f%%" % (roc * 100.0)) 
#     rec = recall_score(y_test, y_predict, average='weighted', zero_division=1)
#     print("recall: %.2f%%" % (rec * 100.0)) 
#     prc = precision_score(y_test, y_predict, average='weighted', zero_division=1)
#     print("precisão: %.2f%%" % (prc * 100.0)) 


# evaluate_results(y_val, y_val_pred)

Resultados da Classificação:
f1: 77.51%
roc: 77.53%
recall: 77.52%
precisão: 77.55%


In [6]:
# Caso desejar buscar entre diversos parâmetros o que seria o ideal, pode descomentar a baixo e rodar
# Possível que alguns jobs dêem problema pois é algo que utiliza mt ram e processamento, então qualquer coisa colocar
# Esse bloco aqui em um multiprocessing seria o ideal, para utilizar melhor todos os núcleos no computador que rode

# param_grid = {
#     'n_estimators': [50, 100, 150], 
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']  
# }

#
# rf = RandomForestClassifier(random_state=42)

# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# grid_search.fit(X_train_part, y_train_part)

# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_

# y_val_pred = best_model.predict(X_val)
# val_accuracy = accuracy_score(y_val, y_val_pred)
# print(f'Validation Accuracy with Best Model: {val_accuracy:.2f}')
# print(f'Best Parameters: {best_params}')

In [7]:
# Dados de teste
test_data = pd.read_csv('test.csv')

test_data = fill_missing_values(test_data)
for col in categorical_cols:
    test_data[col] = le.fit_transform(test_data[col].astype(str))

X_test = test_data.drop(['PassengerId', 'Name'], axis=1)

predictions = []
passenger_ids = test_data['PassengerId']

test_predictions = clf.predict(X_test)
# Como no repositório modelo do teste, tinha apenas o output relativo a positivo ou falso,
# tomei a liberdade de adicionar 2 colunas com a porcentagem de probabilidade
# para cada uma das labels treinadas (positivo e negativo)
test_probabilities = clf.predict_proba(X_test)

submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Transported': test_predictions,
    'Probability_Positive': test_probabilities[:, 1],
    'Probability_Negative': test_probabilities[:, 0]
})

submission.to_csv('submission.csv', index=False)

  df[column] = df[column].fillna(df[column].mode()[0])
  df[column] = df[column].fillna(df[column].mode()[0])


In [None]:
# Caso deseje salvar o encoder e o modelo para posteriormente carregar ele em novos dados ou em um outro ambiente:

joblib.dump(clf, 'best_model.joblib')

# Salvar os LabelEncoders
joblib.dump(le, 'label_encoders.joblib')