In [22]:
# Importação de pacotes necessários
import pandas as pd
import numpy as np
import os
import re

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Pacotes de texto
from sklearn.feature_extraction.text import TfidfVectorizer


In [23]:
# 2) Load the competition datasets (the paths point at the Kaggle input directory)

# Dataset locations -- adjust if the notebook runs outside Kaggle
train_path = "/kaggle/input/campeonato-inteli-modulo3-2025/train.csv"
test_path  = "/kaggle/input/campeonato-inteli-modulo3-2025/test.csv"

# Read both CSV files, train first and test second
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the (rows, columns) of each frame
print(f"Train shape: {df_train.shape}")
print(f"Test shape : {df_test.shape}")



Train shape: (646, 33)
Test shape : (277, 32)


In [24]:
# 3) Configuration variables

# Name of the target column
TARGET = "labels"

# Identifier column: "id" when the training frame has it, otherwise None
ID_COL = None
if "id" in df_train.columns:
    ID_COL = "id"

print(f"Target: {TARGET} | ID: {ID_COL}")


Target: labels | ID: id


In [25]:
# Numeric feature columns: int64/float64 dtypes, minus the target and the id
non_feature_cols = {TARGET, ID_COL}
num_cols = [
    col
    for col in df_train.select_dtypes(include=['int64', 'float64']).columns
    if col not in non_feature_cols
]

# Text columns: object-dtype columns in which more than 90% of the values
# are Python strings
cat_cols = list(df_train.select_dtypes(include=['object']).columns)
text_cols = []
for col in cat_cols:
    share_of_strings = df_train[col].apply(lambda value: isinstance(value, str)).mean()
    if share_of_strings > 0.9:
        text_cols.append(col)

print("Numéricas:", num_cols)
print("Texto   :", text_cols)


Numéricas: ['age_first_funding_year', 'age_last_funding_year', 'age_first_milestone_year', 'age_last_milestone_year', 'relationships', 'funding_rounds', 'funding_total_usd', 'milestones', 'is_CA', 'is_NY', 'is_MA', 'is_TX', 'is_otherstate', 'is_software', 'is_web', 'is_mobile', 'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting', 'is_othercategory', 'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC', 'has_roundD', 'avg_participants']
Texto   : ['category_code']


In [26]:
# Fill missing numeric values with the per-column median; the medians are
# learned from the training frame only and then reused for the test frame
imputer = SimpleImputer(strategy="median").fit(df_train[num_cols])
df_train[num_cols] = imputer.transform(df_train[num_cols])
df_test[num_cols]  = imputer.transform(df_test[num_cols])

# Missing text becomes an empty string in both frames
for frame in (df_train, df_test):
    for text_column in text_cols:
        frame[text_column] = frame[text_column].fillna("")


In [27]:
# Build features from the first detected text column.
# Primary path: TF-IDF weights (at most 500 terms, English stop words removed),
# one `text_<i>` column per vocabulary term.
# Fallback path: four simple per-row statistics, used only when TF-IDF fails.
# `text_features` ends up holding the names of the columns that were created.
text_features = []

if text_cols:
    text_col = text_cols[0]
    try:
        tfv = TfidfVectorizer(max_features=500, stop_words="english")
        # NOTE(review): the vocabulary/IDF is fitted on train + test text together
        # (no labels involved); fit on df_train only if strict separation is wanted.
        tfv.fit(pd.concat([df_train[text_col], df_test[text_col]]))
        X_train_text = tfv.transform(df_train[text_col]).toarray()
        X_test_text  = tfv.transform(df_test[text_col]).toarray()
    except Exception as exc:
        # Catch Exception (not a bare `except:`) so KeyboardInterrupt/SystemExit
        # still propagate, and report why the fallback is being used.
        print(f"TF-IDF falhou ({type(exc).__name__}: {exc}); usando estatísticas simples de texto.")

        # Each statistic is defined once and applied to both frames.
        fallback_stats = {
            'text_len': lambda x: len(str(x)),
            'word_count': lambda x: len(str(x).split()),
            'unique_words': lambda x: len(set(str(x).split())),
            'avg_word_len': lambda x: np.mean([len(w) for w in str(x).split()]) if str(x).split() else 0,
        }
        for colname, stat in fallback_stats.items():
            df_train[colname] = df_train[text_col].apply(stat)
            df_test[colname]  = df_test[text_col].apply(stat)
        text_features = list(fallback_stats)
    else:
        # Vectorisation succeeded: copy every TF-IDF column into both frames.
        for i in range(X_train_text.shape[1]):
            colname = f"text_{i}"
            df_train[colname] = X_train_text[:, i]
            df_test[colname]  = X_test_text[:, i]
            text_features.append(colname)


In [28]:
# Final feature list: numeric columns followed by the generated text columns
features_final = [*num_cols, *text_features]

# Binary target: 1 where the label is greater than 0, otherwise 0
y = df_train[TARGET].gt(0).astype(int)

# Standardise every feature; the scaler statistics come from the training rows
# only and are then applied to both matrices
scaler = StandardScaler().fit(df_train[features_final])
X_train = scaler.transform(df_train[features_final])
X_test  = scaler.transform(df_test[features_final])


In [29]:
# 5-fold stratified cross-validation with a fixed shuffle seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold probabilities for the training rows and the fold-averaged
# probabilities for the test rows
oof_preds = np.zeros(len(y))
test_preds = np.zeros(len(df_test))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y)):
    print(f"Fold {fold+1}")
    X_tr, y_tr = X_train[tr_idx], y.iloc[tr_idx]
    X_val, y_val = X_train[val_idx], y.iloc[val_idx]

    # A fresh forest is trained on each fold's training part
    model = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42).fit(X_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class
    val_proba = model.predict_proba(X_val)
    test_proba = model.predict_proba(X_test)
    oof_preds[val_idx] = val_proba[:, 1]
    test_preds += test_proba[:, 1] / skf.n_splits

# AUC computed over the out-of-fold predictions of all training rows
auc = roc_auc_score(y, oof_preds)
print("OOF AUC:", auc)


Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
OOF AUC: 0.8120855368085286


In [30]:
# Default decision threshold of 0.5: probabilities at or above it become class 1
pred_labels = np.where(test_preds >= 0.5, 1, 0)


In [31]:
# Build and save the submission file; the test frame must contain the id column
if ID_COL in df_test.columns:
    # Two columns: the test ids and the predicted 0/1 labels
    submission = pd.DataFrame({"id": df_test[ID_COL]})
    submission["labels"] = pred_labels

    submission.to_csv("submission.csv", index=False)
    print(submission.head())
else:
    raise ValueError(f"Coluna ID '{ID_COL}' não encontrada no dataset de teste.")


    id  labels
0   70       1
1   23       0
2  389       1
3  872       1
4  920       1


In [32]:
# This cell used to be an exact copy of the previous one (it rebuilt and rewrote
# the same submission). It now reads the saved file back and checks its
# structure, so a malformed submission is caught before it is uploaded.
submission_check = pd.read_csv("submission.csv")

# Expected layout: exactly the columns "id" and "labels", one row per test
# example, and only 0/1 values in the label column.
assert list(submission_check.columns) == ["id", "labels"], "unexpected submission columns"
assert len(submission_check) == len(df_test), "submission row count differs from the test set"
assert submission_check["labels"].isin([0, 1]).all(), "labels must be 0 or 1"

print(submission_check.head())


    id  labels
0   70       1
1   23       0
2  389       1
3  872       1
4  920       1
