In [None]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, classification_report

# 1 Carregando os dados já preparados dos candidatos

In [None]:
# Location of the prepared candidates dataset.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
caminho_dados = 'C:/excel/Candidatos/canditatosfinal.csv'
arquivo = pd.read_csv(caminho_dados)

In [None]:
# Preview the first rows of the loaded DataFrame; as the last expression of the
# cell, the result is rendered by the notebook.
arquivo.head()

In [None]:
# Print the summary produced by DataFrame.info() for the loaded data.
arquivo.info()

# 2 Separando a variável alvo (y) das variáveis preditoras (x)

In [None]:
# 'aprovado_vaga' is the target column; every other column is kept as a feature.
coluna_alvo = 'aprovado_vaga'
x = arquivo.drop(columns=[coluna_alvo])
y = arquivo[coluna_alvo]

# 3 Criando os conjuntos de dados de treino e teste

In [None]:
# Hold out 30% of the rows for testing.
# A fixed random_state makes the split reproducible across kernel restarts, and
# stratify=y keeps the class proportions of the target in both partitions; both
# settings match the split performed later in the model-comparison cell.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# 4 Comparing candidate classification models

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (f1_score, make_scorer, classification_report,
                             precision_score, recall_score)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Models to be compared (XGBoost and LightGBM are deliberately left out)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Split BEFORE any preprocessing so that the imputation medians and the scaling
# statistics are learned from the training rows only (no test-set leakage).
# `x` itself is left untouched, which keeps this cell safe to re-run.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

# Preprocessed arrays consumed by the later cells (grid search, final model).
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
x_train = scaler.fit_transform(imputer.fit_transform(x_train_raw))
x_test = scaler.transform(imputer.transform(x_test_raw))

# Column names aligned with the preprocessed arrays. SimpleImputer discards
# columns whose fitted statistic is NaN (all values missing), so skip those.
features = [col for col, stat in zip(x.columns, imputer.statistics_)
            if not np.isnan(stat)]

# Candidate models
models = [
    ('Logistic Regression', LogisticRegression(class_weight='balanced', max_iter=1000)),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(class_weight='balanced')),
    ('Random Forest', RandomForestClassifier(class_weight='balanced', random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('SVM', SVC(class_weight='balanced', probability=True, random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('AdaBoost', AdaBoostClassifier(random_state=42)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('QDA', QuadraticDiscriminantAnalysis())
]

# Evaluate every model. A failure in one model is reported and skipped so the
# remaining candidates are still compared.
results = []
fitted_models = {}
for name, model in models:
    try:
        # Preprocessing lives inside the pipeline so that each CV fold fits the
        # imputer/scaler on its own training part only.
        pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('model', model)
        ])

        # 5-fold cross-validation on the training split, scored with F1
        cv_scores = cross_val_score(pipeline, x_train_raw, y_train,
                                    cv=5, scoring=make_scorer(f1_score))

        # Fit on the whole training split and score on the held-out test split
        pipeline.fit(x_train_raw, y_train)
        y_pred = pipeline.predict(x_test_raw)
        test_f1 = f1_score(y_test, y_pred)

        # precision_score/recall_score use the same positive-label convention as
        # f1_score, so they do not depend on how the label is rendered as a
        # string key in classification_report.
        results.append({
            'Model': name,
            'CV Mean F1': np.mean(cv_scores),
            'CV Std F1': np.std(cv_scores),
            'Test F1': test_f1,
            'Test Precision': precision_score(y_test, y_pred),
            'Test Recall': recall_score(y_test, y_pred)
        })
        # Pipeline fits its steps in place, so this is the fitted estimator.
        fitted_models[name] = pipeline.named_steps['model']

        print(f"{name} - Test F1: {test_f1:.4f}")

    except Exception as e:
        print(f"Erro no modelo {name}: {str(e)}")

# Collect the results in a DataFrame
results_df = pd.DataFrame(results)

if results_df.empty:
    # Sorting an empty, column-less frame by 'Test F1' would raise KeyError.
    print("\nNenhum modelo foi avaliado com sucesso.")
else:
    print("\nResultados completos dos modelos (Ordenados por Test F1):")
    print(results_df.sort_values('Test F1', ascending=False).to_string(index=False))

    # Extra: feature importance of the best model.
    # NOTE(review): the "best" model is picked by its test-set F1, which makes
    # the reported test score optimistic; 'CV Mean F1' would be an unbiased
    # selection criterion.
    best_model_name = results_df.loc[results_df['Test F1'].idxmax(), 'Model']
    best_model = fitted_models[best_model_name]

    print(f"\nFeature Importance do melhor modelo ({best_model_name}):")

    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})
        print(feature_importance.sort_values('Importance', ascending=False))
    elif hasattr(best_model, 'coef_'):
        # Linear models expose a 2-D coef_; take its first row.
        coefs = best_model.coef_[0] if best_model.coef_.ndim > 1 else best_model.coef_
        feature_importance = pd.DataFrame({'Feature': features, 'Coefficient': coefs})
        print(feature_importance.sort_values('Coefficient', ascending=False))
    else:
        print("Este modelo não fornece feature importance direta.")

# 5 Analisando os melhores parâmetros para o modelo de Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Values shared by every sub-grid below.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
class_weights = [None, 'balanced']

# Parameters to search. Not every solver supports every penalty, so the grid is
# a list of sub-grids containing only valid combinations; a single flat grid
# spends most of its fits on combinations that raise and are scored as NaN.
param_grid = [
    # newton-cg, lbfgs and sag only support the l2 penalty (or no penalty)
    {'solver': ['newton-cg', 'lbfgs', 'sag'], 'penalty': ['l2'],
     'C': c_values, 'class_weight': class_weights},
    # liblinear and saga support both l1 and l2
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'],
     'C': c_values, 'class_weight': class_weights},
    # elasticnet is only available with saga and requires l1_ratio
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.25, 0.5, 0.75],
     'C': c_values, 'class_weight': class_weights},
    # without a penalty C has no effect, so it is not part of this sub-grid
    {'solver': ['lbfgs'], 'penalty': [None], 'class_weight': class_weights},
]

# Base model. max_iter is an optimisation budget rather than a hyperparameter,
# so it is fixed at a value high enough to converge instead of being searched;
# random_state makes the stochastic solvers (sag, saga, liblinear) reproducible.
model = LogisticRegression(max_iter=1000, random_state=42)

# Grid search configuration. F1 is used as the selection metric to stay
# consistent with the model comparison above; with class imbalance, plain
# accuracy can favour a model that only predicts the majority class.
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1',
                           n_jobs=-1)

# Run the search on the training split
grid_search.fit(x_train, y_train)

# Best parameters found
print("Melhores parâmetros:", grid_search.best_params_)
print("Melhor score:", grid_search.best_score_)

# 6 Criando o modelo de Logistic Regression com os melhores parâmetros encontrados

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Take the winning configuration straight from the grid search instead of
# copying values by hand, so this cell cannot drift out of sync with the
# search when the data or the grid change. get_params() returns the full
# constructor configuration of the best estimator, including any setting that
# was fixed on the base model rather than searched.
best_params = grid_search.best_estimator_.get_params()
best_params['random_state'] = 42  # for reproducibility

# Instantiate the model
log_reg = LogisticRegression(**best_params)

# Train the model on the training split
log_reg.fit(x_train, y_train)

# 7 Testando o modelo e visualizando a acurácia

In [None]:
# Predict on the held-out split, then report the accuracy with four decimals.
y_pred = log_reg.predict(x_test)
acuracia = accuracy_score(y_test, y_pred)
print(f"Acurácia: {acuracia:.4f}")