In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

from RandomForestClassifierGridSearch import RandomForestClassifierGridSearch
import pickle
pd.set_option('display.max_rows', None)

In [2]:
df = pd.read_parquet('../data/processed/scores_approvals.parquet')
df.shape

(7263, 39)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7263 entries, 0 to 7262
Data columns (total 39 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   numero_inscricao                  7263 non-null   object  
 1   nome_x                            7263 non-null   object  
 2   escore_bruto_p1_etapa1            7263 non-null   float64 
 3   escore_bruto_p2_etapa1            7263 non-null   float64 
 4   nota_redacao_etapa1               7263 non-null   float64 
 5   escore_bruto_p1_etapa2            7263 non-null   float64 
 6   escore_bruto_p2_etapa2            7263 non-null   float64 
 7   nota_redacao_etapa2               7263 non-null   float64 
 8   escore_bruto_p1_etapa3            7263 non-null   float64 
 9   escore_bruto_p2_etapa3            7263 non-null   float64 
 10  nota_redacao_etapa3               7263 non-null   float64 
 11  argumento_final                   7263 non-null   float6

In [4]:
FEATURES = ['escore_bruto_p1_etapa1',
            'escore_bruto_p2_etapa1', 'nota_redacao_etapa1',
            'escore_bruto_p1_etapa2', 'escore_bruto_p2_etapa2',
            'nota_redacao_etapa2', 'escore_bruto_p1_etapa3',
            'escore_bruto_p2_etapa3', 'nota_redacao_etapa3',
            'cotista', 'cotas_negros_flag', 'publicas1_flag',
            'publicas2_flag', 'publicas3_flag', 'publicas4_flag', 'publicas5_flag',
            'publicas6_flag', 'publicas7_flag', 'publicas8_flag']

In [5]:
course_dummies = pd.get_dummies(df.course)
course_dummies_columns = list(course_dummies.columns)
df = pd.concat([df, course_dummies], axis=1)

In [6]:
FEATURES.extend(course_dummies_columns)

In [7]:
len(FEATURES)

110

## Baseline Model

In [8]:
# publicas_flags = [
#  'publicas1_flag',
#  'publicas2_flag',
#  'publicas3_flag',
#  'publicas4_flag',
#  'publicas5_flag',
#  'publicas6_flag',
#  'publicas7_flag',
#  'publicas8_flag']
# df['publicas_flag'] = df[publicas_flags].sum(axis=1)

In [9]:
# FEATURES = [col for col in FEATURES if col not in publicas_flags] 
# print(len(FEATURES))

In [10]:
X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

In [11]:
# Create a Random Forest Classifier
model = RandomForestClassifier(random_state=47)

# Fit randomized search
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.874741913282863


{'0': {'precision': 0.8793859649122807,
  'recall': 0.9860655737704918,
  'f1-score': 0.92967542503864,
  'support': 1220},
 '1': {'precision': 0.8,
  'recall': 0.2918454935622318,
  'f1-score': 0.42767295597484284,
  'support': 233},
 'accuracy': 0.874741913282863,
 'macro avg': {'precision': 0.8396929824561403,
  'recall': 0.6389555336663617,
  'f1-score': 0.6786741905067414,
  'support': 1453},
 'weighted avg': {'precision': 0.8666557998575242,
  'recall': 0.874741913282863,
  'f1-score': 0.8491753732204261,
  'support': 1453}}

In [12]:
 # Predict diabetes
predictions = model.predict_proba(X_test)
approval_prob = round(predictions[0][1], ndigits=3)

In [13]:
predictions[:, 1]

array([0.02, 0.38, 0.09, ..., 0.25, 0.04, 0.1 ])

In [14]:
predictions

array([[0.98, 0.02],
       [0.62, 0.38],
       [0.91, 0.09],
       ...,
       [0.75, 0.25],
       [0.96, 0.04],
       [0.9 , 0.1 ]])

## Hyperparameter tuning + class_weight

In [15]:
param_grid = {
    'n_estimators': [200, 300, 500],      # Number of trees in the forest
    'max_depth': [5, 10, 30],            # Maximum depth of each tree
    'min_samples_split': [5, 10, 30],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [8, 16, 32]          # Minimum number of samples required to be at a leaf node
}

In [16]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.79      0.86      1220
           1       0.40      0.75      0.52       233

    accuracy                           0.78      1453
   macro avg       0.67      0.77      0.69      1453
weighted avg       0.86      0.78      0.80      1453



In [None]:
with open('model_flags_tuned.pickle','wb') as f:
    pickle.dump(model, f)

In [None]:
#with open('model_flags_tuned.pickle','wb') as f:
#  pickle.dump(model, f)

## Stratified KFold + Hyperameter tuning + Flags

In [None]:
#FEATURES.remove('argumento_final')
print(len(FEATURES))

X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

110


In [None]:
# # Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [5, 10, 20, 40, 80],            # Maximum depth of each tree
    'min_samples_split': [5, 10, 20, 50],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 8, 16, 32],          # Minimum number of samples required to be at a leaf node
}

In [None]:
X_train.shape

(5810, 110)

In [None]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1220
           1       0.57      0.64      0.60       233

    accuracy                           0.86      1453
   macro avg       0.75      0.77      0.76      1453
weighted avg       0.87      0.86      0.87      1453



In [None]:
rf_gs.save_model('stratified_kfold_classweight15_tuned_model_spelling')

In [None]:
with open('../ml_dev/models/stratified_kfold_classweight15_tuned_model_spelling.pickle', 'rb') as f:
  model_loaded = pickle.load(f)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
y_pred = model_loaded.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [None]:
def predict_approval(model, new_data):
    
    # Predict diabetes
    predictions = model.predict_proba(new_data)
    approval_prob = round(predictions[0][1], ndigits=3)

    #pred_to_label = {0: 'Negative', 1: 'Positive'}

    # Make a list of predictions
    #data = []
    #for t, pred in zip(new_data, predictions):
    #    data.append({'prediction': pred[0]})

    return approval_prob

In [None]:
covariables = list({"escore_bruto_p1_etapa1": 5.172,
                            "escore_bruto_p2_etapa1": 14.653,
                            "nota_redacao_etapa1": 6.947,
                            "escore_bruto_p1_etapa2": 3.845,
                            "escore_bruto_p2_etapa2": 19.994,
                            "nota_redacao_etapa2": 7.222,
                            "escore_bruto_p1_etapa3": 4.998,
                            "escore_bruto_p2_etapa3": 16.66,
                            "nota_redacao_etapa3": 8.06,
                            "argumento_final": -25.699,
                            "2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)": 1.0,               
                            "cotista": 0.0,
                            "cotas_negros_flag": 0.0,
                            "publicas1_flag": 0.0,
                            "publicas2_flag": 0.0,
                            "publicas3_flag": 0.0,
                            "publicas4_flag": 0.0,
                            "publicas5_flag": 0.0,
                            "publicas6_flag": 0.0,
                            "publicas7_flag": 0.0,
                            "publicas8_flag": 0.0}.keys())

In [None]:
from samples import sample_approved, sample_not_approved

In [None]:
{'teste':sample_not_approved.get('cotista')}

{'teste': 0.0}

In [None]:
list(sample_not_approved.keys())

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)',
 'cotista',
 'cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [None]:
new_sample = sample_not_approved
#new_sample = pd.DataFrame([{col: new_sample.get(col) for col in FEATURES}])
#new_sample = new_sample.fillna(0)

In [None]:
[True for key in new_sample.keys() if key in FEATURES else False]

SyntaxError: invalid syntax (Temp/ipykernel_17636/1785515755.py, line 1)

In [None]:
[True for key in new_sample.keys()]

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True]

In [None]:
new_sample.to_dict()

{'escore_bruto_p1_etapa1': {0: 5.172},
 'escore_bruto_p2_etapa1': {0: 14.653},
 'nota_redacao_etapa1': {0: 6.947},
 'escore_bruto_p1_etapa2': {0: 3.845},
 'escore_bruto_p2_etapa2': {0: 19.994},
 'nota_redacao_etapa2': {0: 7.222},
 'escore_bruto_p1_etapa3': {0: 4.998},
 'escore_bruto_p2_etapa3': {0: 16.66},
 'nota_redacao_etapa3': {0: 8.06},
 'AGRONOMIA (BACHARELADO)': {0: 0},
 'ARQUITETURA E URBANISMO (BACHARELADO)': {0: 0},
 'ARQUIVOLOGIA (BACHARELADO)': {0: 0},
 'ARTES CÊNICAS - INTERPRETAÇÃO TEATRAL (BACHARELADO)': {0: 0},
 'ARTES VISUAIS (BACHARELADO)': {0: 0},
 'ARTES VISUAIS (LICENCIATURA)': {0: 0},
 'BIBLIOTECONOMIA (BACHARELADO)': {0: 0},
 'BIOTECNOLOGIA (BACHARELADO)': {0: 0},
 'CAMPUS UNB CEILÂNDIA (FCE) ENFERMAGEM (BACHARELADO)': {0: 0},
 'CAMPUS UNB PLANALTINA (FUP) – DIURNO CIÊNCIAS NATURAIS (LICENCIATURA)': {0: 0},
 'CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS NATURAIS (LICENCIATURA)': {0: 0},
 'CIÊNCIA DA COMPUTAÇÃO (BACHARELADO)': {0: 0},
 'CIÊNCIA POLÍTICA (BACHAREL

In [None]:
FEATURES[33]

'DIURNO ADMINISTRAÇÃO (BACHARELADO)'

In [None]:
sample_not_approved.get(FEATURES[33])

In [None]:
predict_approval(model_loaded, new_data=new_sample)

0.104

In [None]:
X_test.columns.to_list()

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'AGRONOMIA (BACHARELADO)',
 'ARQUITETURA E URBANISMO (BACHARELADO)',
 'ARQUIVOLOGIA (BACHARELADO)',
 'ARTES CÊNICAS - INTERPRETAÇÃO TEATRAL (BACHARELADO)',
 'ARTES VISUAIS (BACHARELADO)',
 'ARTES VISUAIS (LICENCIATURA)',
 'BIBLIOTECONOMIA (BACHARELADO)',
 'BIOTECNOLOGIA (BACHARELADO)',
 'CAMPUS UNB CEILÂNDIA (FCE) ENFERMAGEM (BACHARELADO)',
 'CAMPUS UNB PLANALTINA (FUP) – DIURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CIÊNCIA DA COMPUTAÇÃO (BACHARELADO)',
 'CIÊNCIA POLÍTICA (BACHARELADO)',
 'CIÊNCIAS AMBIENTAIS (BACHARELADO)',
 'CIÊNCIAS BIOLÓGICAS (BACHARELADO)',
 'CIÊNCIAS CONTÁBEIS (BACHARELADO)',
 'CIÊNCIAS ECONÔMICAS (BACHARELADO)',
 'CIÊNCIAS SOCIAIS – ANTROPOLOGIA / SOCIOLOGIA 

In [None]:
my_sample = pd.DataFrame([X_test.iloc[0]])
#my_sample.rename_ax

In [None]:
my_sample.rename(columns = {'AGRONOMIA (BACHARELADO)': 'AGRONOMIA 123 (BACHARELADO)'})

Unnamed: 0,escore_bruto_p1_etapa1,escore_bruto_p2_etapa1,nota_redacao_etapa1,escore_bruto_p1_etapa2,escore_bruto_p2_etapa2,nota_redacao_etapa2,escore_bruto_p1_etapa3,escore_bruto_p2_etapa3,nota_redacao_etapa3,AGRONOMIA 123 (BACHARELADO),...,cotista,cotas_negros_flag,publicas1_flag,publicas2_flag,publicas3_flag,publicas4_flag,publicas5_flag,publicas6_flag,publicas7_flag,publicas8_flag
510,1.149,21.549,5.808,3.845,25.857,8.273,1.428,25.228,8.148,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
my_sample.columns.to_list()

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'AGRONOMIA (BACHARELADO)',
 'ARQUITETURA E URBANISMO (BACHARELADO)',
 'ARQUIVOLOGIA (BACHARELADO)',
 'ARTES CÊNICAS - INTERPRETAÇÃO TEATRAL (BACHARELADO)',
 'ARTES VISUAIS (BACHARELADO)',
 'ARTES VISUAIS (LICENCIATURA)',
 'BIBLIOTECONOMIA (BACHARELADO)',
 'BIOTECNOLOGIA (BACHARELADO)',
 'CAMPUS UNB CEILÂNDIA (FCE) ENFERMAGEM (BACHARELADO)',
 'CAMPUS UNB PLANALTINA (FUP) – DIURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CIÊNCIA DA COMPUTAÇÃO (BACHARELADO)',
 'CIÊNCIA POLÍTICA (BACHARELADO)',
 'CIÊNCIAS AMBIENTAIS (BACHARELADO)',
 'CIÊNCIAS BIOLÓGICAS (BACHARELADO)',
 'CIÊNCIAS CONTÁBEIS (BACHARELADO)',
 'CIÊNCIAS ECONÔMICAS (BACHARELADO)',
 'CIÊNCIAS SOCIAIS – ANTROPOLOGIA / SOCIOLOGIA 

In [None]:
model_loaded.predict(new_sample)

array([0], dtype=int64)

## Stratified KFold + Hyperameter tuning + Simple Flags

In [None]:
list(df.columns)

['numero_inscricao',
 'nome_x',
 'escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 'classificacao_final_universal',
 'classificacao_final_cotas_negros',
 'classificacao_final_publicas1',
 'classificacao_final_publicas2',
 'classificacao_final_publicas3',
 'classificacao_final_publicas4',
 'classificacao_final_publicas5',
 'classificacao_final_publicas6',
 'classificacao_final_publicas7',
 'classificacao_final_publicas8',
 'course',
 'cotista',
 'cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag',
 'nome_y',
 'campus',
 'curso',
 'periodo',
 '_merge',
 'label',
 'AGRONOMIA (BACHARELADO)',
 'ARQUITETURA E URBANISMO (BACHARELADO)',
 'ARQUIVOLOGIA (BACHARELADO)',
 'ARTES 

In [None]:
publicas_flags = [
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']
df['publicas_flag'] = df[publicas_flags].sum(axis=1)

In [None]:
FEATURES = [col for col in FEATURES if col not in publicas_flags] 
print(len(FEATURES))


102


In [None]:
FEATURES.append('publicas_flag')

In [None]:
print(len(FEATURES))


103


In [None]:

X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

In [None]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1220
           1       0.58      0.58      0.58       233

    accuracy                           0.87      1453
   macro avg       0.75      0.75      0.75      1453
weighted avg       0.87      0.87      0.87      1453

