In [1]:
import pickle
import re

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

from RandomForestClassifierGridSearch import RandomForestClassifierGridSearch



In [3]:
# Load both inputs up front: `approvals` is needed by the merge cell further
# down, so leaving it commented out breaks a fresh-kernel "Run All".
scores = pd.read_parquet('../data/scores.parquet')
approvals = pd.read_parquet('../data/approvals.parquet')
scores.shape

(7301, 23)

In [3]:
# Drop rows whose course name contains 'JUDICE'. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not
# trigger SettingWithCopyWarning.
scores = scores[~scores.course.str.contains('JUDICE')].copy()

In [4]:
scores.head()

Unnamed: 0,numero_inscricao,nome,escore_bruto_p1_etapa1,escore_bruto_p2_etapa1,nota_redacao_etapa1,escore_bruto_p1_etapa2,escore_bruto_p2_etapa2,nota_redacao_etapa2,escore_bruto_p1_etapa3,escore_bruto_p2_etapa3,...,classificacao_final_cotas_negros,classificacao_final_publicas1,classificacao_final_publicas2,classificacao_final_publicas3,classificacao_final_publicas4,classificacao_final_publicas5,classificacao_final_publicas6,classificacao_final_publicas7,classificacao_final_publicas8,course
0,20199023,Amanda Amorim Luz,5.172,14.653,6.947,3.845,19.994,7.222,4.998,16.66,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
1,20180980,Amanda Larissa Oliveira dos Santos,6.896,48.271,3.8,6.152,54.342,7.200,5.712,46.648,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
2,20188220,Amanda Luisa de Oliveira Souza,0.0,23.704,5.333,0.769,24.303,6.909,2.142,25.465,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
3,20193280,Amanda Mendes Reis de Araujo,0.0,10.342,8.963,2.307,14.866,9.5 86,1.428,13.088,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
4,20188170,Ana Beatriz Cattermol Cavalcante,0.0,10.917,2.848,0.0,10.764,3.750,6.426,17.373,...,1.0,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...


In [5]:
# Quota ranking columns = every 'classificacao' column except the universal
# ranking. The universal column is excluded by name (not by position) so the
# result does not depend on the column order of the parquet file.
universal_column = 'classificacao_final_universal'
classificacao_columns = [col for col in scores.columns if 'classificacao' in col]
assert universal_column in classificacao_columns, (
    f"expected column {universal_column!r} in scores"
)
cotas_columns = [col for col in classificacao_columns if col != universal_column]
cotas_columns

['classificacao_final_cotas_negros',
 'classificacao_final_publicas1',
 'classificacao_final_publicas2',
 'classificacao_final_publicas3',
 'classificacao_final_publicas4',
 'classificacao_final_publicas5',
 'classificacao_final_publicas6',
 'classificacao_final_publicas7',
 'classificacao_final_publicas8']

In [6]:
# 1 when the row has a value in at least one quota ranking column, else 0.
has_any_cota = scores[cotas_columns].notna().any(axis='columns')
scores['cotista'] = has_any_cota.astype(int)

In [7]:
# One 0/1 indicator per quota ranking column, named after the column with the
# 'classificacao_final_' part removed and '_flag' appended.
for cota_column in cotas_columns:
    flag_name = cota_column.replace("classificacao_final_", "") + "_flag"
    scores[flag_name] = scores[cota_column].notna().astype(int)

In [8]:
# Names of all indicator columns (every column whose name contains 'flag').
flags_columns = [col for col in scores.columns if 'flag' in col]

In [9]:
flags_columns

['cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [10]:
# Left join keeps every score row; the `_merge` indicator column records
# whether a matching row was found in `approvals`.
df = scores.merge(approvals, how='left', on='numero_inscricao', indicator=True)

In [11]:
# Target: 1 when the row matched an approvals record (indicator 'both'), else 0.
df['label'] = (df['_merge'] == 'both').astype(int)

In [12]:
# Initial model inputs: the per-stage score columns (p1, p2, redacao for
# etapa 1-3) plus 'argumento_final'. These are converted to float by
# convert_string_to_float below. NOTE: later cells mutate this list in place
# (extend/append/remove), so its contents depend on which cells have run.
FEATURES = ['escore_bruto_p1_etapa1',
            'escore_bruto_p2_etapa1',
            'nota_redacao_etapa1',
            'escore_bruto_p1_etapa2',
            'escore_bruto_p2_etapa2',
            'nota_redacao_etapa2',
            'escore_bruto_p1_etapa3',
            'escore_bruto_p2_etapa3',
            'nota_redacao_etapa3',
            'argumento_final']

In [13]:
def convert_string_to_float(df, colnames):
    """Convert text-formatted numeric columns of ``df`` to floats, in place.

    Text columns are cleaned by removing spaces and the characters ``R`` and
    ``$``, then replacing the decimal comma with a dot, before being cast to
    ``float``. Columns that are already numeric are only cast, so the function
    can safely be applied again to a frame it has already converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns; it is modified in place.
    colnames : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a cleaned value still cannot be parsed as a float.
    """
    for colname in colnames:
        column = df[colname]
        # The `.str` accessor raises on numeric columns, so only clean text.
        if not pd.api.types.is_numeric_dtype(column):
            column = (
                # Single pass: inside a character class `$` is a literal.
                column.str.replace('[ R$]', "", regex=True)
                .str.replace(',', ".", regex=False)
            )
        df[colname] = column.astype(float)
    return df

In [14]:
df = convert_string_to_float(df, FEATURES)

In [15]:
# One indicator column per distinct course name.
course_dummies = pd.get_dummies(df.course)
course_dummies_columns = list(course_dummies.columns)
# Remove indicator columns left over from a previous run of this cell before
# concatenating, so re-running does not leave duplicated columns in `df`.
df = pd.concat(
    [df.drop(columns=course_dummies_columns, errors='ignore'), course_dummies],
    axis=1,
)

In [16]:
# Add the course indicators to the feature list. Names already present are
# skipped so that re-running this cell does not duplicate features.
FEATURES.extend([col for col in course_dummies_columns if col not in FEATURES])

In [17]:
len(FEATURES)

101

## Baseline Model

In [18]:
# Hold out 20% of the rows as the test set (80% train / 20% test); the fixed
# seed keeps the split reproducible.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

In [19]:
# Baseline: a random forest with default hyperparameters and a fixed seed,
# fitted directly on the training split (no hyperparameter search here).
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1 as a nested dict.
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.8492773571920165


{'0': {'precision': 0.8618944323933478,
  'recall': 0.9770491803278688,
  'f1-score': 0.9158663081060315,
  'support': 1220},
 '1': {'precision': 0.6,
  'recall': 0.18025751072961374,
  'f1-score': 0.27722772277227725,
  'support': 233},
 'accuracy': 0.8492773571920165,
 'macro avg': {'precision': 0.7309472161966739,
  'recall': 0.5786533455287413,
  'f1-score': 0.5965470154391543,
  'support': 1453},
 'weighted avg': {'precision': 0.8198975963660595,
  'recall': 0.8492773571920165,
  'f1-score': 0.8134555783174804,
  'support': 1453}}

## Baseline Model + cotista

In [20]:
# Add the quota indicator once; the guard keeps the cell safe to re-run.
if 'cotista' not in FEATURES:
    FEATURES.append('cotista')

In [21]:
len(FEATURES)

102

In [22]:
# Same 80% train / 20% test split and seed as before, rebuilt from the
# current contents of FEATURES.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

In [23]:
# Default-hyperparameter random forest (fixed seed), fitted on the current
# training split; this is a plain fit, not a search.
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Accuracy = share of test rows whose predicted label equals the true label.
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Per-class precision / recall / F1 as a nested dict.
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.8582243633860978


{'0': {'precision': 0.868995633187773,
  'recall': 0.978688524590164,
  'f1-score': 0.920585967617579,
  'support': 1220},
 '1': {'precision': 0.6708860759493671,
  'recall': 0.22746781115879827,
  'f1-score': 0.33974358974358976,
  'support': 233},
 'accuracy': 0.8582243633860978,
 'macro avg': {'precision': 0.76994085456857,
  'recall': 0.6030781678744811,
  'f1-score': 0.6301647786805844,
  'support': 1453},
 'weighted avg': {'precision': 0.8372272045321992,
  'recall': 0.8582243633860978,
  'f1-score': 0.8274433151436358,
  'support': 1453}}

## Baseline Model + Flags

In [24]:
# Add the per-quota indicator columns. Names already present are skipped so
# that re-running this cell does not duplicate features.
FEATURES.extend([col for col in flags_columns if col not in FEATURES])
len(FEATURES)

111

In [25]:
# 80% train / 20% test split with the fixed seed, rebuilt from the current
# contents of FEATURES.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

In [27]:
# Default-hyperparameter random forest (fixed seed), fitted on the current
# training split; this is a plain fit, not a search.
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split, as a nested dict.
y_pred = model.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.8802045288531775,
  'recall': 0.9877049180327869,
  'f1-score': 0.9308613364233295,
  'support': 1220},
 '1': {'precision': 0.8214285714285714,
  'recall': 0.296137339055794,
  'f1-score': 0.4353312302839117,
  'support': 233},
 'accuracy': 0.8768066070199587,
 'macro avg': {'precision': 0.8508165501408744,
  'recall': 0.6419211285442905,
  'f1-score': 0.6830962833536206,
  'support': 1453},
 'weighted avg': {'precision': 0.8707793409110348,
  'recall': 0.8768066070199587,
  'f1-score': 0.8513991790038633,
  'support': 1453}}

## Hyperparameter tuning + class_weight

In [None]:
# Candidate hyperparameter values handed to the grid search below.
param_grid = dict(
    # number of trees in the forest
    n_estimators=[100, 200, 300, 500],
    # maximum depth of each tree
    max_depth=[5, 10, 20, 40, 60],
    # minimum samples required to split an internal node
    min_samples_split=[5, 10, 20, 40, 60],
    # minimum samples required at a leaf node
    min_samples_leaf=[2, 8, 16, 32],
)

In [None]:
# Run the project's grid-search wrapper on the current training split.
# NOTE(review): the wrapper's internals are not visible in this notebook;
# judging by its name it searches RandomForestClassifier hyperparameters —
# confirm in the RandomForestClassifierGridSearch module.
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)
rf_gs.fit(param_grid)

# Evaluate on the held-out split and show the returned report.
report = rf_gs.evaluate(X_test, y_test)
print("Classification Report:", report, sep="\n")

NameError: name 'RandomForestClassifierGridSearch' is not defined

In [None]:
# Persist the model to disk (requires `pickle` from the top import cell).
# NOTE(review): `model` is the default-hyperparameter forest fitted in the
# "Baseline Model + Flags" section, not the grid-search result held by
# `rf_gs`, although the file name says "tuned" — confirm which object should
# be saved here.
with open('model_flags_tuned.pickle', 'wb') as f:
    pickle.dump(model, f)

In [None]:
#with open('model_flags_tuned.pickle','wb') as f:
#  pickle.dump(model, f)

## Stratified KFold + Hyperparameter tuning + Flags

In [26]:
# Drop 'argumento_final' from the inputs. The membership guard keeps the cell
# safe to re-run (a bare list.remove raises ValueError the second time).
if 'argumento_final' in FEATURES:
    FEATURES.remove('argumento_final')

In [27]:
len(FEATURES)

110

In [28]:
# 80% train / 20% test split with the fixed seed, rebuilt from the current
# contents of FEATURES.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=47
)

In [29]:
# Candidate hyperparameter values handed to the grid search below.
param_grid = dict(
    # number of trees in the forest
    n_estimators=[100, 200, 300],
    # maximum depth of each tree
    max_depth=[5, 10, 20, 40, 80],
    # minimum samples required to split an internal node
    min_samples_split=[5, 10, 20, 50],
    # minimum samples required at a leaf node
    min_samples_leaf=[2, 8, 16, 32],
)

In [30]:
X_train.shape

(5810, 110)

In [31]:
# Run the project's grid-search wrapper on the current training split.
# NOTE(review): the wrapper's internals are not visible in this notebook —
# confirm its cross-validation and scoring setup in its own module.
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)
rf_gs.fit(param_grid)

# Evaluate on the held-out split and print the returned report.
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.91      0.92      1220
           1       0.57      0.60      0.58       233

    accuracy                           0.86      1453
   macro avg       0.74      0.75      0.75      1453
weighted avg       0.86      0.86      0.86      1453



In [32]:
# Persist the grid-search result under this name.
# NOTE(review): the load cell below reads
# '../ml_dev/models/<name>.pickle', so save_model presumably writes there —
# confirm against the wrapper's implementation.
rf_gs.save_model('stratified_kfold_classweight15_tuned_model')

In [38]:
import pickle

In [39]:
# Reload the persisted model from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open('../ml_dev/models/stratified_kfold_classweight15_tuned_model.pickle', 'rb') as model_file:
    model_loaded = pickle.load(model_file)

In [40]:
# Score the reloaded model on the held-out split; the report is returned as a
# nested dict of per-class precision / recall / F1.
y_pred = model_loaded.predict(X_test)
classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

{'0': {'precision': 0.9221209610604806,
  'recall': 0.9122950819672131,
  'f1-score': 0.9171817058096416,
  'support': 1220},
 '1': {'precision': 0.5650406504065041,
  'recall': 0.5965665236051502,
  'f1-score': 0.5803757828810021,
  'support': 233},
 'accuracy': 0.8616655196145905,
 'macro avg': {'precision': 0.7435808057334923,
  'recall': 0.7544308027861817,
  'f1-score': 0.7487787443453219,
  'support': 1453},
 'weighted avg': {'precision': 0.8648603193657961,
  'recall': 0.8616655196145905,
  'f1-score': 0.8631722219539134,
  'support': 1453}}

In [33]:
def predict_approval(model, new_data):
    """Return the approval probability predicted for the first row of ``new_data``.

    Parameters
    ----------
    model
        Any object exposing ``predict_proba(new_data)``.
    new_data
        Input rows accepted by ``model.predict_proba``; only the first row's
        prediction is used.

    Returns
    -------
    The second entry of the first row of ``predict_proba`` output, rounded to
    three decimal places.
    """
    first_row_probabilities = model.predict_proba(new_data)[0]
    return round(first_row_probabilities[1], ndigits=3)

In [55]:
FEATURES

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)',
 '2.1.2 CAMPUS  DARCY RIBEIRO – NOTURNO  ADMINISTRAÇÃO (BACHARELADO)',
 '2.1.3 CAMPUS  UNB CEILÂNDIA (FCE)  ENFERMAGEM (BACHARELADO)',
 '2.1.4 CAMPUS  UNB GAMA (FGA)  ENGENHARIAS – AEROESPACIAL / AUTOMOTIVA / ELETRÔNICA / ENERGIA / SOFTWARE (BACHARELADOS)**',
 '2.1.5  CAMPUS  UNB PLANALTINA (FUP) – DIURNO  CIÊNCIAS NATURAIS (L ICENCIATURA)',
 '2.1.6  CAMPUS  UNB PLANALTINA (FUP) – NOTURNO  CIÊNCIAS NATURAI S (LICENCIATURA)',
 'AGRONO MIA (BACHARELADO)',
 'ARQUITETURA E URBANISMO (BACHARELADO)',
 'ARQUIVOLOGIA (BACHARELADO)',
 'ARTES CÊNICAS - INTERPRETAÇÃO TEATRAL (BACHARELADO) *',
 'ARTES VISUAIS (BACHARELADO)*',
 'ARTES VISUAIS (LICENCIATURA)*',
 'BIBLIOTECONOMIA (B

In [45]:
# Feature names carried by a single hand-written sample, in sample order.
covariables = [
    "escore_bruto_p1_etapa1",
    "escore_bruto_p2_etapa1",
    "nota_redacao_etapa1",
    "escore_bruto_p1_etapa2",
    "escore_bruto_p2_etapa2",
    "nota_redacao_etapa2",
    "escore_bruto_p1_etapa3",
    "escore_bruto_p2_etapa3",
    "nota_redacao_etapa3",
    "argumento_final",
    "2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)",
    "cotista",
    "cotas_negros_flag",
    "publicas1_flag",
    "publicas2_flag",
    "publicas3_flag",
    "publicas4_flag",
    "publicas5_flag",
    "publicas6_flag",
    "publicas7_flag",
    "publicas8_flag",
]

In [54]:
covariables

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)',
 'cotista',
 'cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [50]:
# True only when every sample covariable is a known model feature. The
# membership test must be the value being aggregated: filtering with `if` and
# collecting literal True values is always True, even when names are missing.
all(covariable in FEATURES for covariable in covariables)

True

In [53]:
all([True, True, True])

True

In [101]:
X_test.iloc[4].to_dict()

{'escore_bruto_p1_etapa1': 0.0,
 'escore_bruto_p2_etapa1': 29.595,
 'nota_redacao_etapa1': 7.524,
 'escore_bruto_p1_etapa2': 4.614,
 'escore_bruto_p2_etapa2': 19.737,
 'nota_redacao_etapa2': 7.2,
 'escore_bruto_p1_etapa3': 2.856,
 'escore_bruto_p2_etapa3': 35.7,
 'nota_redacao_etapa3': 7.369,
 'argumento_final': 14.603,
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)': 0.0,
 '2.1.2 CAMPUS  DARCY RIBEIRO – NOTURNO  ADMINISTRAÇÃO (BACHARELADO)': 0.0,
 '2.1.3 CAMPUS  UNB CEILÂNDIA (FCE)  ENFERMAGEM (BACHARELADO)': 0.0,
 '2.1.4 CAMPUS  UNB GAMA (FGA)  ENGENHARIAS – AEROESPACIAL / AUTOMOTIVA / ELETRÔNICA / ENERGIA / SOFTWARE (BACHARELADOS)**': 0.0,
 '2.1.5  CAMPUS  UNB PLANALTINA (FUP) – DIURNO  CIÊNCIAS NATURAIS (L ICENCIATURA)': 0.0,
 '2.1.6  CAMPUS  UNB PLANALTINA (FUP) – NOTURNO  CIÊNCIAS NATURAI S (LICENCIATURA)': 0.0,
 'AGRONO MIA (BACHARELADO)': 0.0,
 'ARQUITETURA E URBANISMO (BACHARELADO)': 0.0,
 'ARQUIVOLOGIA (BACHARELADO)': 0.0,
 'ARTES CÊNICAS - INTERPRETAÇÃO 

In [100]:
y_test

510     0
6079    0
1730    0
5231    0
3514    1
       ..
639     0
4744    0
4629    0
3591    0
171     0
Name: label, Length: 1453, dtype: int64

In [27]:
from samples import sample_approved, sample_not_approved

In [42]:
list(sample_not_approved.keys())

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)',
 'cotista',
 'cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [37]:
# Build a one-row frame with one column per entry of FEATURES, in that order.
# Features absent from the sample default to 0.0 up front, so every column is
# numeric rather than an object column holding None; fillna(0) still covers
# keys that are present but empty.
sample_row = {col: sample_not_approved.get(col, 0.0) for col in FEATURES}
new_sample = pd.DataFrame([sample_row]).fillna(0)

In [38]:
new_sample

Unnamed: 0,escore_bruto_p1_etapa1,escore_bruto_p2_etapa1,nota_redacao_etapa1,escore_bruto_p1_etapa2,escore_bruto_p2_etapa2,nota_redacao_etapa2,escore_bruto_p1_etapa3,escore_bruto_p2_etapa3,nota_redacao_etapa3,argumento_final,...,cotista,cotas_negros_flag,publicas1_flag,publicas2_flag,publicas3_flag,publicas4_flag,publicas5_flag,publicas6_flag,publicas7_flag,publicas8_flag
0,5.172,14.653,6.947,3.845,19.994,7.222,4.998,16.66,8.06,-25.699,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
predict_approval(model_loaded, new_data=new_sample)

0.073