In [1]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report

from RandomForestClassifierGridSearch import RandomForestClassifierGridSearch
import pickle

In [2]:
scores = pd.read_parquet('../data/processed/scores.parquet')
approvals = pd.read_parquet('../data/processed/approvals.parquet')
scores.shape

(7263, 23)

In [3]:
cotas_columns = [col for col in scores.columns if 'classificacao' in col]
cotas_columns.pop(0) # removing 'classificacao_final_universal'
cotas_columns

['classificacao_final_cotas_negros',
 'classificacao_final_publicas1',
 'classificacao_final_publicas2',
 'classificacao_final_publicas3',
 'classificacao_final_publicas4',
 'classificacao_final_publicas5',
 'classificacao_final_publicas6',
 'classificacao_final_publicas7',
 'classificacao_final_publicas8']

In [4]:
scores['cotista'] = scores[cotas_columns].notnull().any(axis=1).astype(int)

In [5]:
for column in cotas_columns:
    colum_name = re.sub("classificacao_final_", "", f'{column}_flag')
    scores[colum_name] = scores[column].notnull().astype(int)

In [6]:
flags_columns = list(scores.columns[scores.columns.str.contains('flag')])

In [7]:
flags_columns

['cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [8]:
df = pd.merge(scores, approvals, how='left', on='numero_inscricao', indicator=True)

In [9]:
df[df.curso.notna()][['numero_inscricao', 'nome_x','nome_y', 'course', 'curso']]

Unnamed: 0,numero_inscricao,nome_x,nome_y,course,curso
4,20188170,Ana Beatriz Cattermol Cavalcante,Ana Beatriz Cattermol Cavalcante,DIURNO ADMINISTRAÇÃO (BACHARELADO),Administração (Bacharelado)
6,20183676,Ana Clara Marques da Silva,Ana Clara Marques da Silva,DIURNO ADMINISTRAÇÃO (BACHARELADO),Administração (Bacharelado)
9,20170011,Ana Paula Nunes Bezerra,Ana Paula Nunes Bezerra,DIURNO ADMINISTRAÇÃO (BACHARELADO),Administração (Bacharelado)
24,20184514,Bruno dos Santos Fernandes,Bruno dos Santos Fernandes,DIURNO ADMINISTRAÇÃO (BACHARELADO),Administração (Bacharelado)
29,20181503,Carina da Silva Ferreira,Carina da Silva Ferreira,DIURNO ADMINISTRAÇÃO (BACHARELADO),Administração (Bacharelado)
...,...,...,...,...,...
7257,20176727,Maria Clara de Melo Pierre,Maria Clara de Melo Pierre,CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS...,Ciências Naturais (Licenciatura)
7258,21270449,Sarah de Oliveira Nascimento,Sarah de Oliveira Nascimento,CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS...,Ciências Naturais (Licenciatura)
7259,20102521,Eloisa Graziele Rodrigues de Ara ujo,Eloisa Graziele Rodrigues de Araujo,GESTÃO AMBIENTAL (BACHARELADO),Gestão Ambiental (Bacharelado)
7260,20190553,Julia de Sousa Vale,Julia de Sousa Vale,GESTÃO AMBIENTAL (BACHARELADO),Gestão Ambiental (Bacharelado)


In [10]:
df['label'] = df._merge.apply(lambda x: 1 if x == 'both' else 0)

In [11]:
FEATURES = ['escore_bruto_p1_etapa1',
            'escore_bruto_p2_etapa1',
            'nota_redacao_etapa1',
            'escore_bruto_p1_etapa2',
            'escore_bruto_p2_etapa2',
            'nota_redacao_etapa2',
            'escore_bruto_p1_etapa3',
            'escore_bruto_p2_etapa3',
            'nota_redacao_etapa3',
            'argumento_final']

In [12]:
def convert_string_to_float(df, colnames):
    for colname in colnames:
        df[colname] = df[colname].str.replace(' ', "", regex=True)
        df[colname] = df[colname].str.replace('[R$]', "", regex=True)
        df[colname] = df[colname].str.replace(',', ".", regex=False)
        df[colname] = df[colname].apply(float)
    return df

In [13]:
df = convert_string_to_float(df, FEATURES)

In [14]:
course_dummies = pd.get_dummies(df.course)
course_dummies_columns = list(course_dummies.columns)
df = pd.concat([df, course_dummies], axis=1)

In [15]:
FEATURES.extend(course_dummies_columns)

In [16]:
len(FEATURES)

101

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7263 entries, 0 to 7262
Columns: 130 entries, numero_inscricao to TURISMO (BACHARELADO)
dtypes: category(1), float64(12), int32(10), int64(1), object(15), uint8(91)
memory usage: 2.5+ MB


## Baseline Model

In [18]:
X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

In [19]:
# Create a Random Forest Classifier
model = RandomForestClassifier(random_state=47)

# Fit randomized search
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.848589125946318


{'0': {'precision': 0.8591954022988506,
  'recall': 0.980327868852459,
  'f1-score': 0.9157733537519143,
  'support': 1220},
 '1': {'precision': 0.6065573770491803,
  'recall': 0.15879828326180256,
  'f1-score': 0.25170068027210885,
  'support': 233},
 'accuracy': 0.848589125946318,
 'macro avg': {'precision': 0.7328763896740155,
  'recall': 0.5695630760571309,
  'f1-score': 0.5837370170120115,
  'support': 1453},
 'weighted avg': {'precision': 0.8186829040998326,
  'recall': 0.848589125946318,
  'f1-score': 0.8092840675022277,
  'support': 1453}}

## Baseline Model + cotista

In [20]:
FEATURES.append('cotista')

In [21]:
len(FEATURES)

102

In [22]:
X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

In [23]:
# Create a Random Forest Classifier
model = RandomForestClassifier(random_state=47)

# Fit randomized search
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.8623537508602891


{'0': {'precision': 0.8701015965166908,
  'recall': 0.9827868852459016,
  'f1-score': 0.9230177059276367,
  'support': 1220},
 '1': {'precision': 0.72,
  'recall': 0.2317596566523605,
  'f1-score': 0.35064935064935066,
  'support': 233},
 'accuracy': 0.8623537508602891,
 'macro avg': {'precision': 0.7950507982583455,
  'recall': 0.6072732709491311,
  'f1-score': 0.6368335282884936,
  'support': 1453},
 'weighted avg': {'precision': 0.8460316226774692,
  'recall': 0.8623537508602891,
  'f1-score': 0.8312339297543121,
  'support': 1453}}

## Baseline Model + Flags

In [24]:
FEATURES.extend(flags_columns)
len(FEATURES)

111

In [57]:
X = df[FEATURES] # features
y = df['label'] # labelss
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

In [26]:
# Create a Random Forest Classifier
model = RandomForestClassifier(random_state=47)

# Fit randomized search
model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.8709912536443148,
  'recall': 0.9851607584501236,
  'f1-score': 0.9245647969052223,
  'support': 1213},
 '1': {'precision': 0.797752808988764,
  'recall': 0.2862903225806452,
  'f1-score': 0.42136498516320475,
  'support': 248},
 'accuracy': 0.86652977412731,
 'macro avg': {'precision': 0.8343720313165395,
  'recall': 0.6357255405153844,
  'f1-score': 0.6729648910342135,
  'support': 1461},
 'weighted avg': {'precision': 0.8585592657767059,
  'recall': 0.86652977412731,
  'f1-score': 0.8391482648641406,
  'support': 1461}}

## Hyperparameter tuning + class_weight

In [27]:
param_grid = {
    'n_estimators': [100, 200, 300, 500],      # Number of trees in the forest
    'max_depth': [5, 10, 20, 40, 60],            # Maximum depth of each tree
    'min_samples_split': [5, 10, 20, 40, 60],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 8, 16, 32]          # Minimum number of samples required to be at a leaf node
}

In [30]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print("Classification Report:")
print(report)

KeyboardInterrupt: 

In [None]:
with open('model_flags_tuned.pickle','wb') as f:
    pickle.dump(model, f)

In [None]:
#with open('model_flags_tuned.pickle','wb') as f:
#  pickle.dump(model, f)

## Stratified KFold + Hyperameter tuning + Flags

In [25]:
FEATURES.remove('argumento_final')
print(len(FEATURES))

X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

110


In [32]:
# # Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [5, 10, 20, 40, 80],            # Maximum depth of each tree
    'min_samples_split': [5, 10, 20, 50],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 8, 16, 32],          # Minimum number of samples required to be at a leaf node
}

In [37]:
X_train.shape

(5810, 110)

In [38]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1220
           1       0.57      0.64      0.60       233

    accuracy                           0.86      1453
   macro avg       0.75      0.77      0.76      1453
weighted avg       0.87      0.86      0.87      1453



In [39]:
rf_gs.save_model('stratified_kfold_classweight15_tuned_model_spelling')

In [27]:
with open('../ml_dev/models/stratified_kfold_classweight15_tuned_model_spelling.pickle', 'rb') as f:
  model_loaded = pickle.load(f)

In [28]:
y_pred = model_loaded.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.9286912751677853,
  'recall': 0.9073770491803279,
  'f1-score': 0.9179104477611939,
  'support': 1220},
 '1': {'precision': 0.5670498084291188,
  'recall': 0.6351931330472103,
  'f1-score': 0.5991902834008097,
  'support': 233},
 'accuracy': 0.8637302133516862,
 'macro avg': {'precision': 0.747870541798452,
  'recall': 0.7712850911137691,
  'f1-score': 0.7585503655810018,
  'support': 1453},
 'weighted avg': {'precision': 0.8706992161518806,
  'recall': 0.8637302133516862,
  'f1-score': 0.8668011578121441,
  'support': 1453}}

In [74]:
def predict_approval(model, new_data):
    
    # Predict diabetes
    predictions = model.predict_proba(new_data)
    approval_prob = round(predictions[0][1], ndigits=3)

    #pred_to_label = {0: 'Negative', 1: 'Positive'}

    # Make a list of predictions
    #data = []
    #for t, pred in zip(new_data, predictions):
    #    data.append({'prediction': pred[0]})

    return approval_prob

In [63]:
FEATURES

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'AGRONOMIA (BACHARELADO)',
 'ARQUITETURA E URBANISMO (BACHARELADO)',
 'ARQUIVOLOGIA (BACHARELADO)',
 'ARTES CÊNICAS - INTERPRETAÇÃO TEATRAL (BACHARELADO)',
 'ARTES VISUAIS (BACHARELADO)',
 'ARTES VISUAIS (LICENCIATURA)',
 'BIBLIOTECONOMIA (BACHARELADO)',
 'BIOTECNOLOGIA (BACHARELADO)',
 'CAMPUS UNB CEILÂNDIA (FCE) ENFERMAGEM (BACHARELADO)',
 'CAMPUS UNB PLANALTINA (FUP) – DIURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CAMPUS UNB PLANALTINA (FUP) – NOTURNO CIÊNCIAS NATURAIS (LICENCIATURA)',
 'CIÊNCIA DA COMPUTAÇÃO (BACHARELADO)',
 'CIÊNCIA POLÍTICA (BACHARELADO)',
 'CIÊNCIAS AMBIENTAIS (BACHARELADO)',
 'CIÊNCIAS BIOLÓGICAS (BACHARELADO)',
 'CIÊNCIAS CONTÁBEIS (BACHARELADO)',
 'CIÊNCIAS ECONÔMICAS (BACHARELADO)',
 'CIÊNCIAS SOCIAIS – ANTROPOLOGIA / SOCIOLOGIA 

In [42]:
covariables = list({"escore_bruto_p1_etapa1": 5.172,
                            "escore_bruto_p2_etapa1": 14.653,
                            "nota_redacao_etapa1": 6.947,
                            "escore_bruto_p1_etapa2": 3.845,
                            "escore_bruto_p2_etapa2": 19.994,
                            "nota_redacao_etapa2": 7.222,
                            "escore_bruto_p1_etapa3": 4.998,
                            "escore_bruto_p2_etapa3": 16.66,
                            "nota_redacao_etapa3": 8.06,
                            "argumento_final": -25.699,
                            "2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)": 1.0,               
                            "cotista": 0.0,
                            "cotas_negros_flag": 0.0,
                            "publicas1_flag": 0.0,
                            "publicas2_flag": 0.0,
                            "publicas3_flag": 0.0,
                            "publicas4_flag": 0.0,
                            "publicas5_flag": 0.0,
                            "publicas6_flag": 0.0,
                            "publicas7_flag": 0.0,
                            "publicas8_flag": 0.0}.keys())

In [30]:
from samples import sample_approved, sample_not_approved

In [43]:
{'teste':sample_not_approved.get('cotista')}

{'teste': 0.0}

In [36]:
list(sample_not_approved.keys())

['escore_bruto_p1_etapa1',
 'escore_bruto_p2_etapa1',
 'nota_redacao_etapa1',
 'escore_bruto_p1_etapa2',
 'escore_bruto_p2_etapa2',
 'nota_redacao_etapa2',
 'escore_bruto_p1_etapa3',
 'escore_bruto_p2_etapa3',
 'nota_redacao_etapa3',
 'argumento_final',
 '2.1.1 CAMPUS  DARCY RIBEIRO – DIURNO  ADMINISTRAÇÃO (BACHARELADO)',
 'cotista',
 'cotas_negros_flag',
 'publicas1_flag',
 'publicas2_flag',
 'publicas3_flag',
 'publicas4_flag',
 'publicas5_flag',
 'publicas6_flag',
 'publicas7_flag',
 'publicas8_flag']

In [76]:
new_sample = sample_not_approved
#new_sample = pd.DataFrame([{col: new_sample.get(col) for col in FEATURES}])
#new_sample = new_sample.fillna(0)

In [75]:
predict_approval(model_loaded, new_data=new_sample)

0.104

## Stratified KFold + Hyperameter tuning + Flags + SMOTE

In [25]:
from imblearn.over_sampling import SMOTE
import numpy as np

In [26]:
FEATURES.remove('argumento_final')
print(len(FEATURES))

X = df[FEATURES] # features
y = df['label'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=47) # 70% training and 30% test

110


In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

In [29]:
# Apply SMOTE to the training set
smote = SMOTE(random_state=47)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # Train a logistic regression model on the resampled data
# model = LogisticRegression()
# model.fit(X_train_resampled, y_train_resampled)

# # Make predictions on the test set
# y_pred = model.predict(X_test)

# # Evaluate the model
# print(classification_report(y_test, y_pred))


In [30]:
pd.Series(y_train).value_counts(dropna=False)
pd.Series(y_train_resampled).value_counts(dropna=False)

0    4747
1    4747
Name: label, dtype: int64

In [32]:
# param_grid = {
#     'n_estimators': [100, 200, 300, 500],      # Number of trees in the forest
#     'max_depth': [None, 5, 10, 20, 40, 60],            # Maximum depth of each tree
#     'min_samples_split': [2, 5, 10, 20, 40, 60],       # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4, 8, 16, 32],          # Minimum number of samples required to be at a leaf node
#     'max_features': ['sqrt', 'log2']
# }

param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees in the forest
    'max_depth': [2, 5, 10, 20, 30],            # Maximum depth of each tree
    'min_samples_split': [5, 10, 20, 50],       # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 8, 16, 32],          # Minimum number of samples required to be at a leaf node
}

In [33]:
# Assuming you have your training data X_train and corresponding labels y_train

# Instantiate the class
rf_gs = RandomForestClassifierGridSearch(X_train_resampled, y_train_resampled)

# Fit the classifier using GridSearchCV
rf_gs.fit(param_grid)

# Assuming you have your test data X_test and corresponding labels y_test

# Evaluate the best model on the test data
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.90      0.91      1220
           1       0.53      0.62      0.57       233

    accuracy                           0.85      1453
   macro avg       0.73      0.76      0.74      1453
weighted avg       0.86      0.85      0.86      1453



In [34]:
rf_gs.get_best_params()

{'max_depth': 30,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200}

In [36]:
df.course.value_counts(dropna=False).to_dict()

{'MEDICINA (BACHARELADO)': 640,
 'DIREITO (BACHARELADO)': 601,
 'ENGENHARIAS – AEROESPACIAL / AUTOMOTIVA / ELETRÔNICA / ENERGIA / SOFTWARE (BACHARELADOS)': 440,
 'PSICOLOGIA (BACHARELADO / LICENCIATURA / PSICÓLOGO)': 429,
 'CIÊNCIA DA COMPUTAÇÃO (BACHARELADO)': 423,
 'ARQUITETURA E URBANISMO (BACHARELADO)': 226,
 'RELAÇÕES INTERNACIONAIS (BACHARELADO)': 224,
 'ENFERMAGEM (BACHARELADO)': 222,
 'FARMÁCIA (BACHARELADO)': 221,
 'MEDICINA VETERINÁRIA (BACHARELADO)': 219,
 'NUTRIÇÃO (BACHARELADO)': 193,
 'ODONTOLOGIA (BACHARELADO)': 180,
 'CIÊNCIAS BIOLÓGICAS (BACHARELADO)': 166,
 'CAMPUS UNB CEILÂNDIA (FCE) ENFERMAGEM (BACHARELADO)': 162,
 'ENGENHARIA DE COMPUTAÇÃO (BACHARELADO)': 148,
 'CIÊNCIAS ECONÔMICAS (BACHARELADO)': 144,
 'DESIGN – PROGRAMAÇÃO VISUAL/PROJETO DO PRODUTO (BACHARELADO)': 139,
 'DIURNO ADMINISTRAÇÃO (BACHARELADO)': 135,
 'FISIOTERAPIA (BACHARELADO)': 133,
 'PEDAGOGIA (LICENCIATURA)': 124,
 'JORNALISMO (BACHARELADO)': 114,
 'EDUCAÇÃO FÍSICA (BACHARELADO)': 101,
 'COMUNICA