In [1]:
import pickle

import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import classification_report



In [2]:
# Load the scores and approvals tables from parquet files under ../data.
scores = pd.read_parquet('../data/scores.parquet')
approvals = pd.read_parquet('../data/approvals.parquet')
# Show (rows, columns) of the raw scores table as a quick sanity check.
scores.shape

(7301, 23)

In [3]:
# Drop every row whose course label contains 'JUDICE'.
#  - regex=False: 'JUDICE' is matched as a literal substring.
#  - na=False: a missing course counts as "no match", so the mask stays
#    strictly boolean and can be inverted with `~`.
#  - .copy(): later cells add columns to `scores`; working on an explicit
#    copy avoids pandas' SettingWithCopyWarning on those assignments.
scores = scores[~scores.course.str.contains('JUDICE', regex=False, na=False)].copy()

In [4]:
# Preview the first rows of the filtered table.
# NOTE(review): the rendered preview shows applicant names (`nome`) and
# registration numbers (`numero_inscricao`); clear or redact this output
# before sharing the notebook.
scores.head()

Unnamed: 0,numero_inscricao,nome,escore_bruto_p1_etapa1,escore_bruto_p2_etapa1,nota_redacao_etapa1,escore_bruto_p1_etapa2,escore_bruto_p2_etapa2,nota_redacao_etapa2,escore_bruto_p1_etapa3,escore_bruto_p2_etapa3,...,classificacao_final_cotas_negros,classificacao_final_publicas1,classificacao_final_publicas2,classificacao_final_publicas3,classificacao_final_publicas4,classificacao_final_publicas5,classificacao_final_publicas6,classificacao_final_publicas7,classificacao_final_publicas8,course
0,20199023,Amanda Amorim Luz,5.172,14.653,6.947,3.845,19.994,7.222,4.998,16.66,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
1,20180980,Amanda Larissa Oliveira dos Santos,6.896,48.271,3.8,6.152,54.342,7.200,5.712,46.648,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
2,20188220,Amanda Luisa de Oliveira Souza,0.0,23.704,5.333,0.769,24.303,6.909,2.142,25.465,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
3,20193280,Amanda Mendes Reis de Araujo,0.0,10.342,8.963,2.307,14.866,9.5 86,1.428,13.088,...,,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...
4,20188170,Ana Beatriz Cattermol Cavalcante,0.0,10.917,2.848,0.0,10.764,3.750,6.426,17.373,...,1.0,,,,,,,,,2.1.1 CAMPUS DARCY RIBEIRO – DIURNO ADMINIST...


In [4]:
# Quota ("cotas") ranking columns: every 'classificacao*' column except the
# universal ranking. The universal column is excluded by name rather than by
# position, and the derived '*_flag' columns are skipped, so the result does
# not depend on column order and stays the same if this cell is re-run after
# the flag columns have been added to `scores`.
UNIVERSAL_RANK_COLUMN = 'classificacao_final_universal'
assert UNIVERSAL_RANK_COLUMN in scores.columns, (
    f'expected column {UNIVERSAL_RANK_COLUMN!r} in scores'
)
cotas_columns = [
    col for col in scores.columns
    if 'classificacao' in col
    and col != UNIVERSAL_RANK_COLUMN
    and not col.endswith('_flag')
]
cotas_columns

['classificacao_final_cotas_negros',
 'classificacao_final_publicas1',
 'classificacao_final_publicas2',
 'classificacao_final_publicas3',
 'classificacao_final_publicas4',
 'classificacao_final_publicas5',
 'classificacao_final_publicas6',
 'classificacao_final_publicas7',
 'classificacao_final_publicas8']

In [5]:
# 1 when the row has a value in at least one quota ranking column, else 0.
has_any_quota_rank = scores[cotas_columns].notna().any(axis='columns')
scores['cotista'] = has_any_quota_rank.astype(int)

In [6]:
# One 0/1 indicator per quota ranking column: 1 where that ranking is present.
quota_presence = scores[cotas_columns].notnull().astype(int)
for quota_col in cotas_columns:
    scores[f'{quota_col}_flag'] = quota_presence[quota_col]

In [7]:
# Names of all columns in `scores` whose name contains 'flag'.
flags_columns = [name for name in scores.columns if 'flag' in name]

In [8]:
# Left join: every score row is kept; the `_merge` indicator column records
# whether a matching approvals row was found for its numero_inscricao.
# NOTE(review): if `approvals` can hold several rows per numero_inscricao,
# this join duplicates score rows — confirm the key is unique there.
df = scores.merge(approvals, how='left', on='numero_inscricao', indicator=True)

In [9]:
# Target: 1 when the score row matched an approvals row ('both'), else 0.
df['label'] = (df['_merge'] == 'both').astype(int)

In [10]:
# Base model inputs: for each of the three stages (etapa1..etapa3) the
# columns escore_bruto_p1, escore_bruto_p2 and nota_redacao, in that order,
# followed by 'argumento_final'.
_STAGE_SCORE_PREFIXES = ('escore_bruto_p1', 'escore_bruto_p2', 'nota_redacao')
FEATURES = [
    f'{prefix}_etapa{stage}'
    for stage in (1, 2, 3)
    for prefix in _STAGE_SCORE_PREFIXES
]
FEATURES.append('argumento_final')

In [11]:
def convert_string_to_float(df, colnames):
    """Convert formatted numeric text columns of ``df`` to float, in place.

    For every column in ``colnames`` the text values have spaces and the
    characters ``R`` and ``$`` removed, the decimal comma replaced by a
    decimal point, and are then cast to float.

    Columns that already have a numeric dtype are simply cast to float.
    This makes the function safe to call again on an already converted
    frame (e.g. when the notebook cell is re-run) instead of failing on the
    ``.str`` accessor.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    colnames : iterable of str
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a cleaned value cannot be parsed as a float.
    """
    for colname in colnames:
        column = df[colname]
        if pd.api.types.is_numeric_dtype(column):
            # Already numeric: nothing to clean, just normalise the dtype.
            df[colname] = column.astype(float)
            continue
        cleaned = (
            column.str.replace('[ R$]', '', regex=True)   # spaces, 'R', '$'
                  .str.replace(',', '.', regex=False)      # decimal comma
        )
        df[colname] = cleaned.astype(float)
    return df

In [12]:
# Parse the columns currently listed in FEATURES from text to float.
df = convert_string_to_float(df, FEATURES)

In [13]:
# One-hot encode the course of each row.
course_dummies = pd.get_dummies(df.course)
course_dummies_columns = list(course_dummies.columns)
# Remove dummy columns left over from a previous run of this cell before
# attaching the fresh ones; otherwise a re-run appends them a second time and
# df[FEATURES] later returns repeated columns.
df = pd.concat(
    [df.drop(columns=course_dummies_columns, errors='ignore'), course_dummies],
    axis=1,
)

In [14]:
# Add the course dummies to the model inputs. Only names that are not already
# listed are appended, so re-running this cell does not duplicate features.
FEATURES.extend([col for col in course_dummies_columns if col not in FEATURES])

In [15]:
# Number of model input columns after adding the course dummies.
len(FEATURES)

101

## Baseline Model

In [16]:
# Feature matrix and target, then a seeded hold-out split:
# test_size=0.20 keeps 20% of the rows for testing and 80% for training.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=47
)

In [17]:
# Baseline: random forest with default hyperparameters and a fixed seed,
# fitted directly on the training split (no hyperparameter search here).
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 report as the cell output.
y_pred = model.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.8492773571920165


{'0': {'precision': 0.8618944323933478,
  'recall': 0.9770491803278688,
  'f1-score': 0.9158663081060315,
  'support': 1220},
 '1': {'precision': 0.6,
  'recall': 0.18025751072961374,
  'f1-score': 0.27722772277227725,
  'support': 233},
 'accuracy': 0.8492773571920165,
 'macro avg': {'precision': 0.7309472161966739,
  'recall': 0.5786533455287413,
  'f1-score': 0.5965470154391543,
  'support': 1453},
 'weighted avg': {'precision': 0.8198975963660595,
  'recall': 0.8492773571920165,
  'f1-score': 0.8134555783174804,
  'support': 1453}}

## Baseline Model + cotista

In [18]:
# Add the quota indicator to the model inputs. The guard keeps the cell safe
# to re-run: appending unconditionally would list 'cotista' twice.
if 'cotista' not in FEATURES:
    FEATURES.append('cotista')

In [19]:
# Number of model input columns after adding 'cotista'.
len(FEATURES)

102

In [20]:
# Rebuild the feature matrix with the current FEATURES list and repeat the
# same seeded split: test_size=0.20 keeps 20% for testing, 80% for training.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=47
)

In [21]:
# Same default random forest as the baseline, fitted on the current training
# split (no hyperparameter search here).
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 report as the cell output.
y_pred = model.predict(X_test)
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))
classification_report(y_test, y_pred, output_dict=True)

Accuracy: 0.8582243633860978


{'0': {'precision': 0.868995633187773,
  'recall': 0.978688524590164,
  'f1-score': 0.920585967617579,
  'support': 1220},
 '1': {'precision': 0.6708860759493671,
  'recall': 0.22746781115879827,
  'f1-score': 0.33974358974358976,
  'support': 233},
 'accuracy': 0.8582243633860978,
 'macro avg': {'precision': 0.76994085456857,
  'recall': 0.6030781678744811,
  'f1-score': 0.6301647786805844,
  'support': 1453},
 'weighted avg': {'precision': 0.8372272045321992,
  'recall': 0.8582243633860978,
  'f1-score': 0.8274433151436358,
  'support': 1453}}

## Baseline Model + Flags

In [22]:
# Add the per-quota flag columns to the model inputs. Only names that are not
# already listed are appended, so re-running this cell does not duplicate
# features.
FEATURES.extend([col for col in flags_columns if col not in FEATURES])
len(FEATURES)

111

In [23]:
# Rebuild the feature matrix with the current FEATURES list and repeat the
# same seeded split: test_size=0.20 keeps 20% for testing, 80% for training.
X, y = df[FEATURES], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=47
)

In [None]:
# Same default random forest, fitted on the current training split
# (no hyperparameter search here).
model = RandomForestClassifier(random_state=47).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
y_pred = model.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.8802045288531775,
  'recall': 0.9877049180327869,
  'f1-score': 0.9308613364233295,
  'support': 1220},
 '1': {'precision': 0.8214285714285714,
  'recall': 0.296137339055794,
  'f1-score': 0.4353312302839117,
  'support': 233},
 'accuracy': 0.8768066070199587,
 'macro avg': {'precision': 0.8508165501408744,
  'recall': 0.6419211285442905,
  'f1-score': 0.6830962833536206,
  'support': 1453},
 'weighted avg': {'precision': 0.8707793409110348,
  'recall': 0.8768066070199587,
  'f1-score': 0.8513991790038633,
  'support': 1453}}

## Hyperparameter tuning + class_weight

In [24]:
from RandomForestClassifierGridSearch import RandomForestClassifierGridSearch

In [None]:
# Candidate values explored by the random-forest grid search.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],        # trees in the forest
    max_depth=[5, 10, 20, 40, 60],            # maximum depth of each tree
    min_samples_split=[5, 10, 20, 40, 60],    # samples needed to split an internal node
    min_samples_leaf=[2, 8, 16, 32],          # samples required at a leaf node
)

In [None]:
# Run the grid search on the current training split and report on the held-out split.
# NOTE(review): RandomForestClassifierGridSearch is a project-local helper that is
# not shown in this notebook; what fit/evaluate do is assumed from their names — confirm.

# Build the helper around the training data
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Search over the candidate values in param_grid
rf_gs.fit(param_grid)

# Uses the X_test / y_test produced by the most recent train_test_split cell

# Evaluate on the held-out split and print the returned report
report = rf_gs.evaluate(X_test, y_test)
print("Classification Report:")
print(report)

NameError: name 'RandomForestClassifierGridSearch' is not defined

In [None]:
# Persist the classifier currently bound to `model` (requires `pickle`, which
# must be imported before this cell runs).
# NOTE(review): `model` is the RandomForestClassifier fitted in the
# "Baseline Model + Flags" section, not the grid-search object `rf_gs`, even
# though the file name says "tuned" — confirm this is the object meant to be saved.
with open('model_flags_tuned.pickle', 'wb') as f:
    pickle.dump(model, f)

In [None]:
#with open('model_flags_tuned.pickle','wb') as f:
#  pickle.dump(model, f)

## Stratified KFold + Hyperparameter tuning + Flags

In [25]:
# Candidate values explored by the random-forest grid search in this section.
param_grid = dict(
    n_estimators=[100, 200, 300],             # trees in the forest
    max_depth=[5, 10, 20, 40, 80],            # maximum depth of each tree
    min_samples_split=[5, 10, 20, 50],        # samples needed to split an internal node
    min_samples_leaf=[2, 8, 16, 32],          # samples required at a leaf node
)

In [26]:
# (rows, columns) of the training split that the grid search will receive.
X_train.shape

(5810, 111)

In [27]:
# Run the grid search on the current training split and report on the held-out split.
# NOTE(review): RandomForestClassifierGridSearch is a project-local helper that is
# not shown in this notebook; what fit/evaluate do is assumed from their names — confirm.

# Build the helper around the training data
rf_gs = RandomForestClassifierGridSearch(X_train, y_train)

# Search over the candidate values in param_grid
rf_gs.fit(param_grid)

# Uses the X_test / y_test produced by the most recent train_test_split cell

# Evaluate on the held-out split and print the returned report
report = rf_gs.evaluate(X_test, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1220
           1       0.59      0.59      0.59       233

    accuracy                           0.87      1453
   macro avg       0.76      0.76      0.76      1453
weighted avg       0.87      0.87      0.87      1453



In [31]:
# Persist the tuned model through the helper's own save method.
# NOTE(review): the later load cell opens a file ending in '.pickle.pickle';
# presumably save_model appends '.pickle' to the name given here — confirm.
rf_gs.save_model('stratified_kfold_classweight15_tuned_model.pickle')

In [26]:
import pickle

In [27]:
# Reload a previously saved model from disk.
# NOTE(review): the file name carries a doubled '.pickle' suffix compared with
# the name passed to rf_gs.save_model — confirm it points at the intended file.
# pickle.load can execute arbitrary code contained in the file; only load
# pickles produced by a trusted run of this project.
with open('stratified_kfold_classweight15_tuned_model.pickle.pickle', 'rb') as f:
  loaded_pipe = pickle.load(f)

In [28]:
# Predict on the held-out split with the reloaded model and show the
# per-class precision / recall / F1 report.
y_pred = loaded_pipe.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.9220672682526662,
  'recall': 0.921311475409836,
  'f1-score': 0.921689216892169,
  'support': 1220},
 '1': {'precision': 0.5897435897435898,
  'recall': 0.592274678111588,
  'f1-score': 0.5910064239828693,
  'support': 233},
 'accuracy': 0.868547832071576,
 'macro avg': {'precision': 0.755905428998128,
  'recall': 0.7567930767607121,
  'f1-score': 0.7563478204375191,
  'support': 1453},
 'weighted avg': {'precision': 0.8687765476108115,
  'recall': 0.868547832071576,
  'f1-score': 0.8686616251868237,
  'support': 1453}}

In [31]:
# Predict on the held-out split with the reloaded model and show the report.
# NOTE(review): this cell repeats the previous evaluation cell but its stored
# output differs, so `loaded_pipe` presumably held a different pickle when it
# was executed — re-run from a fresh kernel to confirm which result is current.
y_pred = loaded_pipe.predict(X_test)
classification_report(y_test, y_pred, output_dict=True)

{'0': {'precision': 0.9222042139384117,
  'recall': 0.9327868852459016,
  'f1-score': 0.9274653626731867,
  'support': 1220},
 '1': {'precision': 0.6255707762557078,
  'recall': 0.5879828326180258,
  'f1-score': 0.6061946902654868,
  'support': 233},
 'accuracy': 0.8774948382656572,
 'macro avg': {'precision': 0.7738874950970598,
  'recall': 0.7603848589319637,
  'f1-score': 0.7668300264693367,
  'support': 1453},
 'weighted avg': {'precision': 0.8746367046610064,
  'recall': 0.8774948382656572,
  'f1-score': 0.8759470786601143,
  'support': 1453}}