In [43]:
import numpy as np
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Getting the data

In [29]:
# Load the raw train and test sets; both files are comma-separated and UTF-8 encoded.
csv_options = {"sep": ",", "encoding": "UTF8"}
df_original_train = pd.read_csv('train.csv', **csv_options)
df_original_test = pd.read_csv('test.csv', **csv_options)

# Exploring the data

In [30]:
# `df_train` has to exist before it is inspected, otherwise this cell raises a
# NameError on a fresh kernel. The exploration only looks at the score columns
# (the nine candidate features plus the target "NU_NOTA_MT").
score_columns = [
    'NU_NOTA_CN',
    'NU_NOTA_CH',
    'NU_NOTA_LC',
    'NU_NOTA_REDACAO',
    'NU_NOTA_COMP1',
    'NU_NOTA_COMP2',
    'NU_NOTA_COMP3',
    'NU_NOTA_COMP4',
    'NU_NOTA_COMP5',
    'NU_NOTA_MT',
]
df_train = df_original_train[score_columns]
df_train.head()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_MT
0,436.3,495.4,581.2,520.0,120.0,120.0,120.0,80.0,80.0,399.4
1,474.5,544.1,599.0,580.0,140.0,120.0,120.0,120.0,80.0,459.8
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df_train.describe()

Unnamed: 0,NU_NOTA_CN,NU_NOTA_CH,NU_NOTA_LC,NU_NOTA_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_MT
count,10341.0,10341.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0,10133.0
mean,473.495155,529.661087,516.472841,529.048258,119.811704,111.915918,106.824238,113.123951,77.372446,482.497928
std,71.093674,73.726344,68.68819,154.294758,29.846017,33.737588,34.341349,33.480033,43.06714,99.826323
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,419.9,480.4,468.1,440.0,120.0,100.0,80.0,100.0,40.0,408.9
50%,459.8,532.0,520.9,540.0,120.0,120.0,120.0,120.0,80.0,461.2
75%,514.5,581.2,564.9,600.0,140.0,120.0,120.0,120.0,100.0,537.6
max,806.4,807.0,763.6,1000.0,200.0,200.0,200.0,200.0,200.0,952.0


In [32]:
# Correlation of every column with the target "NU_NOTA_MT", weakest first.
target_corr = df_train.corr()["NU_NOTA_MT"]
target_corr.sort_values(ascending=True)

NU_NOTA_COMP1      0.299402
NU_NOTA_COMP2      0.335638
NU_NOTA_COMP4      0.342282
NU_NOTA_COMP5      0.343337
NU_NOTA_COMP3      0.350307
NU_NOTA_REDACAO    0.379376
NU_NOTA_LC         0.494695
NU_NOTA_CH         0.529594
NU_NOTA_CN         0.584941
NU_NOTA_MT         1.000000
Name: NU_NOTA_MT, dtype: float64

In [33]:
# Correlation of every column with the target "NU_NOTA_MT", strongest first.
target_corr = df_train.corr()["NU_NOTA_MT"]
target_corr.sort_values(ascending=False)

NU_NOTA_MT         1.000000
NU_NOTA_CN         0.584941
NU_NOTA_CH         0.529594
NU_NOTA_LC         0.494695
NU_NOTA_REDACAO    0.379376
NU_NOTA_COMP3      0.350307
NU_NOTA_COMP5      0.343337
NU_NOTA_COMP4      0.342282
NU_NOTA_COMP2      0.335638
NU_NOTA_COMP1      0.299402
Name: NU_NOTA_MT, dtype: float64

# Training using a Random Forest Regressor

### The features 'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3', 'NU_NOTA_COMP4', 'NU_NOTA_COMP5' were chosen for training because they are the ones most correlated with the target feature "NU_NOTA_MT"

In [39]:
# Model inputs: the four exam-area scores followed by the five essay
# competency scores (NU_NOTA_COMP1 .. NU_NOTA_COMP5), in that order.
area_scores = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO']
competency_scores = ['NU_NOTA_COMP{}'.format(number) for number in range(1, 6)]
features = area_scores + competency_scores

In [40]:
# Start the working frames from the raw data (same objects, no copy is made).
df_train, df_test = df_original_train, df_original_test

### Selecting the rows whose "NOTAS" (scores) aren't null and aren't 0

In [41]:
def _keep_rows_with_valid_scores(df, columns):
    """Return the rows of `df` where every column in `columns` is non-null and non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : list of str
        Columns that must all hold a non-null, non-zero value for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `df`, with all original columns and the original index.
    """
    scores = df[columns]
    # A NaN compares as "not equal to 0", so the null check is still required.
    is_valid = scores.notnull().all(axis=1) & scores.ne(0).all(axis=1)
    return df.loc[is_valid]


# The same rule is applied to both sets so the model is trained and used on
# rows of the same kind.
# NOTE(review): the target "NU_NOTA_MT" is not checked here - confirm it is
# never null on the rows kept for training.
required_scores = ['NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_REDACAO']

df_train = _keep_rows_with_valid_scores(df_train, required_scores)

df_test = _keep_rows_with_valid_scores(df_test, required_scores)

In [44]:
# Feature matrices for both sets; the target exists only in the training data.
x_train = df_train.loc[:, features]
x_test = df_test.loc[:, features]
y_train = df_train.loc[:, 'NU_NOTA_MT']

In [46]:
# Standardize the features. The scaler is fitted on the training data only and
# the same fitted transformation is then applied to the test data.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [47]:
# Fix the seed so the randomness used while building the forest - and therefore
# the predictions - is the same on every run of the notebook.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(x_train, y_train)

# Predicting using the test data

In [53]:
# Predict the target for the (filtered) test rows.
y_pred_test = regressor.predict(x_test)

# Pair each prediction with its "NU_INSCRICAO" identifier and save the result
# without the index column.
test_predictions = df_test[["NU_INSCRICAO"]].assign(NU_NOTA_MT=y_pred_test)
test_predictions.to_csv("answer.csv", index=False)