In [None]:
# --------------------------------------------------------------
# Dependencies
# --------------------------------------------------------------

import pandas as pd
import pylab as pl
import numpy as np
import scipy.optimize as opt
import statsmodels.api as sm

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns

import itertools

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler




In [None]:
# --------------------------------------------------------------
# Import Dataset
# --------------------------------------------------------------

# Directory that holds the dataset CSV files. Kept in one place so the
# location can be changed without editing every read_csv call.
DATA_DIR = "./uci-open-university-learning-analytics-dataset"

# Student-level table; joined on id_student in the merge cell.
studentInfo = pd.read_csv(f"{DATA_DIR}/studentInfo.csv")

# Assessment metadata; joined on id_assessment in the merge cell.
assessments = pd.read_csv(f"{DATA_DIR}/assessments.csv")

# Student submissions; links id_student to id_assessment.
studentAssessment = pd.read_csv(f"{DATA_DIR}/studentAssessment.csv")

# Platform interaction log; its sum_click column is aggregated later.
studentVle = pd.read_csv(f"{DATA_DIR}/studentVle.csv")



In [None]:
# --------------------------------------------------------------
# Join the tables
# --------------------------------------------------------------

# One enrolment = one student in one module presentation. Joining on
# id_student alone would pair every enrolment of a student with the
# assessments and clicks of ALL of that student's modules, attaching the
# wrong final_result to those rows.
enrolment_keys = ["id_student", "code_module", "code_presentation"]

# Attach the assessment details to every submission. `assessments` keeps
# code_module / code_presentation (and is no longer mutated in place, so the
# cell can be re-run) because they are needed to match each submission to
# the enrolment it belongs to.
assessment_details = studentAssessment.merge(assessments, on="id_assessment", how="left")
print("\nstudentAssessment\n")
print(studentAssessment.dtypes)
print("\nassessments\n")
print(assessments.dtypes)

# Join the submissions onto the enrolments. The shared key columns are merged
# rather than duplicated, so no *_x / *_y columns appear.
dfs = studentInfo.merge(assessment_details, on=enrolment_keys, how="left")

# Total platform clicks per enrolment.
# NOTE(review): assumes studentVle carries code_module and code_presentation,
# as in the published OULAD schema - confirm against the dtypes printed below.
clicks_per_enrolment = (
    studentVle.groupby(enrolment_keys)["sum_click"].sum().reset_index()
)
dfs = dfs.merge(clicks_per_enrolment, on=enrolment_keys, how="left")

print("\nstudentVle\n")
print(studentVle.dtypes)

# Drop identifier columns that are not used as model features.
dfs = dfs.drop(columns=["id_student", "id_assessment", "code_presentation"])


# Open question about final_result: it has 4 values (Fail, Withdrawn, Pass
# and Distinction). Should withdrawals be dropped and Distinction merged
# into Pass?

In [None]:
# Structure of the merged frame: column dtypes and the first rows.
print("\ndfs\n")
print(dfs.dtypes)
print(dfs.head())

# Number of distinct values per column, then transposed summary statistics.
print("\nnunique\n")
print(dfs.nunique())
print(dfs.describe().T)

# Missing values per column: absolute counts first, then the share of all
# rows formatted as a percentage with two decimals.
null_counts = dfs.isnull().sum()
null_share = null_counts / len(dfs) * 100
print("\nisnull\n")
print(null_counts)
print(null_share.map("{:.2f}%".format))

# Overall size of the frame as (rows, columns).
print("\nshape\n")
print(dfs.shape)

In [None]:
# --------------------------------------------------------------
# Feature identification
# --------------------------------------------------------------
# Columns handled as categorical (the target `final_result` is included so
# that it is cleaned and label-encoded together with the features).
categorical_cols = [
    "code_module",
    "gender",
    "region",
    "highest_education",
    "imd_band",
    "age_band",
    "disability",
    "assessment_type",
    "final_result",
    "is_banked",
]
# Columns handled as numeric.
numeric_cols = [
    "date_submitted",
    "num_of_prev_attempts",
    "sum_click",
    "date",
    "studied_credits",
    "weight",
    "score",
]

In [None]:
# --------------------------------------------------------------
# Data set clean-up
# --------------------------------------------------------------

# Treat the '?' placeholder as a missing value.
dfs = dfs.replace('?', np.nan)

# --------------------------------------------------------------
# numeric_cols
# --------------------------------------------------------------

# Force the numeric columns to a numeric dtype; anything that cannot be
# parsed becomes NaN.
dfs[numeric_cols] = dfs[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Impute the remaining gaps with each column's own mean.
column_means = dfs[numeric_cols].mean()
dfs[numeric_cols] = dfs[numeric_cols].fillna(column_means)

# --------------------------------------------------------------
# categorical_cols
# --------------------------------------------------------------

# Rows missing any categorical value are discarded.
dfs = dfs.dropna(subset=categorical_cols)

# Withdrawn students are excluded from the analysis.
dfs = dfs[dfs['final_result'] != 'Withdrawn']

# 'Distinction' is folded into 'Pass', leaving a two-class target.
dfs = dfs.assign(final_result=dfs['final_result'].replace('Distinction', 'Pass'))



In [None]:
# --------------------------------------------------------------
# Standardise the numeric features for the SVM
# --------------------------------------------------------------

# Learn the scaling parameters from the numeric columns, then apply them to
# the same columns.
# NOTE(review): the scaler is fitted on the whole frame, before the
# train/test split further down, so test rows influence the scaling
# statistics - consider fitting on the training rows only.
scaler = StandardScaler().fit(dfs[numeric_cols])
dfs[numeric_cols] = scaler.transform(dfs[numeric_cols])



In [None]:
# --------------------------------------------------------------
# Encode the categorical features for the SVM
# --------------------------------------------------------------

# Label-encode every categorical column (the target included). The fitted
# encoder of each column is kept in `label_encoders` so the integer codes can
# be mapped back to the original labels later, e.g.
# label_encoders["final_result"].inverse_transform(...).
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    dfs[col] = encoder.fit_transform(dfs[col])
    label_encoders[col] = encoder

print(dfs.dtypes)
print(dfs.head())


In [None]:
# --------------------------------------------------------------
# Split feature subsets
# --------------------------------------------------------------

# Target vector: the encoded final result.
y = dfs["final_result"]
# Feature matrix: every remaining column.
X = dfs.drop("final_result", axis=1)



In [None]:
# --------------------------------------------------------------
# Create a training and test set
# --------------------------------------------------------------

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(X_train.dtypes)
# Report the shape of the feature matrix and target vector of each subset.
for label, features, target in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(label, features.shape, target.shape)


In [None]:
# --------------------------------------------------------------
# Train the SVM base model
# --------------------------------------------------------------

# probability=True enables predict_proba, whose outputs are used to train the
# neural network. Those probability estimates are fitted with randomised
# internal shuffling, so the seed is pinned (same value as the train/test
# split) to make the results reproducible across runs.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)

In [None]:
# --------------------------------------------------------------
# Train the Random Forest base model
# --------------------------------------------------------------

# The forest draws random bootstrap samples and feature subsets; pin the seed
# (same value as the train/test split) so repeated runs build the same model.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)


In [None]:
# --------------------------------------------------------------
# Base model predictions
# --------------------------------------------------------------

# Column 1 of predict_proba is the probability of the class encoded as 1.
# NOTE(review): these are in-sample predictions of models already fitted on
# X_train, so they are likely over-confident - consider out-of-fold
# predictions for the stacking features.
svm_preds_train = svm.predict_proba(X_train)[:, 1]
rf_preds_train = rf.predict_proba(X_train)[:, 1]

# The same probabilities for the held-out rows. They are required to build
# X_test_stack; without them that step fails with a NameError.
svm_preds_test = svm.predict_proba(X_test)[:, 1]
rf_preds_test = rf.predict_proba(X_test)[:, 1]


In [None]:
def _as_stack_frame(svm_probs, rf_probs):
    """Bundle the two base-model probability vectors into a two-column frame."""
    return pd.DataFrame({'svm_pred': svm_probs, 'rf_pred': rf_probs})


# Use the base-model predictions as the new features of the meta-model.
# NOTE(review): svm_preds_test and rf_preds_test are read here and must have
# been computed before this cell runs.
X_train_stack = _as_stack_frame(svm_preds_train, rf_preds_train)
X_test_stack = _as_stack_frame(svm_preds_test, rf_preds_test)


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# Meta-model: a small fully connected network fed with the stacked base-model
# predictions (one input per column of X_train_stack).
n_stack_features = X_train_stack.shape[1]
model = Sequential([
    Dense(32, input_dim=n_stack_features, activation='relu'),  # first layer, receives the inputs
    Dense(16, activation='relu'),                              # hidden layer
    Dense(1, activation='sigmoid'),                            # single sigmoid unit for binary classification
])

# Binary cross-entropy loss, Adam optimizer, accuracy tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs in mini-batches of 32 rows. The test stack is passed as
# validation data, so the per-epoch validation metrics are measured on it.
model.fit(
    X_train_stack,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test_stack, y_test),
)


In [None]:
# Score the neural network on the stacked test features.
test_metrics = model.evaluate(X_test_stack, y_test)
loss, accuracy = test_metrics
print(f'Acurácia da rede neural: {accuracy:.4f}')

# Predicted probabilities from the network, thresholded at 0.5:
# above the threshold -> class 1, otherwise class 0.
nn_preds = model.predict(X_test_stack)
nn_preds_class = np.greater(nn_preds, 0.5).astype(int)


In [None]:
# --------------------------------------------------------------
# Perform forward feature selection using simple decision tree
# --------------------------------------------------------------

In [None]:
# --------------------------------------------------------------
# Grid search for best hyperparameters and model selection
# --------------------------------------------------------------

In [None]:
# --------------------------------------------------------------
# Create a grouped bar plot to compare the results
# --------------------------------------------------------------

In [None]:
# --------------------------------------------------------------
# Select best model and evaluate results
# --------------------------------------------------------------

In [None]:
# --------------------------------------------------------------
# Select train and test data based on participant
# --------------------------------------------------------------

In [None]:

# --------------------------------------------------------------
# Use best model again and evaluate results
# --------------------------------------------------------------



In [None]:

# --------------------------------------------------------------
# Try a simpler model with the selected features
# --------------------------------------------------------------

SyntaxError: invalid syntax (1408881651.py, line 1)