# Setup

In [None]:
from google.colab import drive

# Mount Google Drive under /content/drive so the source files stored there can be read
# (this cell relies on google.colab, so it only works inside a Colab runtime)
drive.mount('/content/drive')

In [None]:
import os

# Root folder of the input data on the mounted Drive; every dataset is read from a sub-folder of it
dataPath = "/content/drive/MyDrive/Data"
# List the entries of the data folder (also a quick check that the path is reachable)
os.listdir(dataPath)

# Load Data

In [None]:
import pandas as pd

# Sub-folder of dataPath that holds the files of each dataset
datasetNames = [
    "2425Q1-DBDP1",
    "Y",
    "Z",
]
# Kind of log counted in the <counter>LogsPer<time>.csv files
counterNames = [
    "login",
    "video",
    "slide",
]
# Time granularity of the <counter>LogsPer<time>.csv files
timeNames = [
    "Weekday",
    "Hour",
]

## Counter per time

In [None]:
# Load <counter>LogsPer<time>.csv for every dataset / counter / time combination and tag each
# frame with the combination it came from, so the frames can be concatenated afterwards.
sourcesCounters = []

# Iterate over the name lists themselves (instead of hard-coded range(3)/range(2)) so the loop
# keeps working when a dataset, counter or time granularity is added or removed.
for datasetName in datasetNames:
  for counterName in counterNames:
    for timeName in timeNames:
      csvPath = dataPath+"/"+datasetName+"/"+counterName+"LogsPer"+timeName+".csv"
      counterFrame = (
          pd.read_csv(csvPath, index_col=0)
            .assign(Dataset=datasetName, Counter=counterName, Time=timeName)
      )
      sourcesCounters.append(counterFrame)
      # Show the frame that was just loaded; no index arithmetic over the list is needed
      display(counterFrame)

In [None]:
# Stack the per-file counter frames into a single frame with a fresh 0..n-1 index.
# "Hour" is cast to the nullable Int16 dtype so rows without an hour can stay missing.
dfCounters = (
    pd.concat(sourcesCounters)
      .reset_index(drop=True)
      .astype({"Hour": "Int16"})
)

## Indicators per user

In [None]:
from sklearn.preprocessing import minmax_scale

# Column used as prediction target in each dataset (same order as datasetNames)
targetNames = ["ExamMark", "ExercisesMark", "ExercisesMark"]
# Marks greater than or equal to this value (on the raw, unscaled mark) are labelled "Pass"
passMark = 5
sources = []

for datasetName, targetName in zip(datasetNames, targetNames):
  # Read the per-user indicators, tag the dataset, rename the chosen mark to "Target" and
  # drop the user id plus whichever other mark columns are present.
  source = (
      pd.read_csv(dataPath+"/"+datasetName+"/pd_AnyTime.csv")
        .assign(Dataset=datasetName)
        .rename({targetName: "Target"}, axis=1, errors='raise')
        .drop(["UsernameID","ExamMark","ExercisesMark","TestMark"], axis=1, errors='ignore')
  )
  sources.append(source)

  # Every remaining column except the two categorical ones. The list is taken before "Label"
  # is added, so it contains "Target" but never "Label". Later cells reuse this variable.
  continuousFeatures = source.columns.tolist()
  continuousFeatures.remove("Gender")
  continuousFeatures.remove("Dataset")

  # Pass/Fail label from the raw mark. Assigning the final strings directly avoids writing
  # strings into a boolean column, which is a dtype-incompatible setitem in pandas.
  source["Label"] = (source["Target"] >= passMark).map({True: "Pass", False: "Fail"})

  # Descriptive statistics of AvgViewedVideoRepetitions before scaling. Local names are used so
  # that the builtins min() and max() are not shadowed for the rest of the notebook.
  print("---------"+datasetName)
  viewedRepetitions = source["AvgViewedVideoRepetitions"]
  display(viewedRepetitions)
  minValue = viewedRepetitions.min()
  meanValue = viewedRepetitions.mean()
  maxValue = viewedRepetitions.max()
  stdValue = viewedRepetitions.std()
  medianValue = viewedRepetitions.median()
  print("Promedio: "+str(meanValue))
  print("Desviación estandard: "+str(stdValue))
  print("Min: "+str(minValue))
  print("Mediana: "+str(medianValue))
  print("Max: "+str(maxValue))

  # Rescale every continuous column (including "Target") to [0, 1] within this dataset
  for c in continuousFeatures:
    source[c] = minmax_scale(source[c], feature_range=(0,1), axis=0)
  display(source)

In [None]:
# Concatenate all dataframes.
# ignore_index=True gives the combined frame a unique 0..n-1 index (as dfCounters already has);
# otherwise every dataset contributes its own 0..k-1 labels and any index-based alignment or
# merge on df pairs rows that belong to different students.
df = pd.concat(sources, ignore_index=True)

In [None]:
# Round every continuous column to one decimal (dfRounded feeds the boxplots further down).
# The mapping gets its own name: calling it `dict` would shadow the builtin for the whole session.
roundingDecimals = {feature: 1 for feature in continuousFeatures}
dfRounded = df.round(roundingDecimals)
dfRounded.describe()

In [None]:
# Create the list of predictors: every column except the target, its label and the two
# categorical descriptors.
# The list is built in column order rather than through a set difference, because the iteration
# order of a set of strings changes between kernel sessions. Anything that depends on the order
# of `predictors` (e.g. hand-written feature captions) needs it to be stable.
nonPredictors = {'Target','Label','Gender','Dataset'}
predictors = [column for column in df.columns if column not in nonPredictors]

# Visualizations

In [None]:
# Folder where the charts of this notebook are saved (relative to the working directory)
chartsPath = "Charts/AllTogether"
# Create the folder when it is missing: os.listdir and plt.savefig both fail otherwise,
# which breaks a run on a fresh runtime
os.makedirs(chartsPath, exist_ok=True)
# Show the charts already present in the folder
os.listdir(chartsPath)

## Univariate analysis

In [None]:
# Summary statistics of the combined (per-dataset scaled) frame
df.describe()

In [None]:
# Rows per gender in each dataset (counts the non-null BeforeExamSlidePercent values)
for datasetName, source in zip(datasetNames, sources):
    print(datasetName)
    genderCounts = source.groupby("Gender").agg(
        Counter=pd.NamedAgg(column="BeforeExamSlidePercent", aggfunc="count")
    )
    display(genderCounts)

In [None]:
# Show the concatenated counters frame (all datasets, counters and time granularities)
display(dfCounters)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Spanish y-axis caption per time granularity; any other granularity gets no caption
ylabelPerTime = {"Weekday": "Día de la semana", "Hour": "Hora del día"}

# One horizontal bar chart per (time granularity, counter) pair, summed over all datasets
for timeName in timeNames:
  for counterName in counterNames:
    print(timeName)
    print(counterName)
    isSelected = (dfCounters["Time"]==timeName) & (dfCounters["Counter"]==counterName)
    ts = (
        dfCounters[isSelected]
          .groupby([timeName])
          .agg(Counter=pd.NamedAgg(column="count", aggfunc="sum"))
    )
    fig = plt.figure()
    ax = sns.barplot(data=ts, x="Counter", y=timeName, orient="h")
    ax.set(ylabel=ylabelPerTime.get(timeName))
    ax.set(xlabel=None)
    # The PDF is written before the x label and title are set, so the saved chart has neither
    plt.savefig(chartsPath+"/Counters-"+timeName+"_"+counterName+".pdf", format="pdf", bbox_inches='tight')
    ax.set(xlabel=counterName)
    plt.title(counterName+" x "+timeName)
    plt.show()

In [None]:
# Check whether the Target marks of the second and third datasets (datasetNames[1] and
# datasetNames[2], the two DBD partials per the original note) are statistically different,
# using an independent two-sample t-test.
# NOTE(review): by this point "Target" appears to have been min-max scaled per dataset in the
# loading cell, so the test compares normalised marks rather than raw ones. Confirm this is intended.
from scipy.stats import ttest_ind

ttest_ind((sources[1])['Target'], (sources[2])['Target'])

In [None]:
# Plot the access to the slides
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

slideFeatures=["BeforeSessionSlidePercent","BeforeExamMinusWeekSlidePercent","BeforeExamSlidePercent"]
# Legend caption of each feature, in the same order as slideFeatures
slideCaptions=["Antes de la sesión","Al menos una semana antes del examen","En cualquier momento antes del examen"]

slidekde=[]
for feature, caption in zip(slideFeatures, slideCaptions):
  print(feature)
  print("Promedio: "+str(df[feature].mean()))
  print("Desviación estandar: "+str(df[feature].std()))
  print("Mediana: "+str(df[feature].median()))
  # Long format: a single "Value" column plus the caption that tells the curves apart
  slidekde.append(
      df[[feature]].rename(columns={feature:"Value"}).assign(Discriminant=caption)
      )
slidesToKDE = pd.concat(slidekde).reset_index(drop=True)

# displot is a figure-level function that opens its own figure; calling plt.figure() first
# would only leave an extra, empty figure behind.
g=sns.displot(data=slidesToKDE, x="Value", kind="kde", hue="Discriminant", height=2.5, aspect=2 )
plt.xlim(0,1)
plt.ylim(0,1.45)
g.set(ylabel="Función de densidad")
g.set(xlabel="Porcentaje de transparencias")
plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
sns.move_legend(g, loc="upper center", title=None)

# Give each curve its own line style so the chart can also be read without colour
lss = [':', '--', '-']
for ax in g.axes.flat:
    for line, ls in zip(ax.lines, lss):
        line.set_linestyle(ls)
# Apply the same styles to the legend entries, which otherwise stay solid. seaborn draws the
# KDE curves in reverse hue order, so the legend handles are walked backwards to match them.
# NOTE(review): verify the legend/curve pairing on the rendered chart after a seaborn upgrade.
for handle, ls in zip(reversed(g.legend.get_lines()), lss):
    handle.set_linestyle(ls)

plt.savefig(chartsPath+"/KDE-Slides.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
# Plot the access to the videos
import seaborn as sns
import matplotlib.pyplot as plt

# Imported here as well so the cell does not depend on another cell having imported it
from matplotlib.ticker import PercentFormatter

videoFeatures=["BeforeSessionVideoPercent","BeforeExamMinusWeekVideoPercent","BeforeExamVideoPercent"]
# Legend caption of each feature, in the same order as videoFeatures
videoCaptions=["Antes de la sesión","Al menos una semana antes del examen","En cualquier momento antes del examen"]

videokde=[]
for feature, caption in zip(videoFeatures, videoCaptions):
  print(feature)
  print("Promedio: "+str(df[feature].mean()))
  print("Desviación estandar: "+str(df[feature].std()))
  print("Mediana: "+str(df[feature].median()))
  # Long format: a single "Value" column plus the caption that tells the curves apart
  videokde.append(
      df[[feature]].rename(columns={feature:"Value"}).assign(Discriminant=caption)
      )
videosToKDE = pd.concat(videokde).reset_index(drop=True)

# displot is a figure-level function that opens its own figure; calling plt.figure() first
# would only leave an extra, empty figure behind.
g=sns.displot(data=videosToKDE, x="Value", kind="kde", hue="Discriminant", height=2.5, aspect=2)
plt.xlim(0,1)
plt.ylim(0,1.45)
g.set(ylabel="Función de densidad")
g.set(xlabel="Porcentaje de videos")
plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
sns.move_legend(g, loc="upper center", title=None)

# Give each curve its own line style so the chart can also be read without colour
lss = [':', '--', '-']
for ax in g.axes.flat:
    for line, ls in zip(ax.lines, lss):
        line.set_linestyle(ls)
# Apply the same styles to the legend entries, which otherwise stay solid. seaborn draws the
# KDE curves in reverse hue order, so the legend handles are walked backwards to match them.
# NOTE(review): verify the legend/curve pairing on the rendered chart after a seaborn upgrade.
for handle, ls in zip(reversed(g.legend.get_lines()), lss):
    handle.set_linestyle(ls)

plt.savefig(chartsPath+"/KDE-Videos.pdf", format="pdf", bbox_inches='tight')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# One 10-bin histogram per continuous column of the combined frame, saved as Hist-<feature>.pdf
for feature in continuousFeatures:
  media_weight = df[feature].mean()
  std_weight = df[feature].std()
  mediana_weight = df[feature].median()
  print("Promedio: "+str(media_weight))
  print("Desviación estandard: "+str(std_weight))
  print("Mediana: "+str(mediana_weight))

  # displot is figure-level: it opens its own figure and returns a FacetGrid, so no
  # plt.figure() is created beforehand (that would leave one empty figure per feature).
  g=sns.displot(df[feature].values, bins=10, height=2, aspect=2)
  g.set(ylabel="Número de estudiantes")
  g.set(xlabel="Número de accesos normalizados y discretizados")
  plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
  # The repetition averages are not percentages, so their x ticks are hidden
  if (feature in ["AvgSlideRepetitions", "AvgVideoRepetitions", "AvgViewedVideoRepetitions"]):
    g.set(xticklabels=[])
    g.tick_params(bottom=False)
  plt.xlim(0,1)
  plt.ylim(0,65)
  # The PDF is written before the title is set, so the saved chart carries no title
  plt.savefig(chartsPath+"/Hist-"+feature+".pdf", format="pdf", bbox_inches='tight')
  plt.title(feature)
  plt.show()

### Other possible visualizations

## Bivariate analysis

In [None]:
# Correlation matrices of the numeric columns: one per gender and one for the whole frame
genderSubsets = [
    ("Males", df[df["Gender"]=="M"]),
    ("Females", df[df["Gender"]=="F"]),
    ("Both", df),
]
for caption, subset in genderSubsets:
  print(caption)
  display(subset.corr(numeric_only=True))

In [None]:
# Predictors singled out for the scatter plots, the regressions and the PCA below
promisingPredictors = [
    "BeforeSessionVideoPercent",
    "BeforeExamVideoPercent",
    "BeforeExamMinusWeekVideoPercent",
    "AvgVideoRepetitions",
    "BeforeSessionSlidePercent",
]
# Predictors singled out as gender-distinct
genderDistinctPredictors = [
    "BeforeExamSlidePercent",
    "BeforeExamMinusWeekSlidePercent",
    "AvgSlideRepetitions",
]

### Scatterplots

In [None]:
import pandas as pd
import seaborn as sns

# Pairwise scatter plots of the promising predictors, coloured by Pass/Fail.
# The columns are selected directly instead of merging df with itself on the index: when the
# index holds repeated labels (each dataset contributes its own row numbers), an index merge
# pairs every row with every other row sharing its label, multiplying points and mixing labels.
sns.pairplot(
    df[promisingPredictors + ["Label"]],
    hue="Label"
    )

In [None]:
from sklearn.linear_model import LinearRegression
from matplotlib.ticker import PercentFormatter

# For each promising predictor: fit a one-variable linear regression on Target, print its
# parameters and save a scatter plot with the regression line as Scatter-<predictor>.pdf
for f in promisingPredictors:
  # Create a Linear Regression model
  model = LinearRegression()

  # Fit the model on the whole combined frame (there is no train/test split here)
  model.fit(df[[f]], df["Target"])

  # Print the intercept and coefficient
  print(f"Intercept: {model.intercept_}")
  print(f"Coefficient: {model.coef_}")

  sns.lmplot(data=df, x=f, y="Target", fit_reg=True)

  # Caption the x axis after the kind of resource the predictor measures; a fixed
  # "Porcentaje de vídeos" would mislabel the slide predictors in the saved PDF.
  if "Slide" in f:
    xCaption = "Porcentaje de transparencias"
  elif "Video" in f:
    xCaption = "Porcentaje de vídeos"
  else:
    xCaption = f

  plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
  plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
  plt.ylabel("Nota del examen normalizada")
  plt.xlabel(xCaption)
  # The PDF is written with the Spanish caption; the on-screen version then shows the
  # column name and a title instead
  plt.savefig(chartsPath+"/Scatter-"+f+".pdf", format="pdf", bbox_inches='tight')
  plt.xlabel(f)
  plt.title("Scatter Plot of Data with Regression Line")
  plt.show()

In [None]:
from matplotlib.ticker import PercentFormatter

# Scatter plot of every predictor against Target with one regression line per gender,
# saved as ScatterGender-<predictor>_AnyTime.pdf
for f in predictors: #genderDistinctPredictors:
  sns.lmplot(data=df, x=f, y="Target", fit_reg=True, legend=False, hue="Gender", markers=['o','*'])

  # Caption the x axis after the kind of resource the predictor measures. The fixed
  # "Porcentaje de transparencias" only fitted the slide predictors this loop used to cover;
  # predictors that are neither slide nor video based fall back to their column name.
  if "Slide" in f:
    xCaption = "Porcentaje de transparencias"
  elif "Video" in f:
    xCaption = "Porcentaje de vídeos"
  else:
    xCaption = f

  plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
  plt.gca().xaxis.set_major_formatter(PercentFormatter(1))
  plt.ylabel("Nota del examen normalizada")
  plt.xlabel(xCaption)
  # The PDF is written with the Spanish caption; the on-screen version then shows the
  # column name and a title instead
  plt.savefig(chartsPath+"/ScatterGender-"+f+"_AnyTime.pdf", format="pdf", bbox_inches='tight')
  plt.xlabel(f)
  plt.title("Scatter Plot of Data with Regression Line")
  plt.show()

### Boxplots

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Box plot of Target for each one-decimal bin of every predictor, saved as
# BoxPlot-ExamMark_x_<predictor>.pdf
for feature in predictors:
  # AvgViewedVideoRepetitions is skipped because it has NULL values; if those were removed
  # beforehand this guard would not be needed
  if feature=="AvgViewedVideoRepetitions":
    continue
  ax = sns.boxplot(x=dfRounded[feature], y=df["Target"])
  ax.set(ylabel="Nota del examen normalizada", xlabel=None)
  ax.yaxis.set_major_formatter(PercentFormatter(1))
  # NOTE(review): PercentFormatter(10) turns the box position k into k*10 %, which presumably
  # assumes every bin 0.0, 0.1, ..., 1.0 is present in the data — confirm for each predictor.
  ax.xaxis.set_major_formatter(PercentFormatter(10))
  # The PDF is written before the x label is set, so the saved chart has no x label
  plt.savefig(chartsPath+"/BoxPlot-ExamMark_x_"+feature+".pdf", format="pdf", bbox_inches='tight')
  ax.set(xlabel=feature)
  plt.show()

### PCA

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition

# Standardise the promising predictors and project them onto two principal components
dfOnlyLog = df[promisingPredictors]
dfOnlyLogStandardized = pd.DataFrame(StandardScaler().fit_transform(dfOnlyLog))
# Rows with any missing value are left out of both the fit and the projection
completeRows = dfOnlyLogStandardized.dropna(how='any')
pca = decomposition.PCA(n_components=2).fit(completeRows)
dfProj = pca.transform(completeRows)
# Share of variance explained by each component (in percent) and the component loadings;
# the column names are printed last so the loadings can be matched to the predictors
print("Valores propios: ", 100*pca.explained_variance_ratio_)
print("Vectores propios: ", pca.components_)
print(dfOnlyLog.columns)

In [None]:
import pandas as pd
import seaborn as sns

# Projection on the two principal components, one row per observation kept by the PCA cell
dfpcas = pd.DataFrame(dfProj,columns=['x1', 'x2'])

# The PCA cell leaves out rows with a missing promising predictor, so take the labels of exactly
# the rows that were kept. Pairing dfProj with the first len(dfProj) labels would colour every
# point after a dropped row with another student's label. (StandardScaler keeps missing values
# missing, so the rows that survive its dropna are the ones selected here.)
# A length mismatch raises instead of silently mislabelling.
hasAllPredictors = df[promisingPredictors].notna().all(axis=1).to_numpy()
dfpcasLabelled = dfpcas.assign(Label=df.loc[hasAllPredictors, "Label"].to_numpy())

fig = plt.figure(figsize=(8,8))
sns.scatterplot(x="x1", y="x2", hue="Label", data=dfpcasLabelled)
plt.show()

# Prediction

## Classification

In [None]:
from sklearn import tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score

# Depth-2 decision tree that predicts Pass/Fail from all predictors.
# random_state is fixed because scikit-learn permutes the features at every split, so ties
# between equally good splits could otherwise produce a different tree on each run.
clf = tree.DecisionTreeClassifier(max_depth=2, random_state=0)
clf = clf.fit(df[predictors], df["Label"])
print(clf.classes_)

plt.figure(figsize=(20,20))
# Plain lists are passed: some scikit-learn releases reject an Index / ndarray for these arguments
ax = plot_tree(clf, feature_names=list(predictors), class_names=[str(c) for c in clf.classes_])
#plt.savefig(chartsPath+"/DecisionTree-Classifier.pdf", format="pdf", bbox_inches='tight')
plt.show()

# Predict once and reuse the result for every metric.
# Note that these are resubstitution metrics: the tree is evaluated on the data it was fitted on.
predictedLabels = clf.predict(df[predictors])
print("Confusion matrix: "+str(confusion_matrix(df["Label"], predictedLabels)))
print("Accuracy: "+str(accuracy_score(df["Label"], predictedLabels)))
print("Balanced accuracy: "+str(balanced_accuracy_score(df["Label"], predictedLabels)))

In [None]:
# Print the predictor columns in the order the classifier was fitted with
print(df[predictors].columns)
# Spanish captions of the two classes (presumably in the order Fail, Pass — verify against clf.classes_)
resultados=[
    "Suspenso",
    "Aprobado",
]
# One caption per predictor, used as feature names when the tree is exported.
# NOTE(review): the entries must follow exactly the column order printed above; re-check the
# list whenever that order changes, otherwise the exported tree shows the wrong feature names.
indicadores=[
    'Total de vídeos',
    'Vídeos antes del examen',
    'BeforeExamMinusWeekSlidePercent',
    'BeforeSessionVideoPercent',
    'BeforeExamSlidePercent',
    'DuringSessionsLoginCounter',
    'AvgSlideRepetitions',
    'Transparencias antes de la sesión',
    'Vídeos al menos una semana antes del examen',
    'BeforeExamLoginCounter',
    'Repetición de accesos a vídeos',
]

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted classifier as DOT source (out_file=None returns it as a string instead of
# writing a file). `indicadores` supplies the feature captions and `resultados` the class captions;
# both are positional, so they must follow the order of the fitted columns / classes.
dot_data = export_graphviz(clf, out_file=None,
                           feature_names=indicadores, label='all',
                           class_names=resultados,
                           filled=True, rounded=True,
                           special_characters=True)

# Draw graph: render the DOT source to a file under chartsPath
# (output format is graphviz's default — presumably PDF; confirm)
graph = graphviz.Source(dot_data)
graph.render(chartsPath+"/DecisionTree-Classifier_AnyTime")

## Regression

In [None]:
# Import the necessary modules and libraries
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

X = df[predictors]

# Fit regression model: a depth-2 tree that predicts the (scaled) Target from all predictors.
# random_state is fixed because scikit-learn permutes the features at every split, so ties
# between equally good splits could otherwise produce a different tree on each run.
regr = DecisionTreeRegressor(max_depth=2, random_state=0)
regr.fit(X, df["Target"])

plt.figure(figsize=(14,10))
# A plain list is passed: some scikit-learn releases reject an Index for feature_names
plot_tree(regr, filled=True, feature_names=list(X.columns))
#plt.savefig(chartsPath+"/DecisionTree-Regresor.pdf", format="pdf", bbox_inches='tight')
# NOTE(review): the title says "ExamMark", but Target is the exercises mark for two of the
# three datasets (see targetNames) — confirm the wording.
plt.title(label="ExamMark")
plt.show()

# Resubstitution error: the tree is evaluated on the data it was fitted on
print("MSE: "+str(mean_squared_error(df["Target"], regr.predict(X))))

In [None]:
# Print the predictor columns in the order the regressor was fitted with
print(X.columns)
# One caption per predictor, used as feature names when the tree is exported.
# NOTE(review): the entries must follow exactly the column order printed above; re-check the
# list whenever that order changes, otherwise the exported tree shows the wrong feature names.
indicadores=[
    "Repeticiones de vídeos",
    "Vídeos al menos una semana antes del examen",
    'BeforeSessionVideoPercent',
    'BeforeExamMinusWeekSlidePercent',
    'DuringSessionsLoginCounter',
    'Accesos al sistema durante el periodo de examenes',
    'AvgSlideRepetitions',
    "Transparencias antes de la sesión",
    'Total de vídeos',
    'BeforeExamSlidePercent',
    'BeforeExamVideoPercent',
]

In [None]:
from sklearn.tree import export_graphviz
import graphviz

# Export the fitted regressor as DOT source (out_file=None returns it as a string instead of
# writing a file). `indicadores` supplies the feature captions; they are positional, so they
# must follow the order of the columns of X.
dot_data = export_graphviz(regr, out_file=None,
                           feature_names=indicadores,
                           filled=True, rounded=True,
                           special_characters=True)

# Draw graph: render the DOT source to a file under chartsPath
# (output format is graphviz's default — presumably PDF; confirm)
graph = graphviz.Source(dot_data)
graph.render(chartsPath+"/DecisionTree-Regressor_AnyTime")