# SIMCE Data - Exploratory Analysis

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

In [5]:
# The raw CSV is expected in ../data/RECODE relative to the working directory.
# NOTE(review): the preview shows an "Unnamed: 0" column, i.e. the file's saved
# row index is read as data; consider `index_col=0` if that column is not needed.
input_path = Path(".").resolve().parent.joinpath("data", "RECODE")
simce_raw = pd.read_csv(input_path.joinpath("SIMCE.csv"))
simce_raw.head()

Unnamed: 0,No,RUT,ID,gender,school,type,grade,spanish,math,lengths
0,1,35,91791,1,3260,1,4,342.74,338.86,3
1,2,35,104035,1,3260,1,8,327.92,303.94,3
2,3,35,146037,1,3247,1,10,317.38,372.51,3
3,4,238,217457,2,24822,1,4,284.22,248.03,2
4,5,238,153564,2,25280,2,8,256.94,268.46,2


## Model

$$
Y = XBZ + E
$$

donde

* $Y \in \mathbb{R}^{n \times p}$
* $X \in \mathbb{R}^{n \times m}$
* $B \in \mathbb{R}^{m \times q}$
* $Z \in \mathbb{R}^{q \times p}$
* $E \in \mathbb{R}^{n \times p}$

En particular

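* $n$ es el número de estudiantes
* $p$ es la cantidad de pruebas (una por grado evaluado)
* $m$ es la cantidad de tipos de escuela
* $q$ es la cantidad de coeficientes de la curva de crecimiento (intercepto y pendiente respecto al grado)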

## Duplicated: students with inconsistent `type` or `gender` across records

In [11]:
# Records of students (RUT) that appear under more than one school type.
simce_type_dup = (
    simce_raw.loc[
        lambda df: df.groupby("RUT")["type"].transform("nunique").gt(1)
    ]
    .sort_values("RUT")
)
simce_type_dup.head()

Unnamed: 0,No,RUT,ID,gender,school,type,grade,spanish,math,lengths
3,4,238,217457,2,24822,1,4,284.22,248.03,2
4,5,238,153564,2,25280,2,8,256.94,268.46,2
5,6,239,230510,2,25882,2,4,298.3,301.98,3
6,7,239,162215,2,25882,2,8,263.12,256.04,3
7,8,239,173092,2,8926,1,10,322.4,324.65,3


In [12]:
# Records of students (RUT) that appear with more than one gender code.
simce_gender_dup = (
    simce_raw.loc[
        lambda df: df.groupby("RUT")["gender"].transform("nunique").gt(1)
    ]
    .sort_values("RUT")
)
simce_gender_dup.head()

Unnamed: 0,No,RUT,ID,gender,school,type,grade,spanish,math,lengths
573,574,22118,194886,1,5654,2,4,297.17,268.49,3
574,575,22118,60771,1,20074,2,8,304.56,294.14,3
575,576,22118,65198,2,20074,2,10,326.06,327.14,3
747,748,28930,144163,1,293,1,4,132.83,162.48,3
748,749,28930,22592,2,288,1,8,280.26,316.8,3


In [40]:
# Keep students (RUT) recorded under a single school type and in three
# distinct grades, then reshape to one row per (type, RUT) with the math score
# of each grade as a column. pivot_table uses its default aggregation (mean)
# if a student has several rows for the same grade.
# NOTE: consistency of `gender` across a student's records is not required here.
simce = (
    simce_raw.groupby("RUT")
    .filter(
        lambda rows: rows["type"].nunique() == 1
        and rows["grade"].nunique() == 3
    )
    .loc[:, ["RUT", "type", "grade", "math"]]
    .rename(columns={"math": "score"})
    .pivot_table(index=["type", "RUT"], columns="grade", values="score")
    .sort_index()
)

simce.head()

Unnamed: 0_level_0,grade,4,8,10
type,RUT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,35,338.86,303.94,372.51
1,1144,248.64,259.76,210.17
1,2010,210.26,227.33,130.5
1,3392,202.01,210.38,161.9
1,3951,260.91,239.0,277.89


In [46]:
# Work on a sample: keep the first 100 rows of each school type, in the
# current row order of `simce`. This overwrites `simce`.
simce = simce.groupby(level="type").head(100)

In [47]:
# Response matrix: one row per student, one column per grade (n x p).
Y = simce.to_numpy(dtype=float)
# Between-student design: one indicator column per school type (n x m).
# dtype=float is explicit because get_dummies otherwise returns uint8 or bool
# indicators (depending on the pandas version); X.T @ X would then be computed
# in that dtype and silently wrap around or collapse to logical values instead
# of producing the group counts.
X = pd.get_dummies(
    simce.index.get_level_values("type"), dtype=float
).to_numpy()
# Within-student design: an intercept row and a row with the grade values
# taken from the column labels (q x p with q = 2).
Z = np.vstack([np.ones(simce.shape[1]), simce.columns.to_numpy(dtype=float)])

## ML Estimation

In [52]:
def gmanova_mle(Y, X, Z):
    """Maximum-likelihood estimates for the growth-curve model ``Y = X B Z + E``.

    Parameters
    ----------
    Y : array_like, shape (n, p)
        Response matrix.
    X : array_like, shape (n, m)
        Between-individual design matrix; ``X.T @ X`` must be invertible.
    Z : array_like, shape (q, p)
        Within-individual design matrix; ``Z S^{-1} Z.T`` must be invertible.

    Returns
    -------
    B : ndarray, shape (m, q)
        ``(X'X)^{-1} X' Y S^{-1} Z' (Z S^{-1} Z')^{-1}`` where
        ``S = Y' (I - X (X'X)^{-1} X') Y``.
    Sigma : ndarray, shape (p, p)
        ``(Y - X B Z)' (Y - X B Z) / n``.

    Raises
    ------
    ValueError
        If an input is not 2-D or the shapes are inconsistent.
    numpy.linalg.LinAlgError
        If ``X'X``, ``S`` or ``Z S^{-1} Z'`` is singular.
    """
    # Work in floating point: integer or boolean designs (e.g. dummy
    # indicators) would otherwise make X.T @ X overflow or act as logical ops.
    Y = np.asarray(Y, dtype=float)
    X = np.asarray(X, dtype=float)
    Z = np.asarray(Z, dtype=float)
    if Y.ndim != 2 or X.ndim != 2 or Z.ndim != 2:
        raise ValueError("Y, X and Z must be 2-D arrays")
    n = Y.shape[0]
    if X.shape[0] != n:
        raise ValueError("X and Y must have the same number of rows")
    if Z.shape[1] != Y.shape[1]:
        raise ValueError("Z and Y must have the same number of columns")

    # (X'X)^{-1} X'Y via a linear solve instead of an explicit inverse.
    ols_coef = np.linalg.solve(X.T @ X, X.T @ Y)
    # S = Y'(I - H)Y equals R'R with R = (I - H)Y because I - H is a symmetric
    # projection; this avoids materialising the n x n matrices I and H.
    ols_resid = Y - X @ ols_coef
    S = ols_resid.T @ ols_resid
    # S^{-1} Z' (p x q) and Z S^{-1} Z' (q x q).
    Sinv_Zt = np.linalg.solve(S, Z.T)
    ZSinvZt = Z @ Sinv_Zt
    # B = ols_coef S^{-1} Z' (Z S^{-1} Z')^{-1}, solved from the transposed system.
    B = np.linalg.solve(ZSinvZt.T, (ols_coef @ Sinv_Zt).T).T

    res = Y - X @ B @ Z
    Sigma = (res.T @ res) / n
    return B, Sigma

# Fit the model on the design matrices built above; B is the coefficient
# estimate and Sigma the covariance estimate returned by gmanova_mle.
B, Sigma = gmanova_mle(Y, X, Z)

In [54]:
# Coefficient estimates: one row per column of X, one column per row of Z.
B

array([[240.57432626,   0.87613166],
       [254.07771261,   2.6271382 ],
       [275.90268959,   6.54407287]])

In [55]:
# Covariance estimate: residual cross-product of the fitted model divided by n.
Sigma

array([[2207.01685006, 1365.12641335, 1589.10497768],
       [1365.12641335, 1890.73959598, 1524.82523439],
       [1589.10497768, 1524.82523439, 3177.00016305]])