In [11]:
import keras

In [96]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [119]:
# Load the semicolon-delimited student records into the working frame
df = pd.read_csv(filepath_or_buffer="../data/academic-success.csv", delimiter=";")

In [120]:
# Preview the first five records as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [121]:
# Average 'Admission grade' within each attendance group.
# The grouping column's label ends with a literal tab character in this file,
# hence the "\t" in the key.
attendance_groups = df.groupby('Daytime/evening attendance\t')
mean_admission_grades = attendance_groups['Admission grade'].mean()

# Show the per-group averages
print(mean_admission_grades)

Daytime/evening attendance\t
0    126.648447
1    127.018523
Name: Admission grade, dtype: float64


In [122]:
# List every column label of the loaded frame
df.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd

In [123]:
# Report the shape before any cleaning
print("Dimensions of original data:", df.shape)

# Keep only the rows that contain no missing values
df_clean = df.dropna(axis=0, how="any")

# Report the shape after cleaning so the two can be compared
print("Dimensions of modified data:", df_clean.shape)

Dimensions of original data: (4424, 37)
Dimensions of modified data: (4424, 37)


In [124]:
# Show the dtype pandas inferred for each column
df.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance\t                        int64
Previous qualification                              int64
Previous qualification (grade)                    float64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Admission grade                                   float64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender        

In [116]:
# Print the full, untruncated list of column labels
print(list(df.columns))

['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Previous qualification (grade)', 'Nacionality', "Mother's qualification", "Father's qualification", "Mother's occupation", "Father's occupation", 'Admission grade', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'Age at enrollment', 'International', 'Curricular units 1st sem (credited)', 'Curricular units 1st sem (enrolled)', 'Curricular units 1st sem (evaluations)', 'Curricular units 1st sem (approved)', 'Curricular units 1st sem (grade)', 'Curricular units 1st sem (without evaluations)', 'Curricular units 2nd sem (credited)', 'Curricular units 2nd sem (enrolled)', 'Curricular units 2nd sem (evaluations)', 'Curricular units 2nd sem (approved)', 'Curricular units 2nd sem (grade)', 'Curricular units 2nd sem (without evaluations)', 'Unemployment rate', 'Inflation rate', 'GDP', 'Target']


In [125]:
# Columns used as regressors for the admission-grade model
predictors = [
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Previous qualification (grade)",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
]

# Work from the frame with missing rows dropped (df_clean) rather than the raw
# df, so the models fitted later never receive NaN values.
X_raw = df_clean[predictors]
y_raw = df_clean["Admission grade"]

# A constant column has zero standard deviation and would become NaN/inf when
# scaled; fail early with a clear message instead.
assert (X_raw.std() > 0).all(), "a predictor is constant and cannot be standardized"
assert y_raw.std() > 0, "the response is constant and cannot be standardized"

# Standardize predictors and response to zero mean and unit (sample) standard
# deviation; X_raw / y_raw stay untouched so this cell can be re-run safely.
X = (X_raw - X_raw.mean()) / X_raw.std()
y = (y_raw - y_raw.mean()) / y_raw.std()

In [87]:
# Inspect the current feature matrix.
# NOTE(review): the saved output of this cell lists columns such as
# 'Marital status' and 'Target' that are not in `predictors`, so it appears to
# come from an earlier definition of X - re-run to confirm.
X

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,-0.804750,1,19,12,...,-0.28241,-2.838016,-2.042399,-1.471361,-1.963267,-0.199418,-0.287606,0.124372,0.765674,Dropout
1,1,15,1,9254,1,1,2.076585,1,1,3,...,-0.28241,-0.105714,-0.522623,0.518845,0.659487,-0.199418,0.876123,-1.105097,0.347160,Graduate
2,1,1,5,9070,1,1,-0.804750,1,37,37,...,-0.28241,-0.105714,-2.042399,-1.471361,-1.963267,-0.199418,-0.287606,0.124372,0.765674,Dropout
3,1,17,2,9773,1,1,-0.804750,1,38,37,...,-0.28241,-0.105714,0.490561,0.187144,0.416403,-0.199418,-0.813161,-1.466705,-1.375356,Graduate
4,2,39,1,8014,0,1,-2.472892,1,37,38,...,-0.28241,-0.105714,-0.522623,0.518845,0.531548,-0.199418,0.876123,-1.105097,0.347160,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,-0.577276,1,1,1,...,-0.28241,-0.105714,-0.016031,0.187144,0.467578,-0.199418,1.476757,1.136876,-1.789464,Graduate
4420,1,1,2,9773,1,1,-0.956399,105,1,1,...,-0.28241,-0.105714,-0.522623,-0.807959,0.147730,-0.199418,-0.174987,-0.454201,0.889026,Dropout
4421,1,1,1,9500,1,1,1.621637,1,37,37,...,-0.28241,0.805053,0.237265,-1.139660,0.627502,-0.199418,0.876123,-1.105097,0.347160,Dropout
4422,1,1,1,9147,1,1,3.593077,1,37,37,...,-0.28241,-0.561098,-0.522623,0.187144,0.339639,-0.199418,-0.813161,-1.466705,-1.375356,Graduate


In [91]:
# reference from https://www.science.smith.edu/~jcrouser/SDS293/labs/lab8-py.html

import statsmodels.api as sm
import matplotlib.pyplot as plt

def processSubset(feature_set):
    """Fit an OLS model of ``y`` on a subset of ``X`` and report its RSS.

    Reads the module-level ``X`` (feature frame) and ``y`` (response).
    No intercept column is added here; the design matrix is exactly the
    selected columns of ``X``.

    Parameters
    ----------
    feature_set : iterable of str
        Column labels of ``X`` to use as regressors.

    Returns
    -------
    dict
        ``{"model": <fitted results object>, "RSS": <residual sum of squares>}``.
    """
    design = X[list(feature_set)]
    fitted = sm.OLS(y, design).fit()

    # Residual sum of squares on the same rows the model was fitted on
    residuals = fitted.predict(design) - y
    rss = (residuals ** 2).sum()

    return {"model": fitted, "RSS": rss}

In [92]:
import time
import itertools

def getBest(k):
    """Exhaustively fit every ``k``-column subset of ``X`` and return the best.

    Each combination of ``k`` columns of the module-level ``X`` is handed to
    ``processSubset``; the candidate with the smallest RSS wins. A one-line
    timing summary is printed.

    Parameters
    ----------
    k : int
        Number of predictors per candidate model. Must satisfy
        ``1 <= k <= len(X.columns)``.

    Returns
    -------
    pandas.Series
        The winning row, with entries ``"model"`` and ``"RSS"``.

    Raises
    ------
    ValueError
        If ``k`` is outside the valid range (previously this surfaced as an
        opaque ``KeyError``/fitting error from an empty candidate table).
    """
    # Validate before touching any data so bad input fails fast and clearly.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    n_columns = len(X.columns)
    if k > n_columns:
        raise ValueError(f"k must be at most {n_columns} (number of columns in X), got {k}")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    tic = time.perf_counter()

    results = [processSubset(combo) for combo in itertools.combinations(X.columns, k)]

    # Wrap everything up in a dataframe
    models = pd.DataFrame(results)

    # argmin() yields a *position*, so select with the positional indexer
    # rather than the label-based .loc.
    best_model = models.iloc[models['RSS'].argmin()]

    toc = time.perf_counter()
    print("Processed", models.shape[0], "models on", k, "predictors in", (toc-tic), "seconds.")

    # Return the best model, along with its RSS
    return best_model

In [93]:
# Table of winners: one row per subset size, indexed by that size
models_best = pd.DataFrame(columns=["RSS", "model"])

tic = time.time()
for n_predictors in range(1, 8):
    # Best-subset search for models with exactly n_predictors regressors
    models_best.loc[n_predictors] = getBest(n_predictors)
toc = time.time()

elapsed = toc - tic
print("Total elapsed time:", elapsed, "seconds.")

Processed 18 models on 1 predictors in 0.015819072723388672 seconds.
Processed 153 models on 2 predictors in 0.15841007232666016 seconds.
Processed 816 models on 3 predictors in 1.177778959274292 seconds.
Processed 3060 models on 4 predictors in 6.418569087982178 seconds.
Processed 8568 models on 5 predictors in 19.5941379070282 seconds.
Processed 18564 models on 6 predictors in 52.46624302864075 seconds.
Processed 31824 models on 7 predictors in 101.21685719490051 seconds.
Total elapsed time: 184.17635869979858 seconds.


In [94]:
# Show the best model and its RSS for each subset size.
# NOTE(review): the saved output reports the same RSS for every subset size,
# which is suspicious - possibly produced with a stale X/y; re-run to confirm.
models_best

Unnamed: 0,RSS,model
1,71330110.918047,<statsmodels.regression.linear_model.Regressio...
2,71330110.918047,<statsmodels.regression.linear_model.Regressio...
3,71330110.918047,<statsmodels.regression.linear_model.Regressio...
4,71330110.918047,<statsmodels.regression.linear_model.Regressio...
5,71330110.918047,<statsmodels.regression.linear_model.Regressio...
6,71330110.918047,<statsmodels.regression.linear_model.Regressio...
7,71330110.918047,<statsmodels.regression.linear_model.Regressio...


In [11]:
# Regression
# X: International, scholarship, 1st sem grade, 2nd sem grade
# Y: Admission grade

In [None]:
# Separate the outcome label from the explanatory columns.
# NOTE(review): this rebinds the module-level X and y that the regression
# cells read; run top to bottom, those cells would receive the string-valued
# 'Target' as y. Confirm this is intended before executing this cell.

# Outcome column
y = df["Target"]

# Every remaining column is treated as a feature
X = df.drop(columns="Target")

# Columns earmarked for normalization (no scaling is applied in this cell)
numeric_features = [
    "Age at enrollment",
    "Curricular units 1st sem (credited)",
    "Curricular units 1st sem (enrolled)",
    "Curricular units 1st sem (evaluations)",
    "Curricular units 1st sem (approved)",
    "Previous qualification (grade)",
    "Admission grade",
    "Curricular units 1st sem (grade)",
    "Curricular units 1st sem (without evaluations)",
    "Curricular units 2nd sem (credited)",
    "Curricular units 2nd sem (enrolled)",
    "Curricular units 2nd sem (evaluations)",
    "Curricular units 2nd sem (approved)",
    "Curricular units 2nd sem (grade)",
    "Curricular units 2nd sem (without evaluations)",
    "Unemployment rate",
    "Inflation rate",
    "GDP",
]

In [126]:

# Hold out 10% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Fit ordinary least squares on the training portion
model = LinearRegression().fit(X_train, Y_train)

# Predict the held-out rows and score them
Y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)

# Report the fitted weights followed by the two metrics
for label, value in (
    ("Model Coefficients:", model.coef_),
    ("Mean Squared Error:", mse),
    ("R2 Score:", r2),
):
    print(label, value)

Model Coefficients: [ 5.03302453e-02  1.71321799e-01 -1.97798432e-01 -8.22874136e-02
  5.21213744e-05  5.73426764e-01  8.56074986e-02  4.65196331e-02
  3.10050643e-03  7.61797022e-02  7.76483120e-03 -1.44953405e-02
  1.70997111e-02]
Mean Squared Error: 0.675002441301662
R2 Score: 0.28073160132931463


In [13]:
from sklearn.linear_model import ElasticNet

# Elastic net with an even L1/L2 penalty mix, fitted on the training split
model = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, Y_train)

# Predictions for the held-out rows
Y_pred = model.predict(X_test)

# Held-out error and goodness of fit
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
r2 = r2_score(y_true=Y_test, y_pred=Y_pred)

# Report the (possibly sparse) weights and the metrics
print("Model Coefficients:", model.coef_)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)

Model Coefficients: [0.         0.12616467 0.09303571]
Mean Squared Error: 208.54823237985565
R2 Score: 0.0021087259172655193


In [14]:
# One summary record per mixing ratio is accumulated here
results = []

# Sweep l1_ratio over 0.1, 0.2, ..., 1.0 while alpha stays fixed at 1.0
for l1_ratio in (step / 10 for step in range(1, 11)):
    model = ElasticNet(alpha=1.0, l1_ratio=l1_ratio).fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    # Score this setting on the held-out rows
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    results.append(dict(l1_ratio=l1_ratio, coef=model.coef_, MSE=mse, R2=r2))

results

[{'l1_ratio': 0.1,
  'coef': array([0.        , 0.13515968, 0.09917302]),
  'MSE': 208.5716595832465,
  'R2': 0.0019966280990625362},
 {'l1_ratio': 0.2,
  'coef': array([0.        , 0.1329523 , 0.09762381]),
  'MSE': 208.56491678643454,
  'R2': 0.0020288919932456295},
 {'l1_ratio': 0.3,
  'coef': array([0.        , 0.1307199 , 0.09608251]),
  'MSE': 208.5587636547821,
  'R2': 0.002058334373620574},
 {'l1_ratio': 0.4,
  'coef': array([0.        , 0.12845201, 0.09455717]),
  'MSE': 208.55318930405247,
  'R2': 0.00208500732063277},
 {'l1_ratio': 0.5,
  'coef': array([0.        , 0.12616467, 0.09303571]),
  'MSE': 208.54823237985565,
  'R2': 0.0021087259172655193},
 {'l1_ratio': 0.6,
  'coef': array([0.        , 0.12384838, 0.09152545]),
  'MSE': 208.54388382362862,
  'R2': 0.002129533508220871},
 {'l1_ratio': 0.7,
  'coef': array([0.        , 0.12149239, 0.09003482]),
  'MSE': 208.54013362910075,
  'R2': 0.0021474779729271365},
 {'l1_ratio': 0.8,
  'coef': array([0.        , 0.11911237, 0