# Student performance (Kaggle) (Moritz)
- https://www.kaggle.com/c/184702-tu-ml-ws-18-student-performance
- small samples (train = 198), medium dimension (32)
- attribute characteristics: numeric, categorical
- Predict: Grade
- Result file cols: id, Grade
- Missing values: No

## with preprocessing
- scale (fit the scaler on the train data, then scale both train and test data)
    - _SVR:_ very long runtime without scaling
- merge train and test data
- one hot encode (+ drop first columns) categorical data

### Linear Regression
- with preprocessing
- with all samples:
    - <1 s
    - RMSE: 3.56123
    - Kaggle: 4.75835
    
### SVR
- with preprocessing
- with all samples:
    - few seconds
    - C: 0.2, kernel: linear, epsilon: 0.5, gamma: auto 
    - RMSE: 4.26364
    - Kaggle: 4.51673
    
### Gradient Boosted Decision Tree
- with preprocessing
- with all samples:
    - few minutes
    - {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 150}
    - RMSE: 3.8868
    - Kaggle: __4.24893__

In [5]:
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import mean_squared_error
%run './base.ipynb'

In [6]:
# Import and preprocessing

# Load the train split and key every row by ('train', id) so the split can be
# selected by label again after train and test have been concatenated.
train = (
    pd.read_csv('./data/student_performance_kaggle/StudentPerformance.shuf.train.csv')
    .assign(_index='train')
    .set_index(['_index', 'id'])
)
# Keep the target as a one-column frame, then remove it from the features.
train_target = train[['Grade']]
train = train.drop(columns=['Grade'])

# Load the test split and key it by ('test', id) in the same way.
test = (
    pd.read_csv('./data/student_performance_kaggle/StudentPerformance.shuf.test.csv')
    .assign(_index='test')
    .set_index(['_index', 'id'])
)

# Scale both splits with the scale_data helper (defined outside this cell).
train_s, test_s = scale_data(train, test)

# Stack the scaled splits for further preprocessing.
data_s = pd.concat([train_s, test_s])

# One-hot encode the stacked data with the one_hot helper (drop_first=True).
data_oh = one_hot(data_s, drop_first=True)

#display(data_oh)

# Select each split again through the outer '_index' level.
X_train = data_oh.loc['train']
y_train = train_target
X_test = data_oh.loc['test']

display(X_train.shape)

(198, 39)

In [8]:
# Linear Regression
# NOTE(review): the training data is passed both as the fit data and as the
# evaluation data, so the scores printed by linear_reg are presumably
# in-sample -- confirm against the helper's definition in base.ipynb.
reg = linear_reg(X_train, y_train, X_train, y_train)
result = pd.DataFrame(reg.predict(X_test), columns=['Grade'])

# Put the test ids next to the predictions (both frames carry a fresh
# 0..n-1 index, so the column-wise concat lines the rows up by position).
result = pd.concat([X_test.reset_index()[['id']], result], axis='columns')

# Save result
# Use a timestamp without spaces or colons: str(datetime.now()) contains ':'
# which is not allowed in Windows file names and awkward in shells.
timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f'lr_{timestamp}.csv'

result.to_csv('./predictions/student_performance_kaggle/' + filename, sep=",", index=False)
print(f'Saved as {filename}')

#display(result)

R^2 value for model: 0.40842998603547453
Predict:
RMSE: 3.56123
R^2 Score: 0.40842998603547453
Saved as lr_2019-01-02 11:45:14.195347.csv


In [9]:
# SVR
# Hyper-parameter grid handed to the run_svr helper.
# NOTE(review): `np` is not imported in this notebook's import cell;
# presumably it is provided by `%run './base.ipynb'` -- confirm.
param_grid = {
    'C': np.linspace(.2,1,5),
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'], # poly very slow
    'epsilon': np.linspace(0,.5,6),
    'gamma': ['auto', 'scale']
}

# run grid search (the target is flattened to 1-D for the helper)
gs = run_svr(X_train, y_train.values.ravel(), cv=5, param_grid=param_grid)

# predict with the best estimator found by the search
result = pd.DataFrame(gs.best_estimator_.predict(X_test), columns=['Grade'])

# join id col
result = pd.concat([X_test.reset_index()[['id']], result], axis='columns')
#display(result)

# Create SVR filename from the selected hyper-parameters.
# Use a timestamp without spaces or colons: str(datetime.now()) contains ':'
# which is not allowed in Windows file names and awkward in shells.
best = gs.best_estimator_
timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = (
    f'svr_'
    f'C-{best.C}_'
    f'k-{best.kernel}_'
    f'e-{best.epsilon}_'
    f'g-{best.gamma}_'
    f'{timestamp}.csv'
)

result.to_csv('./predictions/student_performance_kaggle/' + filename, sep=",", index=False)
print(f'Saved as {filename}')

GridSearch initializing...
SVR model in training...
MSE: 18.1786, RMSE: 4.26364, C: 0.2, kernel: linear, epsilon: 0.5, gamma: auto 
Saved as svr_C-0.2_k-linear_e-0.5_g-auto_2019-01-02 11:46:28.043171.csv


In [10]:
# Gradient Boosted Decision Tree
# Parameters held fixed during the search.
# NOTE: newer scikit-learn releases renamed loss 'ls' to 'squared_error';
# adjust this value if the installed version rejects 'ls'.
param_fix = {
    'learning_rate': .01, 
    'loss': 'ls'
}

# Parameters explored by the grid search.
param_grid = {
    'n_estimators': (50, 100, 150, 200, 300, 400, 500), 
    'max_depth': (1, 2, 3, 4, 5), 
    'min_samples_split': (2, 3, 5)
}

# The two empty lists fill the helper's third and fourth positional arguments
# (presumably hold-out data, unused here -- confirm in base.ipynb).
gs = run_boosted_tree(X_train, y_train.values.ravel(), [], [], param_fix=param_fix, cv=10, param_grid=param_grid)

# Disabled diagnostics; they reference names (gbt, test_data, test_target)
# that are not defined in the cells shown in this notebook.
#plot_scores(gbt.cv_results_)
#plot_training_deviance(gbt, test_data, test_target)

# predict with the best estimator found by the search
result = pd.DataFrame(gs.best_estimator_.predict(X_test), columns=['Grade'])

# join id col
result = pd.concat([X_test.reset_index()[['id']], result], axis='columns')
#display(result)

# Create gradient boosted tree filename from the selected hyper-parameters.
# Use a timestamp without spaces or colons: str(datetime.now()) contains ':'
# which is not allowed in Windows file names and awkward in shells.
best = gs.best_estimator_
timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = (
    f'gbdtree_'
    f'ne-{best.n_estimators}_'
    f'md-{best.max_depth}_'
    f'mss-{best.min_samples_split}_'
    f'{timestamp}.csv'
)

result.to_csv('./predictions/student_performance_kaggle/' + filename, sep=",", index=False)
print(f'Saved as {filename}')

GridSearch initializing...
GradientBoostedRegressor model in training...
GradientBoostedRegressor model selected and fitted in 82.193 s

MSE: 15.10722, RMSE: 3.8868
Best parameters selected by GridSearch: {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 150}


Unnamed: 0,id,Grade
0,312,8.296581
1,164,9.936007
2,245,11.190844
3,303,10.496202
4,260,11.994454
5,376,11.385717
6,86,10.943066
7,375,9.891644
8,367,11.292694
9,6,11.491068


Saved as gbdtree_ne-150_md-5_mss-5_2019-01-02 11:48:46.282577.csv


In [137]:
# Unused
# Get a feeling for the dataset

# Report whether the train DataFrame holds any missing values
has_missing = train.isnull().values.any()
print('NaNs!' if has_missing else 'Nons!')

Nons!
