In [108]:
import pandas as pd
import numpy as np
import sklearn as skl
import sklearn.preprocessing as sklpre
import sklearn.linear_model as skllm
import sklearn.model_selection as sklms
import statsmodels.api as sm
from scipy import stats

# Task 1.1
I have chosen to one-hot encode the SEX-category, as neither male nor female should be considered adifferent numbers. 
The rest of the categorical values are just true/false, so those aren't encoded. 
Then I scale all the scalar features, not touching the categorical features.

In [109]:
# Reading the data into dataframe
df = pd.read_csv("data_task_1.txt", header=0, sep=" ")
# Onehot-encoding gender
onehot_gender = pd.get_dummies(df["SEX"]).set_axis(["Male", "Female"], axis=1, inplace=False)
# Replacing old gender column
df = df.join(onehot_gender)
df.drop("SEX", axis=1, inplace=True)
# List of boolean categories
categorical = ["ADHEU", "HOCHOZON", "AMATOP", "AVATOP", "ADEKZ", "ARAUCH",
               "FSNIGHT", "FSPT", "FSATEM", "FSAUGE", "FSPFEI", "FSHLAUF", "Male", "Female"]

# A loop that splits the data and tries again until there is no split where only one modality is in one split
first = True
while first or np.any(np.logical_or(X_train.sum(axis=0)==0, X_train.sum(axis=0)==X_train.shape[0])) or np.any(np.logical_or(X_test.sum(axis=0)==0, X_test.sum(axis=0)==X_test.shape[0])):
    first = False
    X_train, X_test, y_train, y_test = sklms.train_test_split(df.loc[:, df.columns != "FFVC"],
                                                          df["FFVC"],
                                                          test_size=0.5,
                                                          stratify=df["FSATEM"])
# Scaling scalar features based on train set
scaler = sklpre.StandardScaler()
X_train_continous = scaler.fit_transform(X_train.loc[:, np.logical_not(np.isin(X_train.columns, categorical))].values)
X_test_continous = scaler.transform(X_test.loc[:, np.logical_not(np.isin(X_test.columns, categorical))].values)
X_train.loc[:, np.logical_not(np.isin(X_train.columns, categorical))] = X_train_continous
X_test.loc[:, np.logical_not(np.isin(X_test.columns, categorical))] = X_test_continous
# All preprocessing done!

# Task 1.2
Running OLS, calculating uncertainties and p-values

In [115]:
def get_summary_linear_model(model, X_train, y_train):
    """
    Scikit-learn has no built in support for confidence intervals and p-values, so I 
    made this to calculate it for me after fitting the model. Put into a function for reuse.
    """
    # Combining intercept and coefficients in same array
    coefficients = np.append(model.intercept_, model.coef_)
    
    # Predicting y
    y_hat = model.predict(X_train)
    # Calculating RSS to get variance for use when calculating stddev of coeffs
    residuals = y_train.values - y_hat
    rss = residuals.reshape(-1, 1).T @ residuals.reshape(-1, 1)
    var = rss[0, 0] / (len(X_train) - len(X_train.columns) - 1)
    
    
    # Adding intercept to X_train, as sklearn usually does not need the column of 1's
    X_with_intercept = np.append(np.ones(X_train.shape[0]).reshape(-1,1), X_train, axis=1)
    # Stddev of coefficients
    stddev =  np.sqrt((np.diag(var * np.linalg.pinv(X_with_intercept.T @ X_with_intercept))))
    labels = ["Intercept"] + X_train.columns.tolist()
    
    
    coef_over_std = coefficients / stddev
    p_values =[2*(1-stats.t.cdf(np.abs(i),(len(X_with_intercept)-1))) for i in coef_over_std]
    
    # Putting results into table
    coeffs_table = pd.DataFrame(zip(labels, coefficients, stddev, p_values))
    # Giving nice names with TeX formatting
    coeffs_table.rename(columns={0:"Feature", 1:r"$\beta_i$", 2:r"$\pm$", 3:"p-values"}, inplace=True)
    return coeffs_table


In [116]:
# OLS on train data
ols_reg = skllm.LinearRegression().fit(X_train, y_train)
# R2 score
r2 = ols_reg.score(X_test, y_test)
coeffs_table = get_summary_linear_model(ols_reg, X_train, y_train)
# Printing results
print(f"Got an R^2 score of {r2:.2f} for the test set.")
coeffs_table

Got an R^2 score of 0.63 for the test set.


Unnamed: 0,Feature,$\beta_i$,$\pm$,p-values
0,Intercept,2.344563,0.028057,0.0
1,ALTER,0.008147,0.0162,0.6154766
2,ADHEU,-0.100479,0.062798,0.1108709
3,HOCHOZON,-0.084859,0.038877,0.02999842
4,AMATOP,0.009551,0.034462,0.7818985
5,AVATOP,-0.000305,0.033494,0.9927477
6,ADEKZ,-0.027338,0.035423,0.4409851
7,ARAUCH,0.001231,0.030792,0.9681437
8,AGEBGEW,0.006275,0.014706,0.6699498
9,FSNIGHT,0.026442,0.053006,0.6183284


In [31]:
intercept_col = pd.DataFrame(np.ones_like(y_train.values), columns=["Intercept"])
X_train_intercept = sm.add_constant(X_train, prepend=True) #intercept_col.join(X_train)
mod = sm.OLS(y_train, X_train_intercept).fit()
values_of_interest = ["Coef.", "Std.Err.", "P>|t|"]
table = mod.summary2().tables[1]
table = table.loc[:, np.isin(table.columns, values_of_interest)]
table

Unnamed: 0,Coef.,Std.Err.,P>|t|
const,1.569932,0.027724,2.501024e-134
ALTER,-0.006871,0.015986,0.6677619
ADHEU,-0.005608,0.06174,0.9277135
HOCHOZON,-0.068222,0.03944,0.0850561
AMATOP,-0.034179,0.034739,0.3262413
AVATOP,0.006831,0.03351,0.8386527
ADEKZ,-0.00661,0.037981,0.8620043
ARAUCH,-0.018108,0.031185,0.5620545
AGEBGEW,0.006056,0.014471,0.6760061
FSNIGHT,-0.004301,0.050362,0.9320261
