In [1]:
import pandas as pd  # For dataframe
import numpy as np  # For matrix operations
import sklearn.preprocessing as sklpre  # For preprocessing (scaling)
import sklearn.linear_model as skllm  # For OLS
import sklearn.model_selection as sklms  # For train_test_split
from scipy import stats  # To calc p-value

# For automatic formatting of code, sparing you from my usually horrible looking code
%load_ext nb_black

<IPython.core.display.Javascript object>

# Task 1.1
I have chosen to one-hot encode the SEX-category, as neither male nor female should be considered adifferent numbers. 
The rest of the categorical values are just true/false, so those aren't encoded. 
Then I scale all the scalar features, not touching the categorical features.

In [2]:
# Reading the data into dataframe
df = pd.read_csv("data_task_1.txt", header=0, sep=" ")
# Onehot-encoding gender
onehot_gender = pd.get_dummies(df["SEX"]).set_axis(
    ["Male", "Female"], axis=1, inplace=False
)
# Replacing old gender column
df = df.join(onehot_gender)
df.drop("SEX", axis=1, inplace=True)
# List of boolean categories
categorical = [
    "ADHEU",
    "HOCHOZON",
    "AMATOP",
    "AVATOP",
    "ADEKZ",
    "ARAUCH",
    "FSNIGHT",
    "FSPT",
    "FSATEM",
    "FSAUGE",
    "FSPFEI",
    "FSHLAUF",
    "Male",
    "Female",
]

# A loop that splits the data and tries again until there is no split where only one modality is in one split
first = True
while (
    first
    or np.any(
        np.logical_or(X_train.sum(axis=0) == 0, X_train.sum(axis=0) == X_train.shape[0])
    )
    or np.any(
        np.logical_or(X_test.sum(axis=0) == 0, X_test.sum(axis=0) == X_test.shape[0])
    )
):
    first = False
    # Splitting over and over until splits are good, stratifying the most biased feature.
    X_train, X_test, y_train, y_test = sklms.train_test_split(
        df.loc[:, df.columns != "FFVC"],
        df["FFVC"],
        test_size=0.5,
        stratify=df["FSATEM"],
    )
# Scaling scalar features based on train set
scaler = sklpre.StandardScaler()
X_train_continous = scaler.fit_transform(
    X_train.loc[:, np.logical_not(np.isin(X_train.columns, categorical))].values
)
X_test_continous = scaler.transform(
    X_test.loc[:, np.logical_not(np.isin(X_test.columns, categorical))].values
)
X_train.loc[
    :, np.logical_not(np.isin(X_train.columns, categorical))
] = X_train_continous
X_test.loc[:, np.logical_not(np.isin(X_test.columns, categorical))] = X_test_continous
# All preprocessing done!

<IPython.core.display.Javascript object>

# Task 1.2
Running OLS, calculating uncertainties and p-values

In [3]:
def get_summary_linear_model(model, X_train, y_train):
    """
    Scikit-learn has no built in support for confidence intervals and p-values, so I 
    made this to calculate it for me after fitting the model. Put into a function for reuse.
    """
    # Combining intercept and coefficients in same array
    coefficients = np.append(model.intercept_, model.coef_)

    # Predicting y
    y_hat = model.predict(X_train)
    # Calculating RSS to get variance for use when calculating stddev of coeffs
    residuals = y_train.values - y_hat
    rss = residuals.reshape(-1, 1).T @ residuals.reshape(-1, 1)
    var = rss[0, 0] / (len(X_train) - len(X_train.columns) - 1)

    # Adding intercept to X_train, as sklearn usually does not need the column of 1's
    X_with_intercept = np.append(
        np.ones(X_train.shape[0]).reshape(-1, 1), X_train, axis=1
    )
    # Stddev of coefficients
    stddev = np.sqrt(
        (np.diag(var * np.linalg.pinv(X_with_intercept.T @ X_with_intercept)))
    )
    labels = ["Intercept"] + X_train.columns.tolist()

    coef_over_std = coefficients / stddev
    p_values = [
        2 * (1 - stats.t.cdf(np.abs(i), (len(X_with_intercept) - 1)))
        for i in coef_over_std
    ]

    # Putting results into table
    coeffs_table = pd.DataFrame(zip(labels, coefficients, stddev, p_values))
    # Giving nice names with TeX formatting
    coeffs_table.rename(
        columns={0: "Feature", 1: r"$\beta_i$", 2: r"$\pm$", 3: "p-values"},
        inplace=True,
    )
    return coeffs_table

<IPython.core.display.Javascript object>

In [4]:
# OLS on train data
ols_reg = skllm.LinearRegression().fit(X_train, y_train)
# R2 score
r2 = ols_reg.score(X_test, y_test)
coeffs_table = get_summary_linear_model(ols_reg, X_train, y_train)
most_important = coeffs_table["Feature"].values[1:][
    np.argmin(coeffs_table["p-values"].values[1:])
]
# Printing results
print(f"Got an R^2 score of {r2:.2f} for the test set.")
print(f"The most important feature (lowest p-value) is {most_important}.")
coeffs_table

Got an R^2 score of 0.65 for the test set.
The most important feature (lowest p-value) is FLGROSS.


Unnamed: 0,Feature,$\beta_i$,$\pm$,p-values
0,Intercept,2.317285,0.029736,0.0
1,ALTER,0.008289,0.017125,0.6288
2,ADHEU,-0.004462,0.06922,0.9486558
3,HOCHOZON,-0.099654,0.04245,0.01968645
4,AMATOP,0.012938,0.034508,0.7080294
5,AVATOP,-0.045576,0.036278,0.210195
6,ADEKZ,-0.013319,0.036899,0.7184402
7,ARAUCH,-0.016967,0.033215,0.6099289
8,AGEBGEW,0.018089,0.015769,0.2524305
9,FSNIGHT,0.029517,0.053945,0.5847592


<IPython.core.display.Javascript object>

The most important (lower p-value) feature seems to be FLGROSS. Some other important features seem to be gender. Male and female seem to completely cancel each other... Overfit maybe?

# Task 1.3
Scikit-learn for some reason doesn't have built in forward and backward selection, so I will create my own functions!

In [141]:
def backward_elimination(regressor, X_train, y_train, max_p_limit):
    regressor.fit(X_train, y_train)
    result_table = get_summary_linear_model(regressor, X_train, y_train)
    p_values = result_table["p-values"].values
    p_val_max, p_val_max_pos = p_values[1:].max(), p_values[1:].argmax() + 1
    feature_max_p_val = result_table["Feature"][p_val_max_pos]
    removed_features = [feature_max_p_val]
    X_reduce = X_train.drop(columns=feature_max_p_val, inplace=False)
    while p_val_max > max_p_limit:
        regressor.fit(X_reduce, y_train)
        result_table = get_summary_linear_model(regressor, X_reduce, y_train)
        p_values = result_table["p-values"].values
        p_val_max, p_val_max_pos = p_values[1:].max(), p_values[1:].argmax() + 1
        feature_max_p_val = result_table["Feature"][p_val_max_pos]
        if p_val_max > max_p_limit:
            removed_features.append(feature_max_p_val)
            X_reduce.drop(columns=feature_max_p_val, inplace=True)
            X_reduce.sort_index(axis=1, inplace=True)
            regressor.fit(X_reduce, y_train)

    return regressor, X_reduce, result_table, removed_features

<IPython.core.display.Javascript object>

In [142]:
base_regressor = skllm.LinearRegression()
(
    regressor_reduced,
    X_reduce_train,
    result_table_reduced,
    removed_features,
) = backward_elimination(base_regressor, X_train, y_train, max_p_limit=1e-2)
X_reduce_test = X_test.drop(columns=removed_features).sort_index(axis=1)
print(
    ols_reg.score(X_test, y_test), regressor_reduced.score(X_reduce_test, y_test),
)
print(X_reduce_train,)

0.6500127981002248 0.6555450391780477
        FLGEW   FLGROSS     FMILB  FSATEM  Female  Male
414  0.492973 -1.327469 -0.412082       0       1     0
123 -0.780541  0.459580 -0.412082       0       1     0
488 -0.780541 -0.805588 -0.412082       1       0     1
294  1.058980  0.443765 -0.412082       0       0     1
26   0.068469 -0.520925 -0.412082       0       1     0
..        ...       ...       ...     ...     ...   ...
398 -0.780541 -1.153509 -0.412082       0       1     0
443 -0.356036  0.507023 -0.412082       0       0     1
28  -0.497538 -0.679071 -0.412082       0       1     0
402 -1.063544 -0.457667 -0.412082       0       0     1
237 -0.922043 -0.884661 -0.412082       0       0     1

[248 rows x 6 columns]


<IPython.core.display.Javascript object>

In [134]:
def forward_selection(regressor, X_train, y_train, max_p_limit):
    X_null = pd.DataFrame({"null": np.zeros_like(y_train)})
    regressor.fit(X_null, y_train)
    result_table = get_summary_linear_model(regressor, X_null, y_train)
    p_val_max = result_table["p-values"][0]
    X_increased = pd.DataFrame()
    features = X_train.columns.values
    while p_val_max < max_p_limit:
        best_p = np.inf
        for feature in features:
            new_col = pd.DataFrame({feature: X_train[feature].values})
            if len(X_increased.values) == 0:
                X_candidate = X_increased.append(new_col)
            else:
                new_col_names = np.append(
                    X_increased.columns.values, new_col.columns.values
                )
                X_candidate = pd.DataFrame(
                    np.append(X_increased.values, new_col.values, axis=1),
                    columns=new_col_names,
                )
            regressor.fit(X_candidate, y_train)
            result_table = get_summary_linear_model(regressor, X_candidate, y_train)
            p_i = result_table["p-values"].values[-1]
            if p_i < best_p:
                best_p = p_i
                best_new_feature = feature
        new_col = pd.DataFrame({best_new_feature: X_train[best_new_feature].values})
        if len(X_increased.values) == 0:
            X_candidate = X_increased.append(new_col)
        else:
            new_col_names = np.append(
                X_increased.columns.values, new_col.columns.values
            )
            X_candidate = pd.DataFrame(
                np.append(X_increased.values, new_col.values, axis=1),
                columns=new_col_names,
            )
        result_table = get_summary_linear_model(regressor, X_candidate, y_train)
        p_val_max = result_table["p-values"].values.max()
        if p_val_max < max_p_limit or True:
            X_increased = X_candidate.sort_index(axis=1)
            features = features[features != best_new_feature]

    omitted_features = features
    regressor.fit(X_increased, y_train)
    return regressor, X_increased, result_table, omitted_features

<IPython.core.display.Javascript object>

In [135]:
base_regressor = skllm.LinearRegression()
(
    regressor_increased,
    X_increased_train,
    result_table_increased,
    omitted_features_increased,
) = forward_selection(base_regressor, X_train, y_train, max_p_limit=1e-2)
X_increased_test = X_test.drop(columns=omitted_features_increased).sort_index(axis=1)
print(regressor_increased.score(X_increased_test, y_test))
print(X_increased_train, X_increased_test)



0.6633110164289205
        FLGEW   FLGROSS  FSATEM  Female  Male
0    0.492973 -1.327469     0.0     1.0   0.0
1   -0.780541  0.459580     0.0     1.0   0.0
2   -0.780541 -0.805588     1.0     0.0   1.0
3    1.058980  0.443765     0.0     0.0   1.0
4    0.068469 -0.520925     0.0     1.0   0.0
..        ...       ...     ...     ...   ...
243 -0.780541 -1.153509     0.0     1.0   0.0
244 -0.356036  0.507023     0.0     0.0   1.0
245 -0.497538 -0.679071     0.0     1.0   0.0
246 -1.063544 -0.457667     0.0     0.0   1.0
247 -0.922043 -0.884661     0.0     0.0   1.0

[248 rows x 5 columns]         FLGEW   FLGROSS  FSATEM  Female  Male
419  1.058980  0.317248       0       1     0
366  3.181504  1.487528       0       0     1
329 -1.063544 -1.627946       0       0     1
404 -1.346548 -1.548874       0       0     1
454 -0.497538 -0.520925       0       1     0
..        ...       ...     ...     ...   ...
42  -0.073033 -0.758144       0       1     0
433  1.058980  0.775872       0      

<IPython.core.display.Javascript object>