In [63]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from numpy import ndarray
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [24]:
class RegressioModel:
    def __init__(self):
        self.coefs = None

    @property
    def coefs(self):
        return self._coefs

    @coefs.setter
    def coefs(self, value):
        self._coefs = value

    def fit(self, data: ndarray, target: ndarray) -> None:
        """
        Calculate parameters for linear regression.
        """

        Q, R = np.linalg.qr(data)
        self._coefs = np.linalg.solve(
           R, np.dot(Q.T, target)
        )
    
    def predict(self, data: ndarray) -> ndarray:
        """
        Calculate the output for every line array in data input. 
        """
        
        return np.array([
            np.dot(x, self._coefs) for x in data
        ])

In [55]:
def rsme(target, predicts):
    return np.sqrt(mean_squared_error(target, predicts))

In [56]:
data = pd.read_csv('data/life-expectancy-data.csv')
ord_data = pd.read_csv("data/curated_data.csv")

In [57]:
no_nulls = data.dropna()
print(f'dataset size: {len(no_nulls.index)}')

dataset size: 1649


In [58]:
# Extrating target values
target = no_nulls['Life expectancy ']
y = np.array(target)

# Removing non-feature columns
df = no_nulls.drop(
    columns=['Country', 'Status', 'Life expectancy '])

# Converting to array matrix
X = np.array(df[df.columns])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [59]:
# run experiments
num_features = len(df.columns)

results = []

for window_size in range(num_features):

    model = RegressioModel()
    model.fit(
        X_train[:, :window_size],
        y_train
    )

    predicts = model.predict(X_test[:, :window_size])
    error = rsme(y_test, predicts)
    results.append((window_size, error))

In [73]:
# results = experiment(no_nulls)

fig = go.Figure()

x_axis = [result[0] for result in results]
y_axis = [result[1] for result in results]

fig.add_trace(
    go.Scatter(
        x=x_axis,
        y=y_axis,
        mode='lines'))

fig.update_layout(
    title="Original Dataset",
    xaxis_title="Number of Features",
    yaxis_title="RSME",
)

fig.show()

In [49]:
results

[(0, 70.28203301795998),
 (1, 9.049465551715814),
 (2, 6.235221577942496),
 (3, 6.087043549270151),
 (4, 5.550582323522999),
 (5, 5.422912363483934),
 (6, 5.356609424481372),
 (7, 5.362762991361265),
 (8, 4.986320909351455),
 (9, 4.760374045856606),
 (10, 4.7341094940897825),
 (11, 4.734513638933789),
 (12, 4.725566710454999),
 (13, 4.3016001126955),
 (14, 4.264528415501108),
 (15, 4.290849511671106),
 (16, 4.279378953617626),
 (17, 4.279380166341592),
 (18, 3.7319963850836366)]

## Using the dataset with features sorted by Stoppiglia method

In [51]:
# Extrating target values
target = ord_data['Life expectancy ']
y = np.array(target)

# Removing non-feature columns
new_df = ord_data.drop(columns=['Life expectancy '])

# Converting to array matrix
X = np.array(new_df[new_df.columns])

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [52]:
# run experiments
num_features = len(new_df.columns)

ord_results = []

for window_size in range(num_features):

    model = RegressioModel()
    model.fit(
        X_train[:, :window_size],
        y_train
    )

    predicts = model.predict(X_test[:, :window_size])
    error = rsme(y_test, predicts)
    ord_results.append((window_size, error))

In [74]:
# ord_results = experiment(ord_data)

fig = go.Figure()

x_axis = [result[0] for result in ord_results]
y_axis = [result[1] for result in ord_results]

fig.add_trace(
    go.Scatter(
        x=x_axis,
        y=y_axis,
        mode='lines'))

fig.update_layout(
    title="Stoppiglia Dataset",
    xaxis_title="Number of Features",
    yaxis_title="RSME",
)

fig.show()

In [54]:
ord_results

[(0, 70.28203301795998),
 (1, 9.049465551715814),
 (2, 8.942617962818504),
 (3, 8.938524745540311),
 (4, 6.255464201876502),
 (5, 5.71575847737982),
 (6, 5.667415278950387),
 (7, 5.671039640444944),
 (8, 5.674228076197476),
 (9, 5.6356479737132394),
 (10, 4.369092522670686),
 (11, 4.336565671481949),
 (12, 4.328949509867081),
 (13, 4.320312425597092),
 (14, 4.171406082625705),
 (15, 4.0854293288865735),
 (16, 4.081772488199299),
 (17, 4.087142432391972),
 (18, 4.095134889351552)]