In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pykalman import KalmanFilter
from sklearn.linear_model import LinearRegression

In [13]:
# Feature columns: used as the regression inputs and as the Kalman filter state in smooth_test.
# 'temperature' must stay first: smooth_test overwrites row 0 of the transition matrix with the
# regression coefficients and plots column 0 of the smoothed state as the temperature.
X_columns = ['temperature', 'cpu_percent', 'fan_rpm', 'sys_load_1', 'cpu_freq']
# Name of the target column that get_data creates (the temperature of the following row).
y_column = 'next_temp'


'next_temp'

In [14]:
def get_data(filename):
    """
    Load a sysinfo CSV and attach the prediction target.

    The 'timestamp' column is parsed as dates. A new column named by y_column is added,
    holding the 'temperature' value of the following row. Rows whose target is null
    (the last row has no following reading) are dropped before the frame is returned.
    """
    raw = pd.read_csv(filename, parse_dates=['timestamp'])

    # shift(-1) moves every temperature up one row, so each row sees the next reading.
    next_temperature = raw['temperature'].shift(-1)
    with_target = raw.assign(**{y_column: next_temperature})

    # Keep only the rows where a next temperature is actually known.
    has_target = with_target[y_column].notna()
    return with_target[has_target]

In [23]:
def get_trained_coefficients(X_train, y_train):
    """
    Fit a linear regression of y_train on X_train, with no intercept term.

    Returns a (model, coefficients) pair: the fitted LinearRegression and its coef_
    array, one coefficient per column of X_train in column order.
    """
    # fit_intercept=False keeps the prediction a pure weighted sum of the inputs;
    # fit() returns the estimator itself, so construction and training can be chained.
    regressor = LinearRegression(fit_intercept=False).fit(X_train, y_train)
    return regressor, regressor.coef_

In [6]:
def output_regression(coefficients):
    """
    Print the regression as a one-line equation.

    Each coefficient is paired positionally with the matching X_columns name and
    formatted to three significant digits.
    """
    terms = []
    for col, coef in zip(X_columns, coefficients):
        terms.append(f'{coef:.3}*{col}')
    regress = ' + '.join(terms)
    print(f'next_temp = {regress}')

In [7]:
def plot_errors(model, X_valid, y_valid):
    """
    Save a histogram of the model's prediction errors to 'test_errors.png'.

    The error for each row is the true value in y_valid minus model.predict(X_valid);
    the histogram uses 100 bins and the figure is closed after saving.
    """
    predicted = model.predict(X_valid)
    residuals = y_valid - predicted

    plt.hist(residuals, bins=100)
    plt.savefig('test_errors.png')
    plt.close()

In [34]:
def smooth_test(coef, sysinfo, outfile):
    """
    Kalman-smooth the sensor readings in `sysinfo` and save a comparison plot.

    The filter state is the vector of X_columns values. The transition matrix is the
    identity, except that its first row is replaced by `coef`, so the predicted next
    value of the first state variable (temperature) is the regression's prediction.

    Parameters:
        coef: array-like with one regression coefficient per X_columns entry, in
            X_columns order.
        sysinfo: DataFrame containing 'timestamp', 'temperature' and every X_columns column.
        outfile: path the figure is written to with plt.savefig.

    Raises:
        ValueError: if `coef` does not hold exactly one value per state variable.
    """
    X_valid = sysinfo[X_columns]

    # feel free to tweak these if you think it helps.
    transition_stddev = 0.4
    observation_stddev = 1.1

    dims = X_valid.shape[-1]
    initial = X_valid.iloc[0]
    # Per-variable standard deviations, in X_columns order, squared to give variances.
    observation_covariance = np.diag([observation_stddev, 2, 2, 1, 10]) ** 2
    transition_covariance = np.diag([transition_stddev, 80, 100, 10, 100]) ** 2

    # Transition = identity for all variables, except we'll replace the top row
    # to make a better prediction, which was the point of all this.
    transition = np.identity(dims)

    # Use the coefficients passed in as `coef`, not a notebook-level global, so the
    # function works on a fresh kernel and honours whatever the caller supplies.
    coef_row = np.asarray(coef, dtype=float).reshape(-1)
    if coef_row.shape[0] != dims:
        raise ValueError(
            f'expected {dims} coefficients (one per state variable), got {coef_row.shape[0]}'
        )
    transition[0, :] = coef_row

    kf = KalmanFilter(
        initial_state_mean=initial,
        initial_state_covariance=observation_covariance,
        observation_covariance=observation_covariance,
        transition_covariance=transition_covariance,
        transition_matrices=transition,
    )

    kalman_smoothed, _ = kf.smooth(X_valid)

    # Raw temperature readings as blue dots, smoothed temperature (state 0) as a green line.
    plt.figure(figsize=(15, 6))
    plt.plot(sysinfo['timestamp'], sysinfo['temperature'], 'b.', alpha=0.5)
    plt.plot(sysinfo['timestamp'], kalman_smoothed[:, 0], 'g-')
    plt.savefig(outfile)
    plt.close()


In [15]:
# Load both data sets. get_data adds the next-temperature target column and drops
# each file's final row, which has no following reading.
train = get_data('sysinfo-train.csv')
valid = get_data('sysinfo-valid.csv')

In [16]:
# Show the training frame, including the added next_temp column.
train

Unnamed: 0,timestamp,temperature,sys_load_1,cpu_percent,cpu_freq,fan_rpm,next_temp
0,2020-05-20 12:02:49.850707,32.000000,0.58,2.86,1474.153167,796,31.750000
1,2020-05-20 12:02:59.855392,31.750000,0.57,2.68,1579.055250,805,32.166667
2,2020-05-20 12:03:09.859261,32.166667,0.48,3.24,1500.348583,810,31.833333
3,2020-05-20 12:03:19.863313,31.833333,0.49,3.52,1230.249750,788,32.000000
4,2020-05-20 12:03:29.868141,32.000000,0.64,2.43,1766.959333,805,31.833333
...,...,...,...,...,...,...,...
21691,2020-05-23 00:19:44.901521,30.333333,1.11,4.73,800.048500,740,30.500000
21692,2020-05-23 00:19:54.906179,30.500000,1.02,3.43,1345.581333,745,30.333333
21693,2020-05-23 00:20:04.910416,30.333333,0.86,5.48,1108.595417,846,30.166667
21694,2020-05-23 00:20:14.914973,30.166667,0.88,3.68,1182.734083,745,30.500000


In [17]:
# Show the validation frame, including the added next_temp column.
valid

Unnamed: 0,timestamp,temperature,sys_load_1,cpu_percent,cpu_freq,fan_rpm,next_temp
0,2020-05-18 07:17:25.880953,32.666667,0.72,4.00,2159.853250,797,32.666667
1,2020-05-18 07:17:35.885801,32.666667,0.61,2.66,1923.107000,790,32.666667
2,2020-05-18 07:17:45.889712,32.666667,0.60,5.23,1995.411917,761,32.750000
3,2020-05-18 07:17:55.893324,32.750000,0.59,15.27,3064.213083,836,35.000000
4,2020-05-18 07:18:05.895970,35.000000,0.65,5.46,2308.287833,954,32.666667
...,...,...,...,...,...,...,...
18978,2020-05-20 12:01:49.824712,32.166667,0.57,2.33,1681.276000,800,32.000000
18979,2020-05-20 12:01:59.829444,32.000000,0.56,3.11,1554.748083,774,31.833333
18980,2020-05-20 12:02:09.834053,31.833333,0.47,3.32,895.129750,803,32.333333
18981,2020-05-20 12:02:19.837627,32.333333,0.56,3.55,1390.888583,792,32.000000


In [18]:
# Split each frame into the feature matrix (X_columns) and the target series (y_column).
X_train, y_train = train[X_columns], train[y_column]
X_valid, y_valid = valid[X_columns], valid[y_column]

In [19]:
# Show the training features and targets side by side (displayed as a tuple).
X_train, y_train

(       temperature  cpu_percent  fan_rpm  sys_load_1     cpu_freq
 0        32.000000         2.86      796        0.58  1474.153167
 1        31.750000         2.68      805        0.57  1579.055250
 2        32.166667         3.24      810        0.48  1500.348583
 3        31.833333         3.52      788        0.49  1230.249750
 4        32.000000         2.43      805        0.64  1766.959333
 ...            ...          ...      ...         ...          ...
 21691    30.333333         4.73      740        1.11   800.048500
 21692    30.500000         3.43      745        1.02  1345.581333
 21693    30.333333         5.48      846        0.86  1108.595417
 21694    30.166667         3.68      745        0.88  1182.734083
 21695    30.500000         3.96      770        1.06  1386.947333
 
 [21696 rows x 5 columns],
 0        31.750000
 1        32.166667
 2        31.833333
 3        32.000000
 4        31.833333
            ...    
 21691    30.500000
 21692    30.333333
 21693 

In [20]:
# Show the validation features and targets side by side (displayed as a tuple).
X_valid, y_valid 

(       temperature  cpu_percent  fan_rpm  sys_load_1     cpu_freq
 0        32.666667         4.00      797        0.72  2159.853250
 1        32.666667         2.66      790        0.61  1923.107000
 2        32.666667         5.23      761        0.60  1995.411917
 3        32.750000        15.27      836        0.59  3064.213083
 4        35.000000         5.46      954        0.65  2308.287833
 ...            ...          ...      ...         ...          ...
 18978    32.166667         2.33      800        0.57  1681.276000
 18979    32.000000         3.11      774        0.56  1554.748083
 18980    31.833333         3.32      803        0.47   895.129750
 18981    32.333333         3.55      792        0.56  1390.888583
 18982    32.000000         2.64      789        0.63  1862.503417
 
 [18983 rows x 5 columns],
 0        32.666667
 1        32.666667
 2        32.750000
 3        35.000000
 4        32.666667
            ...    
 18978    32.000000
 18979    31.833333
 18980 

In [24]:
# Fit the regression on the training split, then print the fitted equation.
model, coefficients = get_trained_coefficients(X_train, y_train)
output_regression(coefficients)

next_temp = 0.596*temperature + -0.127*cpu_percent + 0.0163*fan_rpm + 0.727*sys_load_1 + 0.000274*cpu_freq


In [35]:
# Save the Kalman-smoothed plot for the training data, then report model.score()
# (R^2 for a scikit-learn regressor) on the training and validation splits.
smooth_test(coefficients, train, 'train.png')
print(f"Training score: {model.score(X_train, y_train)}\nValidation score: {model.score(X_valid, y_valid)}")

Training score: 0.552037743885651
Validation score: 0.4787840508463901


In [36]:
# Write the validation residual histogram (test_errors.png), then the
# Kalman-smoothed plot for the validation data (valid.png).
plot_errors(model, X_valid, y_valid)
smooth_test(coefficients, valid, 'valid.png')