In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error



In [2]:
import os
os.chdir('downloads')

In [3]:
os.getcwd()

'C:\\Users\\petri\\downloads'

In [4]:
data = pd.read_csv('energydata_complete.csv')
data.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [5]:
data = data.drop(columns=['date', 'lights'], axis=1)

In [6]:
X = data.drop(columns=['Appliances'])
y = data['Appliances']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
scaler = MinMaxScaler()

# Fit and transform using the training data
scaler.fit_transform(X_train)

# Transform the test features
scaler.transform(X_test)

array([[0.43294615, 0.27554825, 0.12066877, ..., 0.3       , 0.51234794,
        0.51234794],
       [0.53854277, 0.85778509, 0.37727163, ..., 0.35454545, 0.46937859,
        0.46937859],
       [0.48574446, 0.27883772, 0.11146111, ..., 0.19090909, 0.00259746,
        0.00259746],
       ...,
       [0.43681802, 0.40712719, 0.18899927, ..., 0.51742424, 0.79266914,
        0.79266914],
       [0.55015839, 0.46732456, 0.33438333, ..., 0.52272727, 0.60824256,
        0.60824256],
       [0.43294615, 0.50219298, 0.21322995, ..., 0.74393939, 0.19877963,
        0.19877963]])

In [9]:
models = {'LinearRegression': LinearRegression()}

In [10]:
def train(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

In [11]:
def score(model, X_test, y_test):
    predictions = model.predict(X_test)
    
    r2 = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    rss = np.sum(np.square(y_test-predictions))
    
    print('RMSE:', rmse)
    print('R-Squared:', r2)
    print('MSE:', mse)
    print('RSS', rss)
    print('MAE:', mae)
    print()
    return [r2, mae, mse, rmse, rss]

In [12]:
scoring1 = pd.DataFrame()
scoring1['Metric'] = ['R2_score', 'MAE', 'MSE', 'RMSE', 'RSS']

In [13]:
for name, func in models.items():
    print(f'Performance of {name} on Test:')
    print('=='*24)
    print ('Test set:')
    print("**"*8)
    
    model = train(func, X_train, y_train)
    results = score(model, X_test, y_test)
    scoring1[name] = results

Performance of LinearRegression on Test:
Test set:
****************
RMSE: 93.6404609399803
R-Squared: 0.14890246319303524
MSE: 8768.535925051976
RSS 51918501.212232746
MAE: 53.642977655849585



In [14]:
scoring1

Unnamed: 0,Metric,LinearRegression
0,R2_score,0.1489025
1,MAE,53.64298
2,MSE,8768.536
3,RMSE,93.64046
4,RSS,51918500.0


In [15]:
coef = model.coef_

# print the feature coefficients
print("Feature coefficients:", coef)




Feature coefficients: [-3.70720673e-01  1.63001980e+01 -1.83700295e+01 -1.37407481e+01
  2.58367429e+01  4.80316182e+00  2.79366382e+00  1.20498426e+00
 -1.60084298e+00  2.57510931e-01  7.36354686e+00  4.11648713e-01
  1.04063061e+00 -1.69278695e+00  9.99097025e+00 -5.77886090e+00
 -2.11484524e+01 -1.76267959e+00 -1.10736285e+01  1.70187889e-01
 -1.09352104e+00  2.23042487e+00  2.02585717e-01  5.70139259e+00
  1.64829984e-02  1.64829984e-02]


In [16]:
coefficients = model.coef_

# znajdź najniższą i najwyższą wagę
min_coef = min(coefficients)
max_coef = max(coefficients, key=abs)

# wydrukuj wyniki
print("Najniższa waga:", min_coef)
print("Najwyższa waga:", max_coef)

Najniższa waga: -21.14845235368254
Najwyższa waga: 25.836742864890926


In [17]:
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(X_train, y_train)

In [18]:
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [19]:
def get_weights_data (model, feat, col_name):
#this function returns the weight of every feature
   weights=pd.Series(model.coef_, feat.columns).sort_values()
   weights_data = pd.DataFrame(weights).reset_index()
   weights_data.columns = ['Features', col_name]
   weights_data[col_name].round(3)
   return weights_data

In [20]:
linear_model_weights = get_weights_data(linear_model, X_train, 'Linear_Model_Weight')
ridge_weights_data = get_weights_data(ridge_reg, X_train, 'Ridge_Weight')
lasso_weights_data = get_weights_data(lasso_reg, X_train, 'Lasso_weight')

NameError: name 'linear_model' is not defined

In [None]:
final_weights = pd.merge(linear_model_weights, ridge_weights_data, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_data, on='Features')

In [None]:
final_weights

1. The percent of the total variation of the dependent variable Y explained by the set of independent variables X is measured by
Coefficient of Determination



2. How do you define a Residual?

Y −  Y^

3.The straight line graph of the equation Y = a + BX, the slope is horizontal if

b = 0

4. Which of the one is true about Heteroskedasticity?

Linear Regression with varying error terms


5. Generally, which of the following method(s) is used for predicting continuous dependent variables?

  1. Linear Regression

  2. Logistic Regression

 A only


6. From the following options below, which of these is/are true about “Ridge” or “Lasso” regression methods in case of feature selection?

Lasso regression uses subset selection of features


7. Which of the following sentences is/are true about outliers in Linear Regression:

Linear regression is sensitive to outliers

8. Which of the following metrics can be used for evaluating regression models?

  1. R Squared

  2. Adjusted R Squared

  3. F Statistics

  4. RMSE / MSE / MAE

a, b, c and d

9. A best fit line relating X and Y has a R-Squared value of 0.75. How do I interpret this information?

75% of the variance in Y is explained by X

10. Which of the following measures is optimal for comparing the goodness of the fit of competing regression models involving the same dependent variable?

R-square

11. The Lasso can be interpreted as least-squares linear regression where:

Weights are regularized with the L1 norm
