In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import combinations
from sklearn.preprocessing import MinMaxScaler
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import importlib.util

In [None]:
#print(sys.path)

In [3]:
# Make the locally built gmdhpy module importable (presumably one Windows and one
# Linux build directory). Every entry is inserted at position 1, so the path
# listed last ends up ahead of the one listed first.
for build_dir in (
    "C:/Users/Mi/Documents/Diploma/GMDH/build/Release",
    "/home/mikhail-xnor/Projects/GMDH/build",
):
    sys.path.insert(1, build_dir)

In [4]:
import gmdhpy as gm

In [5]:
# List every name exposed by the imported gmdhpy module (models, criteria, helpers).
dir(gm)

['Combi',
 'Criterion',
 'CriterionType',
 'GmdhModel',
 'Mia',
 'Multi',
 'ParallelCriterion',
 'PolynomialType',
 'Ria',
 'SequentialCriterion',
 'Solver',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'split_data',
 'splitted_data',
 'time_series_transformation']

In [5]:
def data_preparation(x, lags, validate_size, test_size=0):
    """Build lagged windows from ``x`` and split them without shuffling.

    Each run of ``lags`` consecutive entries of ``x.values`` becomes one
    feature row; the entry that follows the run is its target.

    Parameters
    ----------
    x : object supporting ``len()`` and exposing ``.values``
        The source sequence (e.g. a pandas Series).
    lags : int
        Window length, i.e. number of features per row.
    validate_size :
        Forwarded to ``train_test_split`` as ``test_size`` to cut the
        validation part off the end of the data.
    test_size : optional
        When greater than 0, the remaining training part is split once more
        (again unshuffled) into train and test.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val)`` when ``test_size`` is not
        greater than 0, otherwise
        ``(x_train, y_train, x_test, y_test, x_val, y_val)``.
        The ``x_*`` parts are numpy arrays; the ``y_*`` parts are returned
        exactly as ``train_test_split`` produced them.
    """
    values = x.values
    n_windows = len(x) - lags
    windows = [list(values[start:start + lags]) for start in range(n_windows)]
    targets = [values[start + lags] for start in range(n_windows)]

    # First cut: the tail of the data becomes the validation part.
    first_split = train_test_split(windows, targets, test_size=validate_size, shuffle=False)
    x_train = np.array(first_split[0])
    x_val = np.array(first_split[1])
    y_train, y_val = first_split[2], first_split[3]

    if not test_size > 0:
        return x_train, y_train, x_val, y_val

    # Second cut: carve a test part off the end of what is left for training.
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=test_size, shuffle=False
    )
    return np.array(x_train), y_train, np.array(x_test), y_test, x_val, y_val

def print_polynom(polynom, coeffs):
    """Print a linear model in the form ``y = c1*xA - c2*xB + ... + c0``.

    Parameters
    ----------
    polynom : sequence of int
        Zero-based feature indices, one entry per coefficient. Indices are
        printed one-based. The entry paired with the last coefficient is
        never printed.
    coeffs : sequence of numbers
        Coefficients aligned with ``polynom``; the last one is the free term
        and is printed without a variable.

    Notes
    -----
    Zero coefficients are printed with a plus sign (``+ 0.0``); previously
    they fell into the negative branch and were shown as ``- 0.0``.
    """
    text = "y ="
    free_term_pos = len(coeffs) - 1
    for pos, feature in enumerate(polynom):
        coeff = coeffs[pos]
        if coeff >= 0:
            # The leading term carries no explicit plus sign.
            text += " + " if pos > 0 else " "
        else:
            text += " - "
        text += str(abs(coeff))
        if pos != free_term_pos:
            text += "*x" + str(feature + 1)
    print(text)

In [6]:
def regression(x, lags, validate_size):
    """Fit a linear regression on all ``lags`` lagged values and predict the validation part.

    The data is windowed and split by ``data_preparation`` (no separate test
    part). The fitted coefficients, rounded to 6 decimals and followed by the
    intercept, are printed via ``print_polynom``.

    Parameters
    ----------
    x : object accepted by ``data_preparation``
    lags : int
        Number of lagged values used as features.
    validate_size :
        Size of the validation part, forwarded to ``data_preparation``.

    Returns
    -------
    Predictions of the fitted model for the validation features.
    """
    x_train, y_train, x_val, _ = data_preparation(x, lags, validate_size)
    model = LinearRegression(n_jobs=-1)
    model.fit(x_train, y_train)
    # One coefficient per lag, then the intercept as the trailing free term.
    coeffs = [round(coeff, 6) for coeff in (*model.coef_, model.intercept_)]
    y_pred = model.predict(x_val)
    print_polynom(range(lags + 1), coeffs)
    return y_pred

In [7]:
def regression_combi(x, lags, validate_size, test_size):
    """Select a feature subset by level-wise exhaustive search and predict the validation part.

    For level 1, 2, ... every combination of that many lag columns is fitted
    on the training part and scored by MSE (rounded to 6 decimals) on the test
    part. The search stops at the first level whose best score does not
    improve on the best score so far. The winning model is printed with
    ``print_polynom``, refitted on the training part, and used to predict the
    validation part.

    Parameters
    ----------
    x : object accepted by ``data_preparation``
    lags : int
        Number of lagged values available as features.
    validate_size, test_size :
        Split sizes forwarded to ``data_preparation``.

    Returns
    -------
    Predictions of the selected model for the validation features.
    """
    x_train, y_train, x_test, y_test, x_val, _ = data_preparation(x, lags, validate_size, test_size)
    lr = LinearRegression(n_jobs=-1)
    n_features = x_train.shape[1]
    # Infinity instead of a large magic number: any finite error is accepted
    # at the first level, so a best model is always selected.
    best_error = float("inf")
    best_coeffs = []
    best_polynom = []
    for level in range(1, n_features + 1):
        candidates = []
        for comb in combinations(range(n_features), level):
            columns = list(comb)
            lr.fit(x_train[:, columns], y_train)
            coeffs = [round(coeff, 6) for coeff in (*lr.coef_, lr.intercept_)]
            y_pred = lr.predict(x_test[:, columns])
            error = round(mean_squared_error(y_test, y_pred), 6)
            # The trailing `lags` entry is a placeholder paired with the
            # intercept; print_polynom never prints it.
            candidates.append((error, coeffs, columns + [lags]))
        # Smallest tuple == first element of the sorted list, without sorting.
        level_error, level_coeffs, level_polynom = min(candidates)
        if level_error < best_error:
            best_error = level_error
            best_coeffs = level_coeffs
            best_polynom = level_polynom
        else:
            break
    print_polynom(best_polynom, best_coeffs)
    selected = best_polynom[:-1]
    return lr.fit(x_train[:, selected], y_train).predict(x_val[:, selected])

In [None]:
"""%%time
data = pd.read_csv("Sberbank.csv")['close'][:-50000]
lags = 10
test_size = 0.33
validate_size = 0.2

mat, vec = gm.time_series_transformation(data, lags)
splited_data = gm.split_data(mat, vec, validate_size, False, 0)"""

In [6]:
def print_metrics(y_test, y_pred):
    """Print R^2, MSE and MAE of ``y_pred`` measured against ``y_test``.

    Each score is computed and printed on its own line; nothing is returned.
    """
    r2 = r2_score(y_test, y_pred)
    print(f"R^2: {r2}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"MSE: {mse}")
    mae = mean_absolute_error(y_test, y_pred)
    print(f"MAE: {mae}")

In [7]:
# Load the laptop data set, discard the ID and Memory2_GB columns and keep
# only the rows whose Ram_GB is not 64.
data = (
    pd.read_csv("laptop_price.csv")
    .drop(columns=['laptop_ID', 'Memory2_GB'])
    .loc[lambda frame: frame['Ram_GB'] != 64]
)
# Split fractions used by the modelling cells.
test_size = validate_size = 0.2
data.head()

Unnamed: 0,Company,Product,TypeName,Inches,Ram_GB,OpSys,Weight_kg,Price_euros,Touchscreen,ScreenWidth,...,Cpu_producer,Cpu_series,Cpu_model,Cpu_GHz,Gpu_producer,Gpu_model,Gpu_series,Memory1_GB,Memory1_type,Memory2_type
0,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1339.69,0,2560,...,Intel,-,Core i5,2.3,Intel,Iris Plus Graphics,640,128,SSD,-
1,Apple,Macbook Air,Ultrabook,13.3,8,macOS,1.34,898.94,0,1440,...,Intel,-,Core i5,1.8,Intel,HD Graphics,6000,128,Flash Storage,-
2,HP,250 G6,Notebook,15.6,8,No OS,1.86,575.0,0,1920,...,Intel,7200U,Core i5,2.5,Intel,HD Graphics,620,256,SSD,-
3,Apple,MacBook Pro,Ultrabook,15.4,16,macOS,1.83,2537.45,0,2880,...,Intel,-,Core i7,2.7,AMD,Radeon Pro,455,512,SSD,-
4,Apple,MacBook Pro,Ultrabook,13.3,8,macOS,1.37,1803.6,0,2560,...,Intel,-,Core i5,3.1,Intel,Iris Plus Graphics,650,256,SSD,-


In [8]:
# Object-typed columns are treated as categorical; report how many distinct
# values each of them holds.
is_object_col = data.dtypes == object
category_cols = data.columns[is_object_col]
for col in category_cols:
    n_unique = data[col].unique().size
    print(f'{col}: {n_unique}')

Company: 19
Product: 617
TypeName: 6
OpSys: 9
Cpu_producer: 3
Cpu_series: 88
Cpu_model: 20
Gpu_producer: 4
Gpu_model: 26
Gpu_series: 75
Memory1_type: 4
Memory2_type: 4


In [10]:
# One-hot encode every categorical column except Product, which is dropped.
# NOTE: `data` is reassigned here, so this cell cannot be re-run on its own.
data_without_product = data.drop(columns=['Product'])
dummy_cols = category_cols.drop(['Product'])
data = pd.get_dummies(data=data_without_product, columns=dummy_cols)

# Split into features / Price_euros target using gmdhpy's default split settings.
features = data.drop(columns=['Price_euros'])
target = data['Price_euros']
splited_data = gm.split_data(features, target)

# Fit the scaler on the training features only, then apply it to both parts.
scaler = MinMaxScaler().fit(splited_data.x_train)
splited_data.x_train = scaler.transform(splited_data.x_train)
splited_data.x_test = scaler.transform(splited_data.x_test)

In [None]:
%%time
# Fit the COMBI model on the scaled training part with the regularity criterion,
# print the selected polynomial and predict the held-out test part.
combi = gm.Combi()
# NOTE(review): everything after the criterion is passed positionally; presumably
# test_size, shuffle, random_state, p_average, n_jobs, verbose, limit — confirm
# against the gmdhpy signature. The saved output warns that the default
# pAverage = 1 is used, which suggests the 0 passed here is rejected.
combi.fit(splited_data.x_train, splited_data.y_train, gm.Criterion(gm.CriterionType.regularity, gm.Solver.accurate), test_size, True, 17, 0, -1, 1, 0)
print(combi.get_best_polynomial())
y_pred_combi = combi.predict(splited_data.x_test)



The default value is used (pAverage = 1)!



LEVEL 4  [>                             ] 2% [00m:04s] (4421275 combinations)                                           

In [204]:
# Test-part metrics for the COMBI predictions.
print_metrics(splited_data.y_test, y_pred_combi)

R^2: 0.5656537104846866
MSE: 194191.66116391218
MAE: 306.37671592064873


In [11]:
%%time
# Fit the MULTI model on the scaled training part with the symmetric regularity
# criterion, print the selected polynomial and predict the held-out test part.
multi = gm.Multi()
# NOTE(review): arguments after the criterion are positional; presumably
# k_best=3, then test_size, shuffle, random_state, p_average, n_jobs, verbose,
# limit — confirm against the gmdhpy signature. The saved output warns that the
# default pAverage = 1 is used.
multi.fit(splited_data.x_train, splited_data.y_train, gm.Criterion(gm.CriterionType.sym_regularity, gm.Solver.accurate), 3, test_size, True, 17, 0, -1, 1, 0);
print(multi.get_best_polynomial())
y_pred_multi = multi.predict(splited_data.x_test)

LEVEL 2  [>                        ] 0% [00m:00s] (792 combinations)                                                    

The default value is used (pAverage = 1)!





y = 68.1576*x1 + 1264.0104*x2 + 583.3418*x6 + 876.9094*x7 - 133.7545*x9 - 90.0244*x11 - 281.2344*x12 - 147.4882*x14 + 512.5186*x18 - 249.8333*x21 + 698.1163*x23 + 114.7603*x25 + 52.8582*x27 - 217.5045*x29 - 208.7145*x31 + 73.9833*x32 - 118.8062*x36 + 70.2661*x37 - 190.2924*x38 + 127.4333*x44 + 0*x49 + 145.0611*x53 - 37.4655*x54 - 77.5572*x55 - 342.3904*x57 - 144.2959*x61 + 696.8797*x70 - 252.6105*x74 - 245.1183*x75 + 179.2022*x78 - 382.5866*x79 - 194.4915*x81 - 481.9592*x82 + 647.8508*x86 + 79.5926*x87 - 181.0245*x101 - 178.7533*x102 + 9.4712*x110 - 337.0573*x111 + 1778.3219*x114 + 1319.3085*x115 + 0*x117 + 4.8364*x120 - 480.8776*x135 - 263.5042*x137 - 472.8783*x139 - 87.4755*x140 - 423.0498*x144 - 50.7167*x145 - 433.8248*x149 - 211.5708*x158 - 44.4892*x160 + 140.3568*x162 - 271.4232*x164 - 122.5084*x166 + 593.1237*x170 - 145.6158*x171 - 107.5831*x176 - 166.312*x177 - 216.4387*x179 - 117.6892*x184 - 351.7283*x185 - 383.0337*x187 + 462.3886*x189 + 841.1845*x191 - 321.8835*x194 - 296.297

In [19]:
# Test-part metrics for the MULTI predictions.
print_metrics(splited_data.y_test, y_pred_multi)

R^2: 0.6868207650056062
MSE: 140019.14452514367
MAE: 259.38463752355136


In [11]:
# Demonstration: predicting with the last feature column removed is expected to
# be rejected by the fitted model, and the error message is displayed.
# The result is deliberately not assigned, so y_pred_multi cannot be overwritten
# with bogus predictions should the call ever succeed.
try:
    multi.predict(np.delete(splited_data.x_test, -1, axis=1))
except Exception as err:
    print(err)

Input data number of cols is not match number of cols of fitted data!


In [29]:
%%time
# Fit the MIA model on the scaled training part with the symmetric regularity
# criterion and linear_cov polynomials, then predict the held-out test part.
mia = gm.Mia()
# NOTE(review): arguments after the criterion are positional; presumably
# k_best=20, polynomial type, test_size, shuffle, random_state, p_average,
# n_jobs, verbose — confirm against the gmdhpy signature. The saved output warns
# that the default threads = 1 is used, which suggests the -10 is rejected.
mia.fit(splited_data.x_train, splited_data.y_train, gm.Criterion(gm.CriterionType.sym_regularity, gm.Solver.accurate), 20, gm.PolynomialType.linear_cov, test_size, 1, 0, 3, -10, 1);
#print(mia.get_best_polynomial())
y_pred_mia = mia.predict(splited_data.x_test)

LEVEL 1  [>                        ] 1% [00m:00s] (35245 combinations)                                                  

The default value is used (threads = 1)!



CPU times: total: 8.69 s
Wall time: 7.04 s


In [40]:
# Test-part metrics for the MIA predictions.
print_metrics(splited_data.y_test, y_pred_mia)

R^2: 0.5708349269634132
MSE: 191875.19372965256
MAE: 309.870941863932


In [80]:
# Summary statistics of `data` after one-hot encoding (dummy columns included).
data.describe()

Unnamed: 0,Inches,Ram_GB,Weight_kg,Price_euros,Touchscreen,ScreenWidth,ScreenHeight,Cpu_GHz,Memory1_GB,Company_Acer,...,Gpu_series_W5130M,Gpu_series_W6150M,Memory1_type_Flash Storage,Memory1_type_HDD,Memory1_type_Hybrid,Memory1_type_SSD,Memory2_type_-,Memory2_type_HDD,Memory2_type_Hybrid,Memory2_type_SSD
count,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,...,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0,1244.0
mean,15.029904,8.403537,2.044035,1130.563055,0.144695,1897.607717,1072.469453,2.304196,446.77492,0.080386,...,0.000804,0.000804,0.055466,0.284566,0.006431,0.653537,0.836817,0.15836,0.001608,0.003215
std,1.414841,4.884624,0.66877,700.047321,0.351934,492.807819,283.722976,0.502959,367.634894,0.271999,...,0.028352,0.028352,0.22898,0.451389,0.079967,0.476034,0.369681,0.365225,0.04008,0.056636
min,10.1,2.0,0.69,174.0,0.0,1366.0,768.0,0.9,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,4.0,1.5,599.0,0.0,1600.0,900.0,2.0,256.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,15.6,8.0,2.04,986.5,0.0,1920.0,1080.0,2.5,256.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,15.6,8.0,2.31,1491.45,0.0,1920.0,1080.0,2.7,512.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
max,18.4,32.0,4.7,6099.0,1.0,3840.0,2160.0,3.6,2048.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1]:
%%time
# Fit the RIA model on the scaled training part with the regularity criterion
# and linear polynomials, then predict the held-out test part.
# NOTE(review): the saved output of this cell is a NameError (`gm` not defined),
# i.e. it was last run before the gmdhpy import cell — re-run in order.
ria = gm.Ria()
# NOTE(review): arguments after the criterion are positional; presumably
# k_best=5, polynomial type, test_size, shuffle, random_state, p_average,
# n_jobs, verbose, limit — confirm against the gmdhpy signature.
ria.fit(splited_data.x_train, splited_data.y_train, gm.Criterion(gm.CriterionType.regularity, gm.Solver.accurate), 5, gm.PolynomialType.linear, test_size, 0, 0, 1, -2, 1, 0);
#print(ria.get_best_polynomial())
y_pred_ria = ria.predict(splited_data.x_test)

NameError: name 'gm' is not defined

In [51]:
# Test-part metrics for the RIA predictions.
# NOTE(review): the RIA fitting cell's saved output is a NameError, so the
# numbers shown for this cell presumably come from a different kernel run.
print_metrics(splited_data.y_test, y_pred_ria)

R^2: 0.6126159034652368
MSE: 173195.3582440234
MAE: 284.66992742385145


In [None]:
%%time
# NOTE(review): `lags` is assigned only inside the disabled (string-literal)
# time-series cell, and `data` now holds the one-hot encoded laptop frame rather
# than a single series — this call presumably fails as written; confirm before
# running.
y_pred_lr_combi = regression_combi(data, lags, validate_size, test_size)

In [None]:
%time
y_pred_lr = regression(data, lags, validate_size)

In [None]:
#np.savetxt("Sber.csv", np.asarray(data.values), delimiter=",")

In [None]:
# Overlay the first n test targets and each model's predictions on one figure.
n = 20
plt.figure(figsize=(17, 6))
sample_positions = np.arange(n)
curves = {
    'original': splited_data.y_test,
    'combi': y_pred_combi,
    'multi': y_pred_multi,
    'mia': y_pred_mia,
    'ria': y_pred_ria,
}
# The plain-regression baselines (y_pred_lr_combi, y_pred_lr) are not plotted.
for curve_label, curve_values in curves.items():
    sns.lineplot(x=sample_positions, y=curve_values[:n], label=curve_label)

In [None]:
def stat(y_real, y_pred):
    """Print how often consecutive changes in ``y_pred`` agree in sign with ``y_real``.

    The first differences of both sequences are multiplied element-wise;
    positive products are counted as "right", negative ones as "wrong" and
    exact zeros as "zero". The three counts are printed; nothing is returned.
    """
    step_product = np.diff(y_real) * np.diff(y_pred)
    same_direction = (step_product > 0).sum()
    opposite_direction = (step_product < 0).sum()
    no_direction = (step_product == 0).sum()
    print("right:", same_direction)
    print("wrong:", opposite_direction)
    print("zero:", no_direction)

In [None]:
# Direction-agreement counts of every model against the test targets.
for header, model_pred in (
    ("COMBI", y_pred_combi),
    ("\nMULTI", y_pred_multi),
    ("\nMIA", y_pred_mia),
    ("\nRIA", y_pred_ria),
):
    print(header)
    stat(splited_data.y_test, model_pred)