In [14]:
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import sktime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

from tsai.basics import timer
# from tsai.data.core import TSDatasets
# from fastai.data.transforms import Categorize
# from tsai.data.validation import combine_split_data
from tsai.data.external import get_UCR_data, get_Monash_regression_data, check_data
from tsai.models.MINIROCKET import *

In [2]:
warnings.filterwarnings(action='ignore', category=UserWarning)

df = pd.read_excel('./DATASET.xlsx')
df.rename(columns={'Unnamed: 0':'Section'}, inplace=True)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1009 entries, 0 to 1008
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Section  101 non-null    object 
 1   year     1009 non-null   int64  
 2   Para-1   1009 non-null   float64
 3   Para-2   1009 non-null   float64
 4   Para-3   1009 non-null   float64
 5   Para-4   1009 non-null   float64
 6   Para-5   1009 non-null   int64  
 7   Para-6   1009 non-null   int64  
 8   Para-7   1009 non-null   int64  
 9   Para-8   1009 non-null   int64  
 10  Para-9   1009 non-null   float64
 11  Para-10  1009 non-null   float64
 12  Para-11  1009 non-null   float64
 13  Para-12  1009 non-null   int64  
 14  Para-13  1007 non-null   float64
dtypes: float64(8), int64(6), object(1)
memory usage: 118.4+ KB
None


In [3]:
def check_null(df):
    null_entries = df.loc[:,'year':].isnull().any()
    return df.loc[:,'year':].columns[null_entries].tolist()

In [4]:
def get_selected_parameters(x, y, parameters=[8, 9, 10, 11, 12]):
    new_x, new_y = [], []
    for x_vals in x:
        new_x.append(x_vals[parameters])
    for y_vals in y:
        new_y.append(y_vals[parameters])
    new_x = np.array(new_x)
    new_y = np.array(new_y)
    if new_y.dtype == 'object':
        new_y = new_y.astype('float64')
    return new_x, new_y 
    

In [5]:
null_colmns = check_null(df)
print(f'Null Columns: {null_colmns}')
if len(null_colmns)>0:
    print('Filling Null Values')
    [df[col].fillna(method='bfill',inplace=True) for col in null_colmns]

Null Columns: ['Para-13']
Filling Null Values


In [5]:
# C_mat = df.loc[:,'Para-1':].corr()
# fig = plt.figure(figsize = (15,15))
# plt.title('Corelation Heatmap')
# sns.heatmap(C_mat, vmax = 0.8, vmin = -0.8,  square = True, annot=True)
# plt.show()

In [6]:
# fig, axs = plt.subplots(5, 3, figsize=(10,10))
# x, y = 0,-1
# for i in range(13):
#     x = i//3
#     y = i%3
#     col = df.columns[i+2]
#     axs[x,y].scatter(df[col].value_counts().index, df[col].value_counts())

In [6]:
df['Section'].ffill(inplace=True)
df_sections = df.groupby('Section', dropna=False)

print(f'Grouped dataframe by sections\nDataFrame Size: {len(df)}\nGroups: {len(df_sections)}')

Grouped dataframe by sections
DataFrame Size: 1009
Groups: 101


In [8]:
# dsid = 'LSST'
# X_train, y_train, X_valid, y_valid = get_UCR_data(dsid)
# print(type(X_train))
# print(y_train[0])
# print(X_valid.shape)

# dsid = 'NATOPS' 
# X, y, splits = get_UCR_data(dsid, return_split=False)
# tfms  = [None, [Categorize()]]
# dsets = TSDatasets(X, y, tfms=tfms, splits=splits, inplace=True)
# print(dsets)

In [7]:
print('Converting Dataframe to 3d structure:\n1. Samples\n2. Variables\n3. Length')
x_raw, y_raw = [], []
i=0
for section in list(df_sections):
    section[1].reset_index(inplace=True)
    # print(section[1].loc[:8,'Para-1':'Para-7'].T)
    if len(section[1]) == 10:
        x_raw.append(section[1].loc[:8,'Para-1':'Para-13'].T)
        y_raw.append(section[1].loc[9,'Para-1':'Para-13'].T)
x_raw = np.array(x_raw)
y_raw = np.array(y_raw)
if y_raw.dtype == 'object':
    y_raw = y_raw.astype('float64')
print(f'New Shape:\nX: {x_raw.shape}\nY: {y_raw.shape}')

Converting Dataframe to 3d structure:
1. Samples
2. Variables
3. Length
New Shape:
X: (100, 13, 9)
Y: (100, 13)


In [10]:
# X, y, splits = combine_split_data([x_raw], [y_raw])

# check_data(X, y, splits)
# tfms  = [None, [Categorize()]]
# dsets = TSDatasets(X, y, tfms=tfms, splits=splits, inplace=True)

In [8]:
x, y = get_selected_parameters(x_raw, y_raw)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
print('Train test split')

Train test split


In [9]:
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
model = MiniRocketRegressor(scoring=rmse_scorer)
timer.start(False)
model.fit(x_train, y_train)
t = timer.stop()
y_pred = model.predict(x_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'valid rmse        : {rmse:.5f} time: {t}')

valid rmse        : 34.15516 time: 0:00:00.104288


In [10]:
r2 = r2_score(y_test, y_pred)
print(r2)

0.3708040059744794


In [11]:
model.save(f'MRVRegressor_{r2}')

In [12]:
y_raw_pred = model.predict(x)
r2_total = r2_score(y, y_raw_pred)
print(r2_total)

0.9614462778601804


In [13]:
print(y_pred)

[[ 2.70209159e+02  1.48168041e+02  4.80012056e+01  3.80408356e+00
   1.77940507e+00]
 [ 3.08626759e+02  2.05199421e+02  1.10566645e+02  2.58185451e+00
   1.01203187e+00]
 [ 1.46248669e+02  8.43791895e+01  1.23279169e+02  5.22907968e+00
   1.54520018e+00]
 [ 8.22270015e+01  2.57076810e+02  5.91091560e+01 -4.93747039e+00
   2.77006699e-01]
 [ 8.16755664e+01  1.73369893e+02  3.15694615e+01  9.00058794e+00
   1.90231597e+00]
 [ 8.63275944e+00  6.02946494e+01  3.10294282e+01  9.28901204e+00
   1.22283672e+00]
 [ 1.18083106e+02  6.70732061e+01  4.35938911e+00  5.94681414e+00
   1.77176510e+00]
 [-1.94396220e+01  1.74956408e+01  1.11265293e+01  5.63273408e+00
   2.15912930e+00]
 [ 1.09952306e+02  1.79565032e+02  3.91560247e+01  1.01081067e+01
   6.29340120e-01]
 [ 1.95571526e+02  7.23721771e+01  2.07176219e+01  1.13480866e+01
   1.14352332e+00]]
