# Solar Dataset Analysis

In [1]:
import numpy as np
import ml_utilities as mlutils
from loaders import DataLoader
import itertools
import pickle

## 1. What is the maximum number of unique records that can be obtained with the non-categorical attributes?

In [2]:
# Enumerate every subset of the candidate feature columns that contains between 2 and 7 features
path = 'C:/Users/alexc/OneDrive/Documents/GitHub/Thesis/Data/raw/solar/flare.data.2.txt'
features = range(3,10)
feature_combinations = [mlutils.findsubsets(features,size) for size in range(2,len(features)+1)]

# Total number of candidate feature sets, summed over all subset sizes
num_combinations = sum(len(group) for group in feature_combinations)
print ("There are"+" "+str(num_combinations)+" "+"feature combinations")

# For every candidate feature set, load the data and count the distinct records it yields
# NOTE(review): this call passes `targets=` while the later cells pass `target_indices=` to
# DataLoader.load - confirm which keyword the loader actually expects
unique_records_comb = []
for combination in itertools.chain.from_iterable(feature_combinations):
    loader = DataLoader()
    loader.load(path,feature_indices=combination,targets=[10])
    record_count = mlutils.get_unique_records(loader.features,number=True)
    unique_records_comb.append((record_count,combination))

# Largest distinct-record count observed (stays 0 when nothing was evaluated)
max_unique_records = max([0]+[count for count,_ in unique_records_comb])

# Keep every (count, feature set) pair that attains the maximum and report them
# NOTE(review): the message says "distinct features" but the value is a count of distinct records
unique_records_comb_max = [pair for pair in unique_records_comb if pair[0] == max_unique_records]
print("Maximum number of distinct features and the corresponding feature set: ",unique_records_comb_max)

There are 120 feature combinations
Maximum number of distinct features and the corresponding feature set:  [(36, [3, 4, 5, 6, 7, 8]), (36, [3, 4, 5, 6, 7, 8, 9])]


In [5]:
# Get the data corresponding to the feature combination that maximises the number of unique
# records and, among those, has the minimum dimensionality.
# Each entry of unique_records_comb_max is a (record count, feature index list) pair.
if not unique_records_comb_max:
    # Fail loudly instead of silently loading with whatever `features` an earlier cell left behind
    raise ValueError("unique_records_comb_max is empty; run the feature combination search first")
# min() returns the first minimal entry, so ties go to the earliest entry in the list
features = min(unique_records_comb_max,key=lambda entry: len(entry[1]))[1]
loader = DataLoader()
loader.load(path,feature_indices=features,target_indices=[10],unique=True)
solar_data = loader.data

In [9]:
# Persist solar_data to disk as a pickle file
basepath = 'C:/Users/alexc/OneDrive/Documents/GitHub/Thesis/Data/processed/solar/'
pickle_path = basepath+"solar_data_p.pickle"
with open(pickle_path,"wb") as out_file:
    pickle.dump(solar_data,out_file)

In [2]:
# Restore solar_data from the pickle file written by the save step
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source
basepath = 'C:/Users/alexc/OneDrive/Documents/GitHub/Thesis/Data/processed/solar/'
pickle_path = basepath+"solar_data_p.pickle"
with open(pickle_path,"rb") as in_file:
    solar_data = pickle.load(in_file)

In [4]:
# Show the full contents of solar_data as plain text
print(solar_data)

[[1. 1. 1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 2. 1. 0.]
 [1. 1. 1. 2. 2. 1. 0.]
 [1. 1. 2. 2. 2. 1. 0.]
 [1. 1. 3. 2. 2. 1. 0.]
 [1. 2. 1. 1. 1. 1. 0.]
 [1. 2. 1. 1. 2. 1. 0.]
 [1. 2. 1. 2. 1. 1. 0.]
 [1. 2. 1. 2. 2. 1. 0.]
 [1. 2. 1. 2. 2. 2. 1.]
 [1. 2. 2. 2. 2. 1. 0.]
 [1. 3. 1. 1. 1. 1. 0.]
 [1. 3. 1. 1. 2. 1. 0.]
 [1. 3. 1. 2. 1. 1. 0.]
 [1. 3. 1. 2. 2. 1. 0.]
 [1. 3. 1. 2. 2. 2. 2.]
 [2. 1. 1. 1. 2. 1. 0.]
 [2. 1. 1. 2. 2. 1. 0.]
 [2. 1. 3. 2. 2. 1. 5.]
 [2. 2. 1. 1. 1. 1. 0.]
 [2. 2. 1. 1. 2. 1. 0.]
 [2. 2. 1. 2. 2. 1. 0.]
 [2. 2. 1. 2. 2. 2. 1.]
 [2. 2. 2. 1. 2. 1. 0.]
 [2. 2. 2. 2. 2. 1. 0.]
 [2. 2. 2. 2. 2. 2. 4.]
 [2. 2. 3. 2. 2. 1. 0.]
 [2. 2. 3. 2. 2. 2. 0.]
 [2. 3. 1. 1. 2. 1. 2.]
 [2. 3. 1. 2. 2. 1. 2.]
 [2. 3. 1. 2. 2. 2. 0.]
 [2. 3. 2. 1. 2. 1. 1.]
 [2. 3. 2. 2. 2. 1. 0.]
 [2. 3. 3. 1. 2. 1. 0.]
 [2. 3. 3. 2. 2. 1. 2.]
 [2. 3. 3. 2. 2. 2. 6.]]


# Investigation of database normalisation

In [4]:
# Build a version of the data set in which the norm of every record is bounded by 1
# (presumably this is what boundrec=True requests - confirm against DataLoader.load)
path = 'C:/Users/alexc/OneDrive/Documents/GitHub/Thesis/Data/raw/solar/flare.data.2.txt'
targets = [10]
features = list(range(3,9))
loader = DataLoader()
loader.load(path,unique=True,boundrec=True,feature_indices=features,target_indices=targets)
solar_data_bound = loader.data
print(solar_data_bound)

[[0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.         0.40824829 0.
  0.        ]
 [0.         0.         0.         0.40824829 0.40824829 0.
  0.        ]
 [0.         0.         0.20412415 0.40824829 0.40824829 0.
  0.        ]
 [0.         0.         0.40824829 0.40824829 0.40824829 0.
  0.        ]
 [0.         0.20412415 0.         0.         0.         0.
  0.        ]
 [0.         0.20412415 0.         0.         0.40824829 0.
  0.        ]
 [0.         0.20412415 0.         0.40824829 0.         0.
  0.        ]
 [0.         0.20412415 0.         0.40824829 0.40824829 0.
  0.        ]
 [0.         0.20412415 0.         0.40824829 0.40824829 0.40824829
  1.        ]
 [0.         0.20412415 0.20412415 0.40824829 0.40824829 0.
  0.        ]
 [0.         0.40824829 0.         0.         0.         0.
  0.        ]
 [0.         0.40824829 0.         0.         0.40824829 0.
  0.        ]
 [0.         0.40824829 0.    

Normalising the data such that the norm of each record is bounded by 1 has some undesirable effects given this data set.

1. For example, in the 5th record, features 3 and 4 are at the same level despite being different before the transformation was applied (this is all the more obvious if we look at the final row). I am not sure how this would affect the quality of the model predictions.

2. A lot of the features are also set to zero by this normalisation, which, again, raises concerns about whether the data should be transformed in this way.

3. It is not obvious how to normalise the targets - would limiting them to the [-1,1] interval be an option?

Next steps:
 - Check empirical covariance matrices for both options and see if they are well-conditioned
 - Normalise the targets to [-1,1]
 - Select a testing set (10 samples) and a training set (26 samples) and fit simple regression models
 to both, see how the results compare and if the transformation makes sense

In [5]:
# Test whether the empirical covariance matrix is invertible (transformed data set)
# NOTE(review): X.T @ X is the uncentred, unscaled Gram matrix over every column of
# solar_data_bound - confirm whether centring, scaling by the sample count, or dropping the
# target column is intended before interpreting it as a covariance matrix
cov_emp = solar_data_bound.T@solar_data_bound
# np.linalg.inv raises numpy.linalg.LinAlgError when cov_emp is exactly singular
precision_emp = np.linalg.inv(cov_emp)
# Display the precision matrix (the inverse computed above)
print(precision_emp)
# TODO: calculate eigenvalues and eigenvectors of the empirical covariance matrix

[[ 0.91965364 -0.21531089 -0.29924656  0.14121472 -0.40219762 -0.03126052
  -0.0212258 ]
 [-0.21531089  0.93265078  0.11541697 -0.11581323 -0.31733734 -0.0903473
  -0.00614579]
 [-0.29924656  0.11541697  1.49402244 -0.22969654 -0.19404047  0.26630593
  -0.04997807]
 [ 0.14121472 -0.11581323 -0.22969654  0.85990531 -0.46392135 -0.31240036
  -0.00401983]
 [-0.40219762 -0.31733734 -0.19404047 -0.46392135  1.00654276 -0.01529334
   0.01141577]
 [-0.03126052 -0.0903473   0.26630593 -0.31240036 -0.01529334  1.49047014
  -0.06407051]
 [-0.0212258  -0.00614579 -0.04997807 -0.00401983  0.01141577 -0.06407051
   0.01920298]]
