# Task 3-1

## Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer, KNNImputer

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()

## Utility Functions

In [2]:
def is_number(num_str):
    """Return True if ``num_str`` can be converted by ``float()``, else False.

    Anything ``float()`` accepts counts as a number (e.g. "22", "0.5", and also
    "nan" / "inf"). An empty string is not a number.

    Parameters
    ----------
    num_str : object
        Value to test; typically a string read from a CSV cell.

    Returns
    -------
    bool
    """
    try:
        float(num_str)
    except (TypeError, ValueError):
        # ValueError: text that does not parse as a float.
        # TypeError: objects float() cannot take at all (None, lists, ...).
        return False
    return True

# Element-wise versions of is_number for NumPy arrays; both return boolean arrays.
# otypes is pinned to bool so the wrappers also work on size-0 arrays: without it
# np.vectorize infers the output type by calling the function on the first
# element and raises ValueError when there is none.
vec_isalpha = np.vectorize(lambda x: not is_number(x), otypes=[bool])
vec_isnum = np.vectorize(is_number, otypes=[bool])

## Code

In [3]:
# Read star_assessment.csv (path relative to the working directory) into a DataFrame.
my_ds = pd.read_csv("star_assessment.csv")

In [4]:
# Display the loaded frame to eyeball its columns and values.
my_ds

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID,class
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606.0,301.0,2.0,79.0,6.543777e+18,0.634794,5812.0,56354.0,171.0,GALAXY
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518.0,301.0,5.0,119.0,1.176014e+19,0.779136,10445.0,58158.0,427.0,GALAXY
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606.0,301.0,2.0,120.0,5.152200e+18,0.644195,4576.0,55592.0,299.0,GALAXY
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192.0,301.0,3.0,214.0,1.030107e+19,0.932346,9149.0,58039.0,775.0,GALAXY
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102.0,301.0,3.0,137.0,6.891865e+18,0.116123,6121.0,56187.0,842.0,GALAXY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778.0,301.0,2.0,581.0,1.055431e+19,0.000000,9374.0,57749.0,438.0,GALAXY
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917.0,301.0,1.0,289.0,8.586351e+18,0.404895,7626.0,56934.0,866.0,GALAXY
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314.0,301.0,4.0,308.0,3.112008e+18,0.143366,2764.0,54535.0,74.0,GALAXY
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650.0,301.0,4.0,131.0,7.601080e+18,0.455040,6751.0,56368.0,470.0,GALAXY


In [5]:
# List the column labels of my_ds.
my_ds.columns

Index(['obj_ID', 'alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'run_ID',
       'rerun_ID', 'cam_col', 'field_ID', 'spec_obj_ID', 'redshift', 'plate',
       'MJD', 'fiber_ID', 'class'],
      dtype='object')

In [6]:
def load_dataset(ds_name):
    """Load ``<ds_name>.csv`` as a NumPy array of strings.

    The first line of the file is treated as a header and skipped. Every
    field is kept as text and has leading/trailing whitespace removed; empty
    fields come back as ``""``.

    Parameters
    ----------
    ds_name : str
        Path of the CSV file without the ``.csv`` extension.

    Returns
    -------
    numpy.ndarray
        String array of the parsed fields (header row excluded).
    """
    ds = np.genfromtxt(
        f"{ds_name}.csv",
        delimiter=",",
        skip_header=1,
        dtype="str",
        # genfromtxt treats '#' as a comment marker by default, which would cut
        # off any row containing that character; CSV data has no comments.
        comments=None,
    )

    # Strip whitespace from start and end of all elements
    return np.char.strip(ds)

def clean_dataset(ds, check_cols=None):
    """Drop every row that has an empty string in one of the checked columns.

    Parameters
    ----------
    ds : array-like
        2-D array of strings in which a missing value is the empty string.
    check_cols : sequence of int, numpy.ndarray, int or None, optional
        Column indices to inspect. ``None`` or an empty sequence means
        "inspect every column". A single integer selects that one column.

    Returns
    -------
    numpy.ndarray
        New array containing only the rows with no empty string in the
        inspected columns; the input is not modified.
    """
    ds = np.asarray(ds)

    # Normalise the column selection up front. Testing truthiness of the raw
    # argument (``not check_cols``) raises for multi-element ndarrays and
    # misreads ``np.array([0])`` as "nothing given".
    cols = None if check_cols is None else np.atleast_1d(check_cols)

    if cols is None or cols.size == 0:
        inspected = ds
    else:
        inspected = ds[:, cols]

    # Create a mask to find rows with missing values in the inspected columns
    missing_rows_mask = np.any(inspected == "", axis=1)

    # Boolean indexing returns a fresh array, so the caller's data stays intact
    return ds[~missing_rows_mask]

def numerise_dataset(encoded_ds):
    """Convert an array of numeric strings to floats, with blanks becoming NaN.

    Parameters
    ----------
    encoded_ds : numpy.ndarray
        Array whose entries are either float-parsable text or the empty string.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape; every ``""`` entry is ``np.nan``.
    """
    # Locate the blanks first, then swap them for NaN before the float cast
    # (an empty string cannot be parsed as a float).
    is_blank = encoded_ds == ""
    with_nans = np.where(is_blank, np.nan, encoded_ds)
    return with_nans.astype(float)

def encode_dataset(clean_ds):
    """Ordinal-encode every column whose entries are all non-numeric.

    A column is encoded only when none of its entries can be parsed as a
    float; all other columns are left untouched.

    Parameters
    ----------
    clean_ds : numpy.ndarray
        2-D dataset array; it is copied, not modified.

    Returns
    -------
    tuple
        ``(encoded, encodings)`` where ``encoded`` is the copy with the
        encoded columns replaced by their ordinal codes, and ``encodings``
        maps each encoded column index to the array of categories found in
        that column (position in the array == code).
    """
    result = np.copy(clean_ds)
    category_map = {}

    for col_idx in range(result.shape[1]):
        # Skip columns that hold at least one float-parsable entry.
        # NOTE(review): an empty string is not float-parsable, so blanks count
        # as "non-numeric" here and would be encoded as their own category.
        if not np.all(vec_isalpha(result[:, col_idx])):
            continue

        # OrdinalEncoder expects 2-D input, hence the (n, 1) column slice.
        column = result[:, [col_idx]]
        encoder = OrdinalEncoder(categories="auto", dtype=float).fit(column)

        # Write the codes back in place; the assignment casts them to the
        # array's own dtype.
        result[:, col_idx] = encoder.transform(column).ravel()

        # Remember which category each code stands for.
        category_map[col_idx] = encoder.categories_[0]

    return result, category_map

def impute_dataset(numerised_ds, imp_choice="simple"):
    """Fill the NaN entries of a numeric dataset with a scikit-learn imputer.

    Parameters
    ----------
    numerised_ds : numpy.ndarray
        2-D float array in which missing values are ``np.nan``. It is copied
        before imputation, so the caller's array is left unchanged.
    imp_choice : {"simple", "knn", "iterative"}, optional
        Which imputer to use:
        ``"simple"`` - ``SimpleImputer`` with the column mean;
        ``"knn"`` - ``KNNImputer`` with 2 neighbours;
        ``"iterative"`` - ``IterativeImputer`` with ``random_state=0``.

    Returns
    -------
    numpy.ndarray
        The imputer's ``fit_transform`` output.

    Raises
    ------
    ValueError
        If ``imp_choice`` is not one of the supported names.
    """
    # Initialise imputer
    if imp_choice == "simple":
        imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
    elif imp_choice == "knn":
        imputer = KNNImputer(n_neighbors=2)
    elif imp_choice == "iterative":
        # IterativeImputer is experimental in scikit-learn: it can only be
        # imported after the enabling module has been imported. Imported here
        # because the notebook's import cell does not bring it in.
        from sklearn.experimental import enable_iterative_imputer  # noqa: F401
        from sklearn.impute import IterativeImputer

        imputer = IterativeImputer(random_state=0)
    else:
        # Fail loudly instead of hitting an unbound `imputer` further down.
        raise ValueError(
            f"Unknown imp_choice {imp_choice!r}; "
            "expected 'simple', 'knn' or 'iterative'."
        )

    # Fit imputer and fill missing values (on a copy of the input)
    complete_ds = imputer.fit_transform(np.copy(numerised_ds))

    return complete_ds

In [14]:
def load_and_preprocess(ds_name, crit_cols, imp_choice):
    """Run the full pipeline: load, clean, encode, numerise and impute.

    Progress is reported with ``print``: the array shape after each stage,
    the category encodings, and for every column whether it contained NaN
    before and after imputation.

    Parameters
    ----------
    ds_name : str
        CSV path without the ``.csv`` extension (passed to ``load_dataset``).
    crit_cols : sequence of int or None
        Critical columns; rows with a blank in any of them are dropped
        (passed to ``clean_dataset``).
    imp_choice : str
        Imputation strategy name (passed to ``impute_dataset``).

    Returns
    -------
    numpy.ndarray
        The numeric dataset returned by ``impute_dataset``.
    """
    # Load dataset
    new_ds = load_dataset(ds_name)
    print(new_ds.shape)

    # Clean dataset
    clean_ds = clean_dataset(new_ds, crit_cols)
    print(clean_ds.shape)

    # Encode dataset
    encoded_ds, encodings = encode_dataset(clean_ds)
    print(encoded_ds.shape)
    print(encodings)

    # Numerise dataset
    numerised_ds = numerise_dataset(encoded_ds)
    print(numerised_ds.shape)

    # Impute missing values with the strategy the caller asked for
    # (previously the argument was dropped and "simple" was always used).
    complete_ds = impute_dataset(numerised_ds, imp_choice)

    # Per column: "had NaN before imputation", "still has NaN afterwards"
    num_cols = complete_ds.shape[1]
    for i in range(num_cols):
        print(f"\nColumn {i}:\n")
        print(np.any(np.isnan(numerised_ds[:, i])), np.any(np.isnan(complete_ds[:, i])))

    return complete_ds

In [15]:
# Run the pipeline on gwp_assessment.csv; rows with a blank in columns 0-3 are
# dropped before encoding. The commented-out call is the star dataset variant.
ds = load_and_preprocess("gwp_assessment", [0,1,2,3], "simple")
# ds = load_and_preprocess("star_assessment", [0,1,8,9,12,16,17], "simple")
ds

(1197, 15)
(1146, 15)
(1146, 15)
{0: array(['1/1/2015', '1/10/2015', '1/11/2015', '1/12/2015', '1/13/2015',
       '1/14/2015', '1/15/2015', '1/17/2015', '1/18/2015', '1/19/2015',
       '1/20/2015', '1/21/2015', '1/22/2015', '1/24/2015', '1/25/2015',
       '1/26/2015', '1/27/2015', '1/28/2015', '1/29/2015', '1/3/2015',
       '1/31/2015', '1/4/2015', '1/5/2015', '1/6/2015', '1/7/2015',
       '1/8/2015', '2/1/2015', '2/10/2015', '2/11/2015', '2/12/2015',
       '2/14/2015', '2/15/2015', '2/16/2015', '2/17/2015', '2/18/2015',
       '2/19/2015', '2/2/2015', '2/22/2015', '2/23/2015', '2/24/2015',
       '2/25/2015', '2/26/2015', '2/28/2015', '2/3/2015', '2/4/2015',
       '2/5/2015', '2/7/2015', '2/8/2015', '2/9/2015', '3/1/2015',
       '3/10/2015', '3/11/2015', '3/2/2015', '3/3/2015', '3/4/2015',
       '3/5/2015', '3/7/2015', '3/8/2015', '3/9/2015'], dtype='<U11'), 1: array(['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5'],
      dtype='<U11'), 2: array(['finishing', 'swe

array([[ 0.        ,  0.        ,  1.        , ...,  0.        ,
        59.        ,  0.94072542],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         8.        ,  0.8865    ],
       [ 0.        ,  0.        ,  1.        , ...,  0.        ,
        30.5       ,  0.80057049],
       ...,
       [51.        ,  1.        ,  0.        , ...,  0.        ,
         8.        ,  0.625625  ],
       [51.        ,  1.        ,  0.        , ...,  0.        ,
        15.        ,  0.50588889],
       [51.        ,  1.        ,  0.        , ...,  0.        ,
         6.        ,  0.39472222]])

In [16]:
# Shape (rows, columns) of the array returned by load_and_preprocess.
ds.shape

(1146, 15)

In [17]:
# Critical column indices used with load_and_preprocess for each dataset:
# gwp --> [0,1,2,3]
# star --> [0,1,8,9,12,16,17]
# Pairwise correlation of the numeric columns. numeric_only=True is explicit
# because my_ds also holds the text column "class"; relying on the default
# raises on pandas >= 2.0 and only warned before that.
my_ds.corr(numeric_only=True)

  my_ds.corr()


Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,redshift,plate,MJD,fiber_ID
obj_ID,1.0,-0.013466,-0.301041,0.01532,0.015711,0.153922,0.147566,0.013809,1.0,,-0.046947,0.031321,0.239473,0.065421,0.239458,0.262692,0.067023
alpha,-0.013466,1.0,0.138778,-0.001517,-0.002418,-0.022014,-0.023603,-0.002916,-0.013655,,0.019556,-0.165611,-0.002518,0.001894,-0.002438,0.019861,0.030541
delta,-0.301041,0.138778,1.0,0.002078,0.00354,-0.006658,-0.004404,0.003633,-0.301249,,0.032573,-0.173406,0.112352,0.031773,0.112485,0.10737,0.028451
u,0.01532,-0.001517,0.002078,1.0,0.999312,0.054118,0.04571,0.998095,0.015322,,0.003538,-0.00838,0.029988,0.014304,0.029988,0.031986,0.016301
g,0.015711,-0.002418,0.00354,0.999312,1.0,0.062349,0.056241,0.999162,0.015713,,0.003506,-0.008856,0.039425,0.022942,0.039422,0.040255,0.017463
r,0.153922,-0.022014,-0.006658,0.054118,0.062349,1.0,0.962861,0.053668,0.154012,,0.00825,-0.026301,0.655326,0.433234,0.655266,0.671285,0.223016
i,0.147566,-0.023603,-0.004404,0.04571,0.056241,0.962861,1.0,0.055968,0.147646,,0.007481,-0.026585,0.661702,0.492246,0.661648,0.672585,0.214722
z,0.013809,-0.002916,0.003633,0.998095,0.999162,0.053668,0.055968,1.0,0.013815,,0.003363,-0.008907,0.037799,0.030368,0.037792,0.037457,0.014664
run_ID,1.0,-0.013655,-0.301249,0.015322,0.015713,0.154012,0.147646,0.013815,1.0,,-0.047113,0.031407,0.239402,0.065477,0.239387,0.262639,0.066972
rerun_ID,,,,,,,,,,,,,,,,,


## Markdown Answer

In [50]:
# View the frame as a NumPy object array, for comparison with the string array.
# NOTE(review): the recorded output shows gwp-style rows (dates, quarters,
# departments), while the only visible assignment of my_ds reads
# star_assessment.csv. This cell appears to have been run against different
# kernel state, so re-run from a fresh kernel to confirm.
my_ds.to_numpy()

array([['1/1/2015', 'Quarter1', 'sweing', ..., 0.0, 59.0, 0.940725424],
       ['1/1/2015', 'Quarter1', 'finishing ', ..., 0.0, 8.0, 0.8865],
       ['1/1/2015', 'Quarter1', 'sweing', ..., 0.0, 30.5, 0.800570492],
       ...,
       ['3/11/2015', 'Quarter2', nan, ..., 0.0, 8.0, 0.625625],
       ['3/11/2015', 'Quarter2', 'finishing', ..., 0.0, 15.0,
        0.505888889],
       ['3/11/2015', 'Quarter2', 'finishing', ..., 0.0, 6.0, 0.394722222]],
      dtype=object)

In [51]:
# NOTE(review): in the visible code `new_ds` is only assigned as a local inside
# load_and_preprocess, so this cell relies on leftover kernel state and would
# raise NameError after Restart & Run All unless it is defined elsewhere.
new_ds

array([['1/1/2015', 'Quarter1', 'sweing', ..., '0', '59', '0.940725424'],
       ['1/1/2015', 'Quarter1', 'finishing ', ..., '0', '8', '0.8865'],
       ['1/1/2015', 'Quarter1', 'sweing', ..., '0', '30.5',
        '0.800570492'],
       ...,
       ['3/11/2015', 'Quarter2', '', ..., '0', '8', '0.625625'],
       ['3/11/2015', 'Quarter2', 'finishing', ..., '0', '15',
        '0.505888889'],
       ['3/11/2015', 'Quarter2', 'finishing', ..., '0', '6',
        '0.394722222']], dtype='<U11')

In [262]:
# Scratch check of str.isalpha on a single letter.
"a".isalpha()

True

In [292]:
# Scratch check: float() on an int returns the equivalent float.
float(22)

22.0

In [48]:
# Scratch check: np.any(np.isnan(...)) is False when no entry is NaN.
np.any(np.isnan([1,2]))

False