In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder

# Use plotly's "simple_white" template as the default for every plotly figure.
pio.templates.default = "simple_white"

In [None]:
# Load the star-classification table (CSV obtained from Kaggle) from the
# working directory.
data_read = pd.read_csv(filepath_or_buffer="star_classification.csv")

In [None]:
# Work on an independent copy so data_read stays available, unmodified,
# for reference.
master_df = data_read.copy(deep=True)

In [None]:
# Give the predictor columns of interest descriptive labels.
column_labels = {
    "u": "Ultraviolet",
    "g": "Green",
    "r": "Red",
    "i": "Near_Infrared",
    "z": "Infrared",
    "redshift": "Redshift",
}
master_df = master_df.rename(columns=column_labels)

In [None]:
# Check for missing values: info() prints each column's dtype and non-null
# count, so any column whose count is below the row total has gaps.
master_df.info()

In [None]:
# Display summary statistics for the columns to get a first look at their
# distributions and to spot implausible minimum/maximum values.
master_df.describe()

In [None]:
# Remove the columns that are not used in the analysis.
unused_columns = [
    "alpha", "delta", "run_ID", "rerun_ID", "cam_col",
    "field_ID", "spec_obj_ID", "plate", "fiber_ID",
]
master_df = master_df.drop(columns=unused_columns)

In [None]:
# Show the row(s) holding the smallest Ultraviolet value, to locate the outlier.
ultraviolet_min = master_df["Ultraviolet"].min()
master_df.loc[master_df["Ultraviolet"] == ultraviolet_min]

In [None]:
#drop the outlier from the master set
# 79543 is an index label (presumably the row surfaced by the
# minimum-Ultraviolet lookup -- verify if the input file changes).
# errors="ignore" keeps the cell idempotent: re-running it after the row has
# already been removed is a no-op instead of raising KeyError.
master_df = master_df.drop(index=79543, errors="ignore")

In [None]:
# Count the distinct obj_ID values (a missing value, if any, counts once).
master_df["obj_ID"].nunique(dropna=False)

In [None]:
# Order the rows by modified Julian date, earliest first.
master_df = master_df.sort_values(by="MJD", ascending=True)

In [None]:
# Collect every repeat occurrence of an obj_ID (all rows after the first
# one for that obj_ID in the current row order) into its own frame.
is_repeat_observation = master_df["obj_ID"].duplicated(keep="first")
duplicates_df = master_df[is_repeat_observation]

In [None]:
# Keep only the first row for each obj_ID: the frame of initial observations.
initials_df = master_df.drop_duplicates(subset="obj_ID", keep="first")

In [None]:
# Row count of the initial-observation frame (for the tally check).
initials_df.shape[0]

In [None]:
# Row count of the repeat-observation frame (for the tally check).
duplicates_df.shape[0]

In [None]:
#check the lengths tally
# Sum the two row counts; the total should equal len(master_df).
# (Adding the frames themselves with `+` performs index-aligned element-wise
# arithmetic and builds a throwaway frame, which is not a row count.)
len(duplicates_df) + len(initials_df)

In [None]:
#check if the predictor variables fit a Gaussian distribution
# For each predictor column, fit a normal distribution to the data and run a
# one-sample Kolmogorov-Smirnov test against that fitted distribution.
# NOTE(review): the normal parameters are estimated from the same sample that
# is tested, so the reported p-values are approximate rather than exact.
test_distribution = stats.norm

# Collect one record per variable and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
gaussian_records = []

for i in initials_df.drop(["obj_ID","MJD","class"],axis=1).columns:
    vector = initials_df[i]
    # (loc, scale) of the best-fitting normal distribution for this column
    parameters = test_distribution.fit(vector)
    test_stat, p_value = stats.kstest(vector, "norm", args=parameters)

    gaussian_records.append({
        "Variable": i,
        "Statistic": round(test_stat, 5),
        "P-Value": round(p_value, 5),
    })

guassian_analysis_df = pd.DataFrame(gaussian_records,
                                    columns=["Variable","Statistic","P-Value"])
guassian_analysis_df

In [None]:
# Pairwise scatter/distribution grid of the predictors, coloured by class;
# corner=True draws only the lower triangle.
pairplot_frame = initials_df.drop(columns=["obj_ID", "MJD"])
sns.pairplot(pairplot_frame, hue="class", corner=True)

In [None]:
#create box plots of all interesting predictor variables
# One plotly box plot per predictor, split and coloured by class.
# The figures are drawn from initials_df -- the same frame whose columns drive
# the loop and that the other exploratory cells use -- so repeat observations
# of an object are not counted twice.
for i in initials_df.drop(["obj_ID","MJD","class"],axis=1).columns:
    fig = px.box(initials_df,
                 y=i,
                 x="class",
                 color="class",
                 height=640,
                 width=640)

    # Draw a full frame around the plot area (mirrored axis lines).
    fig.update_xaxes(showline=True, linewidth=1, mirror=True,title="Class")
    fig.update_yaxes(showline=True, linewidth=1, mirror=True)
    # Marker styling applies to the outlier points drawn beyond the whiskers.
    fig.update_traces(marker={"size": 15,
                             "symbol": 134,
                             "line_width": 3})

    # Horizontal, bordered legend centred just above the plot area.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
        title="Class:",
        bordercolor="black",
        borderwidth=1))

    fig.show()

In [None]:
# Draw one Spearman rank-correlation heat map of the predictors per class.
for c in initials_df["class"].unique():
    class_rows = initials_df.loc[initials_df["class"] == c]
    corr_df = class_rows.drop(columns=["obj_ID", "MJD", "class"]).corr(method="spearman")

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.set_title(f"Spearman rank correlations for class: {c}", fontname="Calibri", fontsize=14)

    # Fixed colour range of [-1, 1] so the per-class figures are comparable.
    heatmap = ax.matshow(corr_df, cmap="viridis", vmin=-1, vmax=1)
    fig.colorbar(heatmap, label="Spearman rank correlation")

    # One tick per variable on both axes, labelled with the column names.
    tick_positions = np.arange(len(corr_df.columns))

    ax.set_xticks(tick_positions)
    plt.xticks(rotation=90)
    ax.set_xticklabels(corr_df.columns)

    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr_df.columns)

    plt.show()

In [None]:
#complete Kruskal-Wallis test to check all predictor variables are of value
# For each predictor, test whether its values differ between the classes.
predictor_variables = ["Ultraviolet","Green","Red","Near_Infrared","Infrared","Redshift"]

# Collect one record per variable and build the frame once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
kw_records = []

for i in predictor_variables:
    #map each class label to that class's values of the current predictor
    class_dict = {
        j: initials_df.loc[initials_df["class"] == j, i].values
        for j in initials_df["class"].unique()
    }

    #run the Kruskal-Wallis test with one sample per class
    test_stat, p_value = stats.kruskal(*class_dict.values())

    kw_records.append({
        "Variable": i,
        "Statistic": test_stat,
        "P-Value": p_value,
    })

kw_df = pd.DataFrame(kw_records, columns=["Variable","Statistic","P-Value"])
kw_df

In [None]:
# Learn the class-label -> ordinal-code mapping from the master dataframe.
enc = OrdinalEncoder()
enc.fit(master_df[["class"]])

In [None]:
# Training set: the initial observations without the obj_ID and MJD columns.
training_data = initials_df.drop(columns=["obj_ID", "MJD"])

In [None]:
# Transform the class column from string labels to their ordinal codes.
encoded_training_class = enc.transform(training_data[["class"]])
training_data[["class"]] = encoded_training_class

In [None]:
# Test set: the repeat observations without the obj_ID and MJD columns.
test_data = duplicates_df.drop(columns=["obj_ID", "MJD"])

In [None]:
# Transform the class column from string labels to their ordinal codes.
encoded_test_class = enc.transform(test_data[["class"]])
test_data[["class"]] = encoded_test_class

In [None]:
# Write the encoded training and test sets to CSV files in the working
# directory (the frame index is written as the first column).
training_data.to_csv(path_or_buf="stellar_training_data.csv")
test_data.to_csv(path_or_buf="stellar_test_data.csv")