# SkySort

## Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Read SDSS_DR18 Data from the data folder into sdss_data

In [2]:
# Load the SDSS DR18 table from the data folder (path is relative to this notebook).
sdss_csv_path = "../data/SDSS_DR18.csv"
sdss_data = pd.read_csv(sdss_csv_path)

#### Manipulate the sdss_data

In [3]:
# Preview the first five rows to sanity-check the load.
sdss_data.head(n=5)

Unnamed: 0,objid,specobjid,ra,dec,u,g,r,i,z,run,...,psfMag_g,psfMag_i,psfMag_z,expAB_u,expAB_g,expAB_r,expAB_i,expAB_z,redshift,class
0,1.24e+18,3.24e+17,184.950869,0.733068,18.87062,17.59612,17.11245,16.83899,16.70908,756,...,19.96352,19.25145,19.0523,0.479021,0.518483,0.520474,0.508502,0.488969,0.041691,GALAXY
1,1.24e+18,3.25e+17,185.729201,0.679704,19.5956,19.92153,20.34448,20.66213,20.59599,756,...,19.92417,20.65535,20.57387,0.573926,0.531728,0.403072,0.999874,0.189495,-0.000814,STAR
2,1.24e+18,3.24e+17,185.68769,0.82348,19.26421,17.87891,17.09593,16.65159,16.35329,756,...,19.33645,18.16669,17.78844,0.701666,0.743386,0.770897,0.778642,0.736771,0.113069,GALAXY
3,1.24e+18,2.88e+18,185.677904,0.768362,19.49739,17.96166,17.41269,17.20545,17.11567,756,...,17.96176,17.21564,17.12367,0.999818,0.78776,0.745611,0.399718,0.986137,8.7e-05,STAR
4,1.24e+18,2.88e+18,185.814763,0.77694,18.31519,16.83033,16.26352,16.0632,15.97527,756,...,16.85104,16.08275,15.98694,0.999795,0.83445,0.723526,0.712259,0.527055,1.8e-05,STAR


In [4]:
# List every column name so we can decide which ones to keep for training.
sdss_data.columns

Index(['objid', 'specobjid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run',
       'rerun', 'camcol', 'field', 'plate', 'mjd', 'fiberid', 'petroRad_u',
       'petroRad_g', 'petroRad_i', 'petroRad_r', 'petroRad_z', 'petroFlux_u',
       'petroFlux_g', 'petroFlux_i', 'petroFlux_r', 'petroFlux_z',
       'petroR50_u', 'petroR50_g', 'petroR50_i', 'petroR50_r', 'petroR50_z',
       'psfMag_u', 'psfMag_r', 'psfMag_g', 'psfMag_i', 'psfMag_z', 'expAB_u',
       'expAB_g', 'expAB_r', 'expAB_i', 'expAB_z', 'redshift', 'class'],
      dtype='object')

As we can see from the column names, to classify whether an object is a star, galaxy, or quasar, we will use the five SDSS filters (u, g, r, i, z) as features and the class column as the label. We will pull the u, g, r, i, z and class columns into a separate data frame to use for training the model.

In [5]:
# The five SDSS filter columns followed by the label column.
columns_needed = list("ugriz") + ["class"]

In [6]:
# Keep only the filter columns and the label; all rows are retained.
cleaned_data = sdss_data.loc[:, columns_needed]
cleaned_data

Unnamed: 0,u,g,r,i,z,class
0,18.87062,17.59612,17.11245,16.83899,16.70908,GALAXY
1,19.59560,19.92153,20.34448,20.66213,20.59599,STAR
2,19.26421,17.87891,17.09593,16.65159,16.35329,GALAXY
3,19.49739,17.96166,17.41269,17.20545,17.11567,STAR
4,18.31519,16.83033,16.26352,16.06320,15.97527,STAR
...,...,...,...,...,...,...
99995,19.39861,18.35476,18.00348,17.89408,17.81222,STAR
99996,19.07703,18.05159,17.78332,17.68976,17.66209,STAR
99997,19.07982,17.51349,16.64037,16.24183,15.91180,GALAXY
99998,17.27528,16.41704,16.11662,15.98858,15.97745,STAR


In [7]:
# Count how many objects fall into each class.
# Keep the resulting Series in class_count (wrapping the call in print() would
# bind None instead) and let the cell's last expression display it.
class_count = sdss_data['class'].value_counts()
class_count

GALAXY    52343
STAR      37232
QSO       10425
Name: class, dtype: int64


In [11]:
# Model inputs are the five filter columns; the label to predict is the class column.
filter_columns = ['u', 'g', 'r', 'i', 'z']
features = cleaned_data[filter_columns]
target = cleaned_data['class']

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced --
# consider passing stratify=target (this would change the reported scores).
split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
features_train, features_test, target_train, target_test = split_parts

In [13]:
# Random forest with 100 estimators; random_state is fixed so training is repeatable.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [15]:
# Train the classifier on the training portion of the split.
model.fit(X=features_train, y=target_train)

In [16]:
# Predict a class label for every row of the held-out test features.
target_pred = model.predict(X=features_test)

In [17]:
# Evaluate the held-out predictions: overall accuracy first, then the per-class
# precision / recall / F1 table. accuracy_score and classification_report come
# from the notebook's import cell rather than being imported here.
print("Accuracy:", accuracy_score(target_test, target_pred))
print(classification_report(target_test, target_pred))


Accuracy: 0.95515
              precision    recall  f1-score   support

      GALAXY       0.97      0.96      0.96     10373
         QSO       0.93      0.91      0.92      2115
        STAR       0.94      0.96      0.95      7512

    accuracy                           0.96     20000
   macro avg       0.95      0.94      0.95     20000
weighted avg       0.96      0.96      0.96     20000

