# Stellar Classification Dataset - SDSS17

### Classification of Stars, Galaxies and Quasars. Sloan Digital Sky Survey DR17

https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17

![image.png](../Images/Stellar.png)

### Library & Data Import

In [None]:
import numpy as np
import pandas as pd

In [2]:
# Read the train/test feature tables and the training labels from the dataset folder.
X_train, X_test, y_train = map(
    pd.read_csv,
    (
        '../Datasets/Stellar_X_train.csv',
        '../Datasets/Stellar_X_test.csv',
        '../Datasets/Stellar_y_train.csv',
    ),
)

### 1. Data Exploration

In [3]:
# Preview the training labels read from Stellar_y_train.csv.
y_train

Unnamed: 0,galaxy
0,1
1,0
2,1
3,0
4,1
...,...
69995,0
69996,1
69997,0
69998,1


### 2. Data Preprocessing

#### (1) Missing Value

In [4]:
# Remove outliers: rows whose `u` value equals -9999 are dropped.
# The mask is built once from the unfiltered X_train and applied to both tables,
# so features and labels lose exactly the same rows.
_valid_rows = X_train.u != -9999
y_train = y_train.loc[_valid_rows]
X_train = X_train.loc[_valid_rows]

In [5]:
# Renumber the remaining rows from 0 in both tables; the previous index labels are discarded.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

### 3. Data Modeling

#### (1) Data Split

In [None]:
# Columns handled as continuous numeric features (scaled later).
COL_NUM = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']
# Identifier-like columns handled as categorical features (label-encoded later).
COL_CAT = ['run_ID', 'rerun_ID', 'cam_col', 'field_ID', 'plate']

# Cast the categorical columns to `object` dtype in both feature tables.
for _frame in (X_train, X_test):
    _frame[COL_CAT] = _frame[COL_CAT].astype('object')

#### (2) Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# Target vector: the `galaxy` column of the label table.
y_TRAIN = y_train['galaxy']

# Stack train and test features so each encoder is fitted on every value that occurs
# in either table (presumably so values seen only in X_test can still be transformed).
X = pd.concat([X_train, X_test])

for _col in COL_CAT:
    # One encoder per categorical column, fitted on the combined values.
    lbe = LabelEncoder().fit(X[_col])
    # Replace the raw values with their integer codes in both tables.
    X_train[_col] = lbe.transform(X_train[_col])
    X_test[_col] = lbe.transform(X_test[_col])

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the cleaned training rows for validation, with a fixed seed.
# Stratify on the target Series that is actually being split (y_TRAIN), not on the
# whole y_train DataFrame: any additional column in that frame would become part of
# the stratification key and could make the strata meaningless or the split fail.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_train,
    y_TRAIN,
    test_size=0.25,
    stratify=y_TRAIN,
    random_state=1234,
)

print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(52499, 13) (17500, 13) (52499,) (17500,)


#### (3) Scaling

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Learn the min/max of each numeric column from the training split only,
# then apply that same transformation to both the training and validation splits.
scaler = MinMaxScaler().fit(xtrain[COL_NUM])

for _split in (xtrain, xtest):
    _split[COL_NUM] = scaler.transform(_split[COL_NUM])

### 4. Modeling

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [13]:
def make_models(xtrain, xtest, ytrain, ytest):
    """Fit a series of candidate classifiers and print one score line per model.

    Candidates, in order: logistic regression ('model1'); a decision tree with
    default depth and then depths 3-7 ('model2'); a random forest with default
    settings and then 500 trees at depths 3-7 ('model3'); XGBoost ('model4').

    Args:
        xtrain, ytrain: features and labels used to fit each model.
        xtest, ytest: held-out features and labels passed to ``get_scores``.

    Returns:
        None. Each line printed is the model tag, the depth (when one is set),
        and the string returned by ``get_scores``.
    """
    def _candidates():
        # Yield (print-label parts, unfitted estimator) lazily so that only one
        # fitted model is alive at a time.
        yield ('model1',), LogisticRegression(solver='lbfgs', max_iter=1000)

        yield ('model2',), DecisionTreeClassifier(random_state=0)
        for depth in range(3, 8):
            yield ('model2', depth), DecisionTreeClassifier(max_depth=depth, random_state=0)

        yield ('model3',), RandomForestClassifier(random_state=0)
        for depth in range(3, 8):
            yield ('model3', depth), RandomForestClassifier(
                n_estimators=500, max_depth=depth, random_state=0
            )

        yield ('model4',), XGBClassifier(eval_metric='logloss')

    for tag, estimator in _candidates():
        fitted = estimator.fit(xtrain, ytrain)
        print(*tag, get_scores(fitted, xtrain, xtest, ytrain, ytest))

### 5. Model Evaluation

In [14]:
from sklearn.metrics import roc_auc_score

def get_scores(model, xtrain, xtest, ytrain, ytest):
    """Summarise a fitted classifier as '<train score> <test ROC AUC>'.

    Args:
        model: fitted estimator exposing ``score`` and ``predict_proba``.
        xtrain, ytrain: data passed to ``model.score`` (the training-side score).
        xtest, ytest: held-out data used for the ROC AUC.

    Returns:
        A string with both numbers formatted to 4 significant digits,
        separated by a single space.
    """
    train_score = model.score(xtrain, ytrain)

    # Second column of predict_proba (index 1), presumably the probability of class 1.
    test_proba = model.predict_proba(xtest)[:, 1]
    test_auc = roc_auc_score(ytest, test_proba)

    return f'{train_score:.4} {test_auc:.4}'

In [15]:
# Fit every candidate model on the training split and print its scores.
make_models(xtrain, xtest, ytrain, ytest)

model1 0.7893 0.8198
model2 1.0 0.9617
model2 3 0.9474 0.9623
model2 4 0.9615 0.972
model2 5 0.9618 0.9804
model2 6 0.9668 0.9856
model2 7 0.9747 0.9867
model3 1.0 0.9946
model3 3 0.9398 0.9823
model3 4 0.9558 0.9861
model3 5 0.9603 0.9887
model3 6 0.9659 0.9903
model3 7 0.9715 0.9918
model4 0.9914 0.9953


### 6. Save Result

In [None]:
# Selected model: a random forest limited to depth 4, fitted on the training split.
final_model = RandomForestClassifier(max_depth=4, random_state=0)
final_model.fit(xtrain, ytrain)

print('final model', get_scores(final_model, xtrain, xtest, ytrain, ytest))

In [None]:
# The model was trained on MinMax-scaled numeric columns, so the test features must go
# through the same fitted scaler before prediction. Work on a copy so X_test itself is
# left untouched and re-running this cell cannot scale the data twice.
X_test_scaled = X_test.copy()
X_test_scaled[COL_NUM] = scaler.transform(X_test_scaled[COL_NUM])

# Predict with the fitted `final_model` (the previously referenced `model_rf` is never
# defined in this notebook); keep the second predict_proba column as the target score.
pred = final_model.predict_proba(X_test_scaled)[:, 1]

# Write one row per test sample: its index label and the predicted score.
pd.DataFrame({'index': X_test.index, 'target': pred}).to_csv('./result.csv', index=False)