In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import category_encoders as ce

In [2]:
# Read the SDSS dataset CSV hosted in the project's GitHub repository
# directly into a DataFrame (requires network access).

df0 = pd.read_csv(
    'https://raw.githubusercontent.com/SeanAntosiak/LS-DS6-Unit-2-Project/master/SDSSdataset.csv'
)

In [31]:
# Preview the first five rows of the raw dataframe.

df0.head(n=5)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512


In [72]:
# Preview only the rows whose class is QSO.
# NOTE: df1 is created by the feature-engineering cell that appears further
# down in this notebook, so that cell has to be run before this one.

df1.loc[df1['class'].eq('QSO')].head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,...,class,redshift,plate,mjd,fiberid,uRS,gRS,rRS,iRS,zRS
15,1.23765e+18,184.350647,0.20723,18.73832,18.60962,18.39696,18.31174,17.97663,752,301,...,QSO,0.271937,287,52023,587,5.095641,5.060642,5.002812,4.979638,4.888509
17,1.23765e+18,184.245664,0.198257,19.22143,19.30248,19.13823,19.11351,19.23454,752,301,...,QSO,1.178098,287,52023,583,22.644728,22.740213,22.54671,22.517588,22.660173
22,1.23765e+18,184.65417,0.122673,19.07731,18.64518,18.49678,18.52677,18.45765,752,301,...,QSO,0.925173,288,52000,421,17.649818,17.250023,17.112727,17.140473,17.076525
45,1.23765e+18,185.164376,0.074358,17.55001,17.42367,17.35734,17.19343,16.97589,752,301,...,QSO,2.044347,288,52000,516,35.87831,35.620027,35.484426,35.149337,34.70461
62,1.23765e+18,160.384806,-0.586705,19.52565,19.29625,18.95885,18.52108,18.57694,756,301,...,QSO,0.134643,275,51910,281,2.628988,2.598101,2.552673,2.49373,2.501251


In [54]:
# Redshift values for QSO objects are much higher than for the other classes,
# while their intensity values are similar. To capture that, add one
# redshift-scaled feature per band: '<band>RS' = <band> * redshift.

cols = ['u','g','r','i','z']

# Work on a copy so the raw dataframe df0 is left untouched.
df1 = df0.copy()

# Iterate over the band list defined above.
for col in cols:
    df1[f'{col}RS'] = df1[col] * df1['redshift']

In [94]:
# Keep only the rows classified as QSO or GALAXY.
df2 = df1[df1['class'].isin(['QSO', 'GALAXY'])]

In [95]:
# Sanity check: the filtered frame should show the new *RS feature columns.

df2.head(n=5)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,...,class,redshift,plate,mjd,fiberid,uRS,gRS,rRS,iRS,zRS
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,...,GALAXY,0.123111,287,52023,513,2.386262,2.239601,2.15128,2.10364,2.068422
6,1.23765e+18,183.864379,0.019201,19.38322,17.88995,17.10537,16.66393,16.36955,752,301,...,GALAXY,0.100242,287,52023,559,1.943019,1.79333,1.714682,1.670431,1.640921
9,1.23765e+18,183.973498,0.081626,18.67249,17.71375,17.49362,17.28284,17.22644,752,301,...,GALAXY,0.040508,288,52000,400,0.756388,0.717551,0.708634,0.700096,0.697811
14,1.23765e+18,184.189574,0.099482,19.25667,17.54869,16.63578,16.14922,15.76639,752,301,...,GALAXY,0.072087,288,52000,389,1.388163,1.265039,1.199229,1.164155,1.136557
15,1.23765e+18,184.350647,0.20723,18.73832,18.60962,18.39696,18.31174,17.97663,752,301,...,QSO,0.271937,287,52023,587,5.095641,5.060642,5.002812,4.979638,4.888509


In [98]:
# Set the initial X features and y labels.
# Features: the five band columns, their redshift-scaled counterparts,
# and redshift itself.
# Swap df2 for df1 below to model all three class types instead of
# just GALAXY and QSO.

X0 = df2.loc[:, [
    'u', 'g', 'r', 'i', 'z',
    'uRS', 'gRS', 'rRS', 'iRS', 'zRS',
    'redshift',
]]

y0 = df2.loc[:, 'class']

In [99]:
# First split: hold out 10% of the data as the test set.
X0train, X0test, y0train, y0test = tts(
    X0,
    y0,
    train_size=0.90,
    test_size=0.10,
    random_state=8,
)

# Second split: carve a validation set (10%) out of the remaining training data.
# Note that the validation labels are named y0val; they pair with X1val.
X1train, X1val, y1train, y0val = tts(
    X0train,
    y0train,
    train_size=0.90,
    test_size=0.10,
    random_state=8,
)

In [100]:
# Majority-class baseline: the share of each class among the labels in y0.

y0.value_counts(normalize=True)

# With df2 (GALAXY vs QSO only), always predicting GALAXY gives roughly 85%
# accuracy, so that is the score a model has to beat.
# NOTE(review): the earlier "almost 50%" figure presumably applied when all
# three classes from df1 were used -- confirm before relying on it.

GALAXY    0.854651
QSO       0.145349
Name: class, dtype: float64

In [101]:
# Fit a logistic regression model on the training split and report its
# accuracy on the validation split (X1val / y0val).
# solver, multi_class and max_iter are set explicitly to silence warnings.

mod0 = LogReg(
    solver='lbfgs',
    multi_class='auto',
    max_iter=1000,
).fit(X1train, y1train)

mod0.score(X1val, y0val)


0.9772296015180265

In [102]:
# Fit a basic RandomForestClassifier on the training split and report its
# accuracy on the validation split (X1val / y0val).
# random_state is fixed (same seed as the splits above) so the score and the
# feature importances are reproducible from run to run.

mod1 = RandomForestClassifier(
    max_depth=20,
    n_estimators=1000,
    n_jobs=-1,        # use all available cores
    random_state=8,
)
mod1.fit(X1train, y1train)
mod1.score(X1val, y0val)


0.9734345351043643

In [103]:
# Show the random forest's feature importances, labelled by feature name.

importance = mod1.feature_importances_
pd.Series(data=importance, index=X1val.columns)

u           0.012197
g           0.009846
r           0.013984
i           0.047152
z           0.023891
uRS         0.087129
gRS         0.195942
rRS         0.183465
iRS         0.144371
zRS         0.153250
redshift    0.128775
dtype: float64