In [None]:
# The table results from a query which joins two tables:

# "PhotoObj" which contains photometric data
# "SpecObj" which contains spectral data.
# 17 numeric variables and 1 additional categorical variable, 'class'.
# An object's class can be predicted from the other 17 variables.

# Variables description:
# objid = Object Identifier
# ra = J2000 Right Ascension (r-band)
# dec = J2000 Declination (r-band)
# u = better of deV/Exp magnitude fit (u-band)
# g = better of deV/Exp magnitude fit (g-band)
# r = better of deV/Exp magnitude fit (r-band)
# i = better of deV/Exp magnitude fit (i-band)
# z = better of deV/Exp magnitude fit (z-band)
# run = Run Number
# rerun = Rerun Number
# camcol = Camera column
# field = Field number
# specobjid = Spectroscopic Object Identifier
# class = object class (galaxy, star or quasar object)
# redshift = Final Redshift
# plate = plate number
# mjd = MJD of observation
# fiberid = fiberID

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Load the SDSS table. The path is a raw string so the Windows backslashes
# are taken literally: "\d" and "\S" are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory.
sdss_dataset = pd.read_csv(r"D:\datasets\SDSS16.csv")

# Report how many missing values each column contains.
print('Number of NaN values for each feature:\n',sdss_dataset.isnull().sum())


In [None]:
# Display the table as loaded from the CSV.
sdss_dataset

In [None]:
# Drop the sky coordinates, object identifiers and observation bookkeeping
# columns so that only the columns used for modelling remain.
# `columns=` is used instead of the positional axis argument (`drop(labels, 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
sdss_dataset = sdss_dataset.drop(
    columns=['dec', 'ra', 'objid', 'run', 'rerun', 'camcol', 'field',
             'specobjid', 'plate', 'mjd', 'fiberid']
)

In [None]:
# Display the table after the coordinate/identifier/bookkeeping columns were dropped.
sdss_dataset

In [None]:
# Bar chart of how many rows belong to each object class.
sns.catplot(
    data=sdss_dataset,
    x="class",
    kind="count",
    palette="ch:.25",
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="ticks")

# Create the figure and put the redshift (y) axis on a logarithmic scale.
# NOTE(review): a log axis cannot show zero or negative values — confirm that
# redshift is strictly positive for every row, or consider a "symlog" scale.
f, ax = plt.subplots(figsize=(7, 6))
ax.set_yscale("log")

# One vertical box per class; whis=[0, 100] extends the whiskers to the
# minimum and maximum so no point is drawn as an outlier.
sns.boxplot(x="class", y="redshift", data=sdss_dataset,
            whis=[0, 100], width=.6, palette="vlag", ax=ax)

# Overlay every individual observation on top of the boxes.
sns.stripplot(x="class", y="redshift", data=sdss_dataset,
              size=4, color=".3", linewidth=0, ax=ax)

# Grid lines follow the value (redshift) axis, and both axes stay labelled so
# the figure can be read on its own.
ax.yaxis.grid(True)
ax.set(xlabel="class", ylabel="redshift (log scale)")
sns.despine(trim=True, left=True)

In [None]:
# Give the five magnitude columns a "-band" suffix; every other column
# (including 'class' and 'redshift') keeps its current name.
band_labels = {band: f"{band}-band" for band in ("u", "g", "r", "i", "z")}
sdss_dataset = sdss_dataset.rename(columns=band_labels)

In [None]:
# Display the table with the renamed band columns.
sdss_dataset

In [None]:
# Separate the feature columns (X) from the target label column (y).
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
X = sdss_dataset.drop(columns='class')
y = sdss_dataset['class']

In [None]:
# Display the target labels.
y

In [None]:
import pandas as pd
from sklearn import preprocessing

# Rescale every feature column with min-max scaling and rebuild a DataFrame
# that keeps the original column names.
# NOTE(review): the scaler is fit on all rows before the train/test split done
# in a later cell, so the test rows influence the scaling — confirm this is
# acceptable or fit the scaler on the training portion only.
min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(
    min_max_scaler.fit_transform(X.values),
    columns=X.columns,
)
X

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE. A fixed random_state makes the
# synthetic samples (and every downstream score) reproducible across runs;
# 42 matches the seed used for the train/test split.
# NOTE(review): resampling happens before the train/test split in a later
# cell, so synthetic points can end up in the test set — consider resampling
# the training portion only.
X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X, y)

In [None]:
# Bar chart of the class counts after SMOTE oversampling.
resampled_classes = pd.DataFrame(y_resampled)
sns.catplot(
    data=resampled_classes,
    x="class",
    kind="count",
    palette="ch:.25",
)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the resampled rows for testing; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Display the labels after SMOTE oversampling.
y_resampled

In [None]:
# Display the held-out test labels.
y_test

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 10-fold cross-validated accuracy of a Gaussian naive Bayes model on the
# training split; the cell shows one score per fold.
GNB = GaussianNB()
scores = cross_val_score(
    estimator=GNB,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)
scores

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit the final Gaussian naive Bayes model on the full training split.
# fit() returns the estimator itself, so it can be assigned directly.
classifier = GaussianNB().fit(X_train, y_train)
classifier

In [None]:
# Predict a class label for every row of the held-out test features.
y_pred  =  classifier.predict(X_test)

In [None]:
# Display the predicted labels for the test set.
y_pred

In [None]:
# Put actual and predicted labels side by side in explicitly named columns.
# The previous form, pd.DataFrame(y_pred, y_test), passed y_test as the
# *index* argument and left the predictions in an unnamed column 0.
# The frame keeps y_test's row index; y_pred is aligned to it positionally.
test = pd.DataFrame({"actual": y_test, "predicted": y_pred})
test.head(20)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix (rows: true class, columns: predicted class) and overall
# accuracy of the test-set predictions, printed in that order.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
for metric in (cm, ac):
    print(metric)

In [None]:
# Draw the confusion matrix as an annotated heatmap with class names on the
# ticks. confusion_matrix orders its rows/columns by the sorted labels that
# occur in y_test or y_pred, so the same ordering is rebuilt here.
class_labels = sorted(set(y_test) | set(y_pred))
ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel="Predicted class", ylabel="Actual class");

In [None]:
# Print scikit-learn's text classification report for the test-set predictions.
print(classification_report(y_test, y_pred))