In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Tab-separated catalogue file, addressed relative to the notebook.
file_path = '../resources/data_scss.tab'

# Labels assigned to the table columns, in file order.
column_names = [
    'ID', "RA", "DEC", "EPOCH",
    "MU_ACOSD", "MU_D", "SIGMU_A", "SIGMU_D",
    "B_J", "R_1", "R_2", "I",
    "AREA", "A_I", "B_I", "P_A", "CLASS", "N(0,1)",
    "BLEND", "QUALITY", "FIELD",
    "XMIN", "XMAX", "YMIN", "YMAX",
    "IPEAK", "MAG", "ISKY", "XCEN_I", "YCEN_I",
    "A_U", "B_U", "THETA_U", "THETA_I",
    "AP(1)", "AP(2)", "AP(3)", "AP(4)",
    "AP(5)", "AP(6)", "AP(7)", "AP(8)",
    "PRFMAG",
]

# The first line of the file is skipped and replaced by the names above
# (presumably it is the file's own header row; verify against the file).
# Every column is then parsed as a number; unparseable cells become NaN.
data = (
    pd.read_csv(file_path, sep='\t', skiprows=1, names=column_names, low_memory=False)
    .apply(pd.to_numeric, errors='coerce')
)

data.head()

In [None]:
# Summary statistics of the freshly loaded table, before any column or row is dropped.
data.describe()

In [None]:
# Column dtypes and non-null counts after the numeric coercion.
data.info()

In [None]:
# Non-null values per column; cells that could not be parsed as numbers are NaN and not counted.
data.count()

In [None]:
# Columns removed from the table before any further processing.
irrelevant_columns = ["ID", "EPOCH", "MU_ACOSD", "MU_D", "SIGMU_A", "SIGMU_D",
                      "XCEN_I", "YCEN_I", "P_A", "XMIN", "XMAX", "YMIN", "YMAX", "AP(1)",
                      "AP(2)", "AP(3)", "AP(4)", "AP(5)", "AP(6)", "AP(7)", "AP(8)"]

# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a strict drop would raise KeyError.
data = data.drop(columns=irrelevant_columns, errors='ignore')

# Keep only rows whose CLASS code is 1 or 2; .copy() detaches the result so
# later edits to filtered_data never touch `data`.
filtered_data = data[data['CLASS'].isin([1, 2])].copy()

In [None]:
# First rows of the table restricted to CLASS 1 and 2.
filtered_data.head()

In [None]:
# Summary statistics of the class-filtered table, before missing values are handled.
filtered_data.describe()

In [None]:
# Treat infinities as missing so they are removed together with the NaNs below.
filtered_data = filtered_data.replace([np.inf, -np.inf], np.nan)
filtered_data = filtered_data.dropna(subset=['B_I', 'A_I', 'AREA', 'A_U', 'B_U'])

# Drop rows where any axis later used as a divisor is 0: A_I divides B_I in
# Ellipticity, and A_U * B_U is the denominator of Filling_Factor. Checking
# B_U alone would still allow division by zero through A_I or A_U.
nonzero_axes = (filtered_data[['A_I', 'A_U', 'B_U']] != 0).all(axis=1)
filtered_data = filtered_data[nonzero_axes].copy()

In [None]:
# Non-null values per column after removing rows with missing or zero shape parameters.
filtered_data.count()

In [None]:
# Derived shape features:
#   Ellipticity    = 1 - B_I / A_I
#   Filling_Factor = AREA / (pi * A_U * B_U)
# assign() returns a new frame, which avoids the SettingWithCopyWarning that
# setting a new column on a row-filtered frame can raise.
filtered_data = filtered_data.assign(
    Ellipticity=1 - (filtered_data['B_I'] / filtered_data['A_I']),
    Filling_Factor=filtered_data['AREA'] / (np.pi * filtered_data['A_U'] * filtered_data['B_U']),
)

In [None]:
# Summary statistics including the new Ellipticity and Filling_Factor columns.
filtered_data.describe()

In [None]:
# Names of all numeric columns, with the CLASS target taken out, for use as
# the column list of the outlier filter.
numerical_columns = list(filtered_data.select_dtypes(include=[np.number]).columns)
numerical_columns.remove('CLASS')

In [None]:
def remove_outliers(df, columns, factor=1.5):
    """Drop rows lying outside the IQR fences of the given columns.

    For each column in turn, rows are kept only when the value lies within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (bounds inclusive). Rows whose
    value is NaN in that column are dropped as well, because NaN never
    satisfies the range test.

    The columns are processed sequentially: the quartiles of each column are
    computed on the rows that survived the previous columns, so the result
    can depend on the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    columns : iterable of str
        Columns to filter on, in the order they are applied.
    factor : float, default 1.5
        Multiplier of the inter-quartile range that sets the fence width.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that pass the filter for every listed column.
    """
    for column in columns:
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = factor * (q3 - q1)
        # between() is inclusive on both ends and evaluates to False for NaN.
        df = df[values.between(q1 - fence, q3 + fence)]
    return df

In [None]:
# Apply the IQR filter to every numeric column except CLASS.
# NOTE: filtered_data is reassigned from itself, so executing this cell again
# filters the already-filtered rows a second time with freshly computed quartiles.
filtered_data = remove_outliers(filtered_data, numerical_columns)
filtered_data.describe()

In [None]:
# First rows remaining after the outlier filter.
filtered_data.head()

In [None]:
# Discard rows in which R_1, R_2 or I equals 99.999
# (presumably the catalogue's "no measurement" placeholder; confirm against the data description).
# NOTE(review): this runs after the IQR outlier filter, so any 99.999 entries were
# still part of the quartiles computed there. Consider moving this step earlier.
has_real_magnitudes = (filtered_data[['R_1', 'R_2', 'I']] != 99.999).all(axis=1)
filtered_data = filtered_data[has_real_magnitudes]
filtered_data.count()

In [None]:
# Convert the numeric class codes to readable labels: 2 is a star, 1 is a galaxy.
# Values that are not codes (for example labels produced by an earlier run of
# this cell) pass through unchanged, so re-running the cell no longer turns the
# whole column into NaN.
class_labels = {2: 'STAR', 1: 'GALAXY'}
filtered_data = filtered_data.assign(
    CLASS=filtered_data['CLASS'].map(lambda code: class_labels.get(code, code))
)

In [None]:
# Summary statistics of the cleaned numeric columns; CLASS now holds strings and is left out by describe().
filtered_data.describe()

## Data Exploration

In [None]:
# Font sizes applied on top of seaborn's "paper" context.
paper_rc = {
    "font.size": 10,
    "axes.titlesize": 15,
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 15,
}
sns.set_context("paper", rc=paper_rc)

# One fixed colour per class, shared by the plots below.
palette = {
    'GALAXY': '#4daf4a',
    'STAR': '#ff7f00',
}

In [None]:
# Count each class once; value_counts() orders the classes by frequency.
class_counts = filtered_data['CLASS'].value_counts()
labels = class_counts.tolist()

fig, ax = plt.subplots(figsize=(10, 6))
# Colour every bar by its class through `palette`. A positional colour list
# would swap the colours whenever STAR is the more frequent class.
class_counts.plot(kind='barh', title='Comparison of Sky Objects',
                  color=[palette[class_name] for class_name in class_counts.index],
                  ax=ax)
ax.invert_yaxis()  # most frequent class on top
ax.set_xlabel('Number of Observations')
# Leave head-room for the count annotations instead of a fixed axis limit.
ax.set_xlim(0, class_counts.max() * 1.1)
for index, value in enumerate(labels):
    ax.text(value, index, str(value))

plt.show()

In [None]:
# Left panel: sky positions coloured by class.
# Right panel: per-class density of DEC on the shared y axis.
f, axs = plt.subplots(1, 2,
                      figsize=(15, 8),
                      sharey=True,
                      gridspec_kw=dict(width_ratios=[3, 0.8]))
sns.scatterplot(x='RA', y='DEC', hue='CLASS', data=filtered_data, ax=axs[0], palette=palette, alpha=0.5)
sns.kdeplot(y='DEC', hue='CLASS', data=filtered_data, ax=axs[1], palette=palette, legend=False)

f.suptitle('Equatorial Coordinates', fontsize=15)
# tight_layout must be *called*; the bare attribute access did nothing.
# rect reserves the top strip of the figure for the suptitle.
f.tight_layout(rect=(0, 0, 1, 0.96))

In [None]:
# Grid of 80-bin histograms for the columns of filtered_data; the trailing ';' hides the returned axes array.
filtered_data.hist(bins = 80, figsize = (20,15));

In [None]:
def get_hists(feature_name):
    """Draw one feature as two side-by-side histograms, galaxies left and stars right.

    Reads the module-level ``filtered_data`` frame, splits it on the ``CLASS``
    column and plots ``feature_name`` with 30 bins per panel. Both panels share
    the x axis, and the upper-cased feature name is used as the figure title.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), sharex=True)

    # (CLASS value, panel title, bar colour), in left-to-right panel order.
    panels = (
        ('GALAXY', 'Galaxy', '#4daf4a'),
        ('STAR', 'Star', '#ff7f00'),
    )
    for axis, (class_label, panel_title, bar_color) in zip(axes, panels):
        class_values = filtered_data.loc[filtered_data['CLASS'] == class_label, feature_name]
        drawn = sns.histplot(class_values, bins=30, ax=axis, color=bar_color, kde=False)
        drawn.set_title(panel_title)

    fig.suptitle(feature_name.upper(), fontsize=15)
    fig.tight_layout(pad=0.5)

In [None]:
# One galaxy/star histogram pair for every column except the CLASS label.
columns = filtered_data.columns.drop('CLASS').tolist()
for name in columns:
    get_hists(name)

## Feature Selection

In [None]:
# Table as it stands before the sky-coordinate columns are removed.
filtered_data.head()

In [None]:
# RA and DEC are removed so that sky position is not part of the feature matrix.
# errors='ignore' keeps the cell re-runnable: a second strict drop would raise
# KeyError because the columns are already gone.
filtered_data = filtered_data.drop(columns=['RA', 'DEC'], errors='ignore')

In [None]:
# Table after RA and DEC have been dropped.
filtered_data.head()

## Baseline Model - Logistic Regression

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier

from sklearn.decomposition import PCA

In [None]:
# Target vector: the class label. Feature matrix: every remaining column.
y = filtered_data['CLASS']
X = filtered_data.drop(columns=['CLASS'])

In [None]:
# 60 / 20 / 20 train / validation / test split: 20% is held out as the test set,
# then 25% of the remaining 80% becomes the validation set.
# stratify keeps the STAR/GALAXY proportions equal across the three splits,
# in line with the stratified cross-validation used for the baseline model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2018, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=2019, stratify=y_train)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to all three splits.
std_scale = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    std_scale.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
# Baseline estimator; fitted on the scaled training split further down.
logreg_baseline = LogisticRegression(solver='lbfgs')

# 10-fold stratified cross-validation, repeated 3 times. The scaler sits inside
# the pipeline so it is re-fitted on the training part of every fold.
# Cross-validating the pre-scaled matrix would let the statistics of each
# held-out fold leak into the scaling and bias the estimate.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cv_pipeline = make_pipeline(StandardScaler(), LogisticRegression(solver='lbfgs'))
n_scores = cross_val_score(cv_pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print('Mean Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

logreg_baseline.fit(X_train_scaled, y_train)

# Evaluate the model on the validation set
y_val_pred = logreg_baseline.predict(X_val_scaled)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.3f}')

# Evaluate the model on the test set
y_test_pred = logreg_baseline.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.3f}')

# Print classification report on the test set
print("Classification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

In [None]:
def plot_confusion_matrix(y_test, y_pred, labels=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Parameters
    ----------
    y_test : array-like
        True class labels (rows of the matrix).
    y_pred : array-like
        Predicted class labels (columns of the matrix).
    labels : list, optional
        Class labels in the order they should appear on both axes. When
        omitted, the sorted set of labels present in ``y_test`` or ``y_pred``
        is used.

    The tick labels are derived from ``labels`` (title-cased), so they always
    match the rows and columns of the matrix. A fixed tick-label list would
    silently mislabel the plot if the label set or its order differed.
    """
    if labels is None:
        labels = sorted(set(y_test) | set(y_pred))
    conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
    tick_labels = [str(label).title() for label in labels]

    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


In [None]:
# Confusion matrix of the baseline logistic-regression predictions on the test split.
plot_confusion_matrix(y_test, y_test_pred)

In [None]:
# Accuracy and per-class report of the baseline model on the validation split.
val_set_accuracy = metrics.accuracy_score(y_val, y_val_pred)
val_set_report = metrics.classification_report(y_val, y_val_pred, digits=3)

print('Logistic Regression with Standard Scaling - Validation Set')
print('Logistic Regression accuracy:', val_set_accuracy)
print(val_set_report)

## K-Nearest Neighbor
