In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
file_path = 'data.tab'
column_names = [
    "RA", "DEC", "EPOCH", "MU_ACOSD", "MU_D", "SIGMU_A", "SIGMU_D",
    "B_J", "R_1", "R_2", "I", "AREA", "A_I", "B_I", "P_A", "CLASS",
    "N(0,1)", "BLEND", "QUALITY", "FIELD", "XMIN", "XMAX", "YMIN",
    "YMAX", "IPEAK", "MAG", "ISKY", "XCEN_I", "YCEN_I", "A_U", "B_U",
    "THETA_U", "THETA_I", "AP(1)", "AP(2)", "AP(3)", "AP(4)", "AP(5)",
    "AP(6)", "AP(7)", "AP(8)", "PRFMAG"
]

data = pd.read_csv(file_path, sep='\t', skiprows=[1], names=column_names, low_memory=False)
for col in column_names:
    data[col] = pd.to_numeric(data[col], errors='coerce')


data.head()

Unnamed: 0,RA,DEC,EPOCH,MU_ACOSD,MU_D,SIGMU_A,SIGMU_D,B_J,R_1,R_2,...,THETA_I,AP(1),AP(2),AP(3),AP(4),AP(5),AP(6),AP(7),AP(8),PRFMAG
id,,,,,,,,,,,...,,,,,,,,,,
000001,179.794583,-31.665994,1979.171,-10.78,4.052,19.63,17.98,21.273,99.999,20.003,...,134.0,27.0,21.0,17.0,10.0,1.0,0.0,0.0,0.0,-20993.0
000002,179.880473,-31.665958,1979.171,-5.733,-2.919,8.44,6.031,18.446,17.789,17.694,...,23.0,83.0,78.0,67.0,52.0,41.0,28.0,15.0,0.0,-23179.0
000003,180.152623,-31.665845,1979.171,999900000.0,999900000.0,999900000.0,999900000.0,22.272,99.999,99.999,...,3.0,12.0,10.0,6.0,1.0,0.0,0.0,0.0,0.0,-20191.0
000004,180.261758,-31.665657,1979.171,-27.0,-2.189,11.18,9.307,20.238,18.656,18.757,...,36.0,37.0,35.0,28.0,23.0,15.0,8.0,0.0,0.0,-21779.0


In [3]:
irrelevant_columns = ["RA", "DEC", "EPOCH", "MU_ACOSD", "MU_D", "SIGMU_A", "SIGMU_D",
                      "XCEN_I", "YCEN_I", "P_A", "XMIN", "XMAX", "YMIN", "YMAX", "AP(1)",
                      "AP(2)", "AP(3)", "AP(4)", "AP(5)", "AP(6)", "AP(7)", "AP(8)"]
data.drop(columns=irrelevant_columns, inplace=True)
filtered_data = data[data['CLASS'].isin([1, 2])]
filtered_data.dropna(subset=['CLASS', 'A_I', 'B_I', 'AREA', 'A_U', 'B_U'], inplace=True)

filtered_data['Ellipticity'] = 1 - (filtered_data['B_I'] / filtered_data['A_I'])
filtered_data['Filling_Factor'] = filtered_data['AREA'] / (np.pi * filtered_data['A_U'] * filtered_data['B_U'])

filtered_data.replace([np.inf, -np.inf], np.nan, inplace=True)
filtered_data.dropna(subset=['Ellipticity', 'Filling_Factor'], inplace=True)

scaler = StandardScaler()
features_to_normalize = ['B_J', 'R_1', 'R_2', 'I', 'AREA', 'A_I', 'B_I', 'N(0,1)',
                         'IPEAK', 'MAG', 'ISKY', 'A_U', 'B_U', 'THETA_U', 'THETA_I',
                         'Ellipticity', 'Filling_Factor']

filtered_data[features_to_normalize] = scaler.fit_transform(filtered_data[features_to_normalize])

# Define the features and target variable
features = ['B_J', 'R_1', 'R_2', 'I', 'AREA', 'A_I', 'B_I', 'N(0,1)',
            'IPEAK', 'MAG', 'ISKY', 'A_U', 'B_U', 'THETA_U', 'THETA_I',
            'Ellipticity', 'Filling_Factor']
target = 'CLASS'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data.dropna(subset=['CLASS', 'A_I', 'B_I', 'AREA', 'A_U', 'B_U'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Ellipticity'] = 1 - (filtered_data['B_I'] / filtered_data['A_I'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Filling_Factor'] = filtered_data['AREA'] / (np.pi * filtered_data['A_U'] * fi

In [4]:
filtered_data.head()

Unnamed: 0,B_J,R_1,R_2,I,AREA,A_I,B_I,CLASS,"N(0,1)",BLEND,...,IPEAK,MAG,ISKY,A_U,B_U,THETA_U,THETA_I,PRFMAG,Ellipticity,Filling_Factor
1,0.203621,1.371983,-0.661718,0.953811,-0.089215,-0.048579,-0.015277,1.0,0.67049,0.0,...,-0.466699,0.01116,0.099852,-0.069009,-0.116351,0.713765,0.993596,-20993.0,-0.060025,-0.185748
2,-1.308319,-0.753225,-0.722412,-1.050997,0.126548,0.318523,0.604852,2.0,-0.25269,0.0,...,1.524571,-1.369808,0.069217,0.398411,0.758656,-0.944387,-0.897443,-23179.0,-0.727183,0.702485
3,0.737908,1.371983,1.441068,0.953811,-0.167337,-0.667106,-0.582235,2.0,-0.505191,0.0,...,-0.726401,0.759556,0.001924,-0.661011,-0.585131,-1.171143,-1.23817,-20191.0,-0.125893,1.322456
4,-0.349919,-0.730812,-0.69447,-1.029477,-0.052015,-0.126014,-0.008108,2.0,-0.489791,0.0,...,0.612335,-0.58815,-0.026772,0.033626,0.018065,-0.561737,-0.67597,-21779.0,-0.309439,0.374532
5,-0.58738,-0.722514,-0.692499,-1.017977,-0.029695,-0.11502,0.171718,2.0,-0.610864,0.0,...,0.855648,-0.813857,0.169654,-0.059735,0.326738,0.827143,-0.897443,-22126.0,-0.833305,0.156972


In [5]:
X = filtered_data[features]
y = filtered_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [6]:
X_train.size

1934396

In [7]:
X_test.size

829039

In [8]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

In [9]:
rf_report = classification_report(y_test, rf_predictions, target_names=['Galaxy', 'Star'])
rf_cm = confusion_matrix(y_test, rf_predictions)

print("Random Forest Classification Report:")
print(rf_report)
print("Random Forest Confusion Matrix:")
print(rf_cm)

Random Forest Classification Report:
              precision    recall  f1-score   support

      Galaxy       0.99      0.99      0.99     15292
        Star       1.00      0.99      1.00     33475

    accuracy                           0.99     48767
   macro avg       0.99      0.99      0.99     48767
weighted avg       0.99      0.99      0.99     48767

Random Forest Confusion Matrix:
[[15196    96]
 [  185 33290]]
