In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Star Type mapping:
# 0 = Red Dwarf, 1 = Brown Dwarf, 2 = White Dwarf,
# 3 = Main Sequence, 4 = Supergiant, 5 = Hypergiant

# Load dataset
df = pd.read_csv('train_test_dataset.csv')

In [38]:
df.head(100)

Unnamed: 0,S.No.,Temperature_K,Luminosity_Lo,Radius_Ro,Absolute_Magnitude,Star_Color,Spectral_Class,Star_Type
0,1,5132.872,1.159,1.084,6.011,Yellowish White,A,3
1,2,9295.788,0.013,0.009,14.534,White,D,2
2,3,5813.333,0.786,0.840,4.883,Blue White,G,3
3,4,5377.994,-0.070,0.811,5.598,White,F,3
4,5,5473.370,1.055,0.655,5.376,Yellowish White,G,3
...,...,...,...,...,...,...,...,...
95,96,5832.443,1.120,1.247,7.052,White,A,3
96,97,9595.890,0.014,0.015,15.074,White,D,2
97,98,5932.045,0.058,1.291,5.646,White,A,3
98,99,3495.585,0.048,0.672,10.335,Yellow,M,1


In [39]:
df['Star_Color'] = df['Star_Color'].str.lower().str.replace('-', ' ').str.strip()
print(df['Star_Color'].value_counts())

Star_Color
blue white         1081
white               967
red                 639
blue                591
brown               436
yellowish           287
yellowish white     272
yellow              271
orange              264
Name: count, dtype: int64


In [40]:
print(df.isnull().mean()*100)
df.shape

S.No.                 0.00
Temperature_K         0.14
Luminosity_Lo         0.00
Radius_Ro             0.00
Absolute_Magnitude    0.00
Star_Color            3.84
Spectral_Class        1.86
Star_Type             0.00
dtype: float64


(5000, 8)

In [41]:
cols=df.columns
new_df=df[cols].dropna()
print(df.shape)
print(new_df.shape)
print(new_df.isnull().mean()*100)
print(new_df.head())

(5000, 8)
(4709, 8)
S.No.                 0.0
Temperature_K         0.0
Luminosity_Lo         0.0
Radius_Ro             0.0
Absolute_Magnitude    0.0
Star_Color            0.0
Spectral_Class        0.0
Star_Type             0.0
dtype: float64
   S.No.  Temperature_K  Luminosity_Lo  Radius_Ro  Absolute_Magnitude  \
0      1       5132.872          1.159      1.084               6.011   
1      2       9295.788          0.013      0.009              14.534   
2      3       5813.333          0.786      0.840               4.883   
3      4       5377.994         -0.070      0.811               5.598   
4      5       5473.370          1.055      0.655               5.376   

        Star_Color Spectral_Class  Star_Type  
0  yellowish white              A          3  
1            white              D          2  
2       blue white              G          3  
3            white              F          3  
4  yellowish white              G          3  


In [72]:
df_no_cat = df.drop(columns=['Star_Color', 'Spectral_Class'])
print(df_no_cat.head())

   S.No.  Temperature_K  Luminosity_Lo  Radius_Ro  Absolute_Magnitude  \
0      1       5132.872          1.159      1.084               6.011   
1      2       9295.788          0.013      0.009              14.534   
2      3       5813.333          0.786      0.840               4.883   
3      4       5377.994         -0.070      0.811               5.598   
4      5       5473.370          1.055      0.655               5.376   

   Star_Type  
0          3  
1          2  
2          3  
3          3  
4          3  


In [73]:
X_nc = df_no_cat.drop(columns=['Star_Type'])
y_nc = df_no_cat['Star_Type']
X_nc_train, X_nc_test, y_nc_train, y_nc_test = train_test_split(X_nc, y_nc, test_size=0.1, random_state=42)
print(X_nc_train.shape, X_nc_test.shape)

(4500, 5) (500, 5)


In [77]:
# Impute missing values in X_nc_train and X_nc_test for numerical columns with mean
for col in X_nc_train.columns:
    if X_nc_train[col].dtype in [np.float64, np.int64]:
        mean_val = X_nc_train[col].mean()
        X_nc_train[col] = X_nc_train[col].fillna(mean_val)
        X_nc_test[col] = X_nc_test[col].fillna(mean_val)

print(X_nc_train.isnull().sum())
print(X_nc_test.isnull().sum())


S.No.                 0
Temperature_K         0
Luminosity_Lo         0
Radius_Ro             0
Absolute_Magnitude    0
dtype: int64
S.No.                 0
Temperature_K         0
Luminosity_Lo         0
Radius_Ro             0
Absolute_Magnitude    0
dtype: int64


In [78]:
from sklearn.metrics import accuracy_score

# Train a DecisionTreeClassifier on X_nc_train and y_nc_train, predict on X_nc_test, and calculate accuracy
dt_nc = DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)
dt_nc.fit(X_nc_train, y_nc_train)
y_nc_pred = dt_nc.predict(X_nc_test)

nc_accuracy = accuracy_score(y_nc_test, y_nc_pred)
print("Accuracy on X_nc_train/X_nc_test split:", nc_accuracy)

Accuracy on X_nc_train/X_nc_test split: 1.0


In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(new_df.drop(columns=['Star_Type']),new_df['Star_Type'],test_size=0.1)

In [None]:
trf2 = ColumnTransformer(transformers=[
    ('scale', StandardScaler(), [col for col in X_train.columns if col not in cat_cols]),
    ('encode', OneHotEncoder(sparse=False, drop='first'), cat_cols),
], remainder='passthrough')

In [82]:
cat_cols = ['Star_Color', 'Spectral_Class']
trf2= ColumnTransformer(transformers=[
    ('scale', StandardScaler(),[col for col in X_train.columns if col not in cat_cols]),
    ('encode',OneHotEncoder(sparse=False,drop='first'),cat_cols),
],remainder='passthrough')

In [83]:
X_train_new=trf2.fit_transform(X_train)
print(pd.DataFrame(X_train_new))
X_test_new=trf2.fit_transform(X_test)

            0         1         2         3         4    5    6    7    8   \
0    -1.653317  0.140277 -0.238986  1.169368 -0.957550  0.0  0.0  0.0  0.0   
1     1.286380 -0.955687 -0.502802 -0.684306  1.253167  0.0  0.0  0.0  1.0   
2    -0.421745 -0.725471 -0.502801 -0.683223  0.609800  0.0  0.0  1.0  0.0   
3     1.030161  2.356802  1.749348  2.052826 -1.572364  0.0  0.0  0.0  0.0   
4     1.479441 -0.014123 -0.502801 -0.684414  0.742552  0.0  0.0  0.0  0.0   
...        ...       ...       ...       ...       ...  ...  ...  ...  ...   
4233 -1.251405 -0.494093 -0.502787 -0.682564 -0.033753  0.0  0.0  0.0  0.0   
4234  0.954085  0.165847 -0.287625  1.333956 -1.048863  1.0  0.0  0.0  0.0   
4235  0.415093 -0.489662 -0.502777 -0.681970 -0.171882  1.0  0.0  0.0  0.0   
4236 -1.347577 -0.613232 -0.502762 -0.682909  0.040133  0.0  0.0  0.0  0.0   
4237 -0.919828 -0.543397 -0.502773 -0.682560 -0.016602  0.0  0.0  0.0  0.0   

       9    10   11   12   13   14   15   16   17   18   19  
0



In [63]:
from sklearn.tree import DecisionTreeClassifier
dt= DecisionTreeClassifier(max_depth=5, min_samples_split=10, random_state=42)

In [64]:
dt.fit(X_train_new,y_train)

In [65]:
y_pr=dt.predict(X_test_new)

In [66]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pr)

1.0

In [67]:
pipe = Pipeline(steps=[
    ('preprocessor', trf2),
    ('classifier', dt)
])

pipe.fit(X_train, y_train)



In [68]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, fbeta_score

# Predict on X_train and X_test
y_train_pred = dt.predict(X_train_new)
y_test_pred = dt.predict(X_test_new)

# Accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# Recall (macro)
train_recall = recall_score(y_train, y_train_pred, average='macro')
test_recall = recall_score(y_test, y_test_pred, average='macro')

# F1 score (macro)
train_f1 = f1_score(y_train, y_train_pred, average='macro')
test_f1 = f1_score(y_test, y_test_pred, average='macro')

# F2 score (macro)
train_f2 = fbeta_score(y_train, y_train_pred, beta=2, average='macro')
test_f2 = fbeta_score(y_test, y_test_pred, beta=2, average='macro')

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("Train Recall:", train_recall)
print("Test Recall:", test_recall)
print("Train F1 Score:", train_f1)
print("Test F1 Score:", test_f1)
print("Train F2 Score:", train_f2)
print("Test F2 Score:", test_f2)

Train Accuracy: 1.0
Test Accuracy: 1.0
Train Recall: 1.0
Test Recall: 1.0
Train F1 Score: 1.0
Test F1 Score: 1.0
Train F2 Score: 1.0
Test F2 Score: 1.0


In [69]:
# Check for overlap between train and test sets
overlap = set(X_train.index).intersection(set(X_test.index))
print(f"Number of overlapping indices: {len(overlap)}")

Number of overlapping indices: 0


Number of duplicate rows in new_df: 0
