# **1 - Importing Libraries**

In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# **2 - Data Processing**

### Loading dataset

In [58]:
data = pd.read_csv('dataset/Star99999_raw.csv')
data.shape

(99999, 6)

### Exploring dataset

In [59]:
data.head(10)

Unnamed: 0.1,Unnamed: 0,Vmag,Plx,e_Plx,B-V,SpType
0,0,9.1,3.54,1.39,0.482,F5
1,1,9.27,21.9,3.1,0.999,K3V
2,2,6.61,2.81,0.63,-0.019,B9
3,3,8.06,7.75,0.97,0.37,F0V
4,4,8.55,2.87,1.11,0.902,G8III
5,5,12.31,18.8,4.99,1.336,M0V:
6,6,9.64,17.74,1.3,0.74,G0
7,7,9.05,5.17,1.95,1.102,M6e-M8.5e Tc
8,8,8.59,4.81,0.99,1.067,G5
9,9,8.59,10.76,1.1,0.489,F6V


#### Columns:

- Vmag : Visual Apparent Magnitude of the Star (m)

- Plx : Parallax distance Between the Star and the Earth (d)

- e_Plx : Standard Error of Plx

- B-V : B-V color index (A hot star has a B-V color index close to 0 or negative, while a cool star has a B-V color index close to 2.0. Other stars are somewhere in between)

- SpType : Spectral Type

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  99999 non-null  int64 
 1   Vmag        99999 non-null  object
 2   Plx         99999 non-null  object
 3   e_Plx       99999 non-null  object
 4   B-V         99999 non-null  object
 5   SpType      97377 non-null  object
dtypes: int64(1), object(5)
memory usage: 4.6+ MB


In [61]:
data.describe()

Unnamed: 0.1,Unnamed: 0
count,99999.0
mean,49999.0
std,28867.369122
min,0.0
25%,24999.5
50%,49999.0
75%,74998.5
max,99998.0


### Data cleaning

In [62]:
data.drop('Unnamed: 0', axis = 1, inplace = True)
data.rename(columns={'B-V':'B_V'}, inplace = True)
data.head()

Unnamed: 0,Vmag,Plx,e_Plx,B_V,SpType
0,9.1,3.54,1.39,0.482,F5
1,9.27,21.9,3.1,0.999,K3V
2,6.61,2.81,0.63,-0.019,B9
3,8.06,7.75,0.97,0.37,F0V
4,8.55,2.87,1.11,0.902,G8III


In [63]:
data.query('B_V == "      "')

Unnamed: 0,Vmag,Plx,e_Plx,B_V,SpType
44,9.59,15.10,1.92,,G6/G8V:
52,10.96,-1.76,2.45,,
319,12.14,6.48,4.13,,F1:
374,9.08,6.13,1.59,,
389,11.67,4.86,2.61,,F7
...,...,...,...,...,...
99525,8.71,,,,
99665,9.25,5.97,3.05,,
99701,10.46,23.91,1.78,,K0
99770,9.53,,,,


### converting object type columns to numeric

In [64]:
data.Vmag = pd.to_numeric(data.Vmag, downcast='float', errors ='coerce')
data.Plx = pd.to_numeric(data.Plx, downcast='float', errors ='coerce')
data.e_Plx = pd.to_numeric(data.e_Plx, downcast='float', errors ='coerce')
data.B_V = pd.to_numeric(data.B_V, downcast='float', errors ='coerce')

In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    99998 non-null  float32
 1   Plx     99768 non-null  float32
 2   e_Plx   99768 non-null  float32
 3   B_V     98871 non-null  float32
 4   SpType  97377 non-null  object 
dtypes: float32(4), object(1)
memory usage: 2.3+ MB


In [66]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,99998.0,8.369723,1.313881,-1.44,7.64,8.44,9.14,14.08
Plx,99768.0,7.212443,11.349038,-54.950001,2.51,4.63,8.41,772.330017
e_Plx,99768.0,1.365389,1.816845,0.38,0.88,1.1,1.39,114.459999
B_V,98871.0,0.704728,0.489686,-0.4,0.348,0.612,1.075,5.46


### Checking for missing values

In [67]:
data.isnull().sum()

Vmag         1
Plx        231
e_Plx      231
B_V       1128
SpType    2622
dtype: int64

### Droping the missing values

In [68]:
new_data = data.dropna()
new_data.shape

(96742, 5)

In [69]:
new_data.isnull().sum()

Vmag      0
Plx       0
e_Plx     0
B_V       0
SpType    0
dtype: int64

### Checking for duplicated

In [70]:
new_data.duplicated().sum()

0

### Reindexing

In [71]:
new_data.reset_index(drop=True, inplace = True)

In [72]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96742 entries, 0 to 96741
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    96742 non-null  float32
 1   Plx     96742 non-null  float32
 2   e_Plx   96742 non-null  float32
 3   B_V     96742 non-null  float32
 4   SpType  96742 non-null  object 
dtypes: float32(4), object(1)
memory usage: 2.2+ MB


In [73]:
new_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96742.0,8.291884,1.248016,-1.44,7.61,8.4,9.07,13.61
Plx,96742.0,7.105202,11.14569,-35.099998,2.52,4.6,8.27,772.330017
e_Plx,96742.0,1.264169,1.156238,0.38,0.87,1.08,1.36,69.07
B_V,96742.0,0.700666,0.49006,-0.4,0.342,0.605,1.072,5.46


### Adding new features

Plx = Parallax Distance Between the Star and the Earth (arsec) 
- Plx : Parallax Distance Between the Star and the Earth (arsec)

- Distance = Distance Between the Star and the Earth (parsec)

- ly = Distance Between the Star and Earth (light years)

- Amag = Absolute Magnitude of the Star (mag)

- Temperature = Temperature Effective considering the star as a perfect black body (kelvin)

- Luminosity = Star's luminosity (watts)

- Luminosity(Sun=1) = Star's luminosity in the terms of the luminosity of the Sun (watts)

- Mass(Sun=1) = Star's Mass in the terms of the Mass of the Sun (solar mass)

- Radius(Sun) = Star's Radius in the terms of the Radius of the Sun (solar radius)

- Radius = Star's Radius (meters)

Some Equations asks for log of Plx(d) (which is the distance between the star and the Earth). But log of 0 is infinite and we can't use it

In [74]:
new_data.query("Plx == 0")

Unnamed: 0,Vmag,Plx,e_Plx,B_V,SpType
945,8.68,0.0,1.19,1.29,K2III
1413,7.63,0.0,0.77,0.267,B1II...
18219,10.09,0.0,1.84,0.19,A0
23867,10.75,0.0,1.21,-0.012,B
26477,9.01,0.0,1.4,0.089,B8
29457,8.44,0.0,1.26,0.042,B1Vne
29968,6.57,0.0,0.89,0.121,A0III
36969,7.9,0.0,0.99,1.745,S
37320,8.24,0.0,1.29,1.341,K0
39917,9.28,0.0,1.6,0.956,G


In [75]:
new_data = new_data.query('Plx != 0')
new_data.shape

(96707, 5)

In [76]:
new_data.query('Plx == 0')

Unnamed: 0,Vmag,Plx,e_Plx,B_V,SpType


### Reindexing

In [77]:
new_data.reset_index(drop = True, inplace = True)

In [78]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96707 entries, 0 to 96706
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Vmag    96707 non-null  float32
 1   Plx     96707 non-null  float32
 2   e_Plx   96707 non-null  float32
 3   B_V     96707 non-null  float32
 4   SpType  96707 non-null  object 
dtypes: float32(4), object(1)
memory usage: 2.2+ MB


In [79]:
new_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96707.0,8.291726,1.247959,-1.44,7.61,8.4,9.07,13.61
Plx,96707.0,7.107774,11.146887,-35.099998,2.52,4.6,8.27,772.330017
e_Plx,96707.0,1.264157,1.15642,0.38,0.87,1.08,1.36,69.07
B_V,96707.0,0.700665,0.490008,-0.4,0.342,0.605,1.072,5.46


In [80]:
df = new_data.copy()

### Conversions

The Plx is in Milliarcsecond (mas) and we need it in arcsecond (arsec)
- 1 mas = 0.00099999995874704 arcsec.
- 1 pc = 1 arsecs (arcsecond)


In [81]:
# converting Plx to arsec

df['Plx'] = df.Plx.map(lambda x : x*0.00099999995874704)

In [82]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96707.0,8.291726,1.247959,-1.44,7.61,8.4,9.07,13.61
Plx,96707.0,0.007108,0.011147,-0.0351,0.00252,0.0046,0.00827,0.77233
e_Plx,96707.0,1.264157,1.15642,0.38,0.87,1.08,1.36,69.07
B_V,96707.0,0.700665,0.490008,-0.4,0.342,0.605,1.072,5.46


To get the light years we need to get the distance modulus from its Plx
- d (parsecs) = 1 / Plx (arcsec)
- 1 parsecs = 3.26 light lyears

In [83]:
# getting te distance modulus
df['Distance'] = df.Plx.map(lambda x : 1/x)

In [84]:
# converting the distance parsec in light years
df['ly'] = df['Distance'].map(lambda x : (abs(x)*3.26))

In [85]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96707.0,8.291726,1.247959,-1.44,7.61,8.4,9.07,13.61
Plx,96707.0,0.007108,0.011147,-0.0351,0.00252,0.0046,0.00827,0.77233
e_Plx,96707.0,1.264157,1.15642,0.38,0.87,1.08,1.36,69.07
B_V,96707.0,0.700665,0.490008,-0.4,0.342,0.605,1.072,5.46
Distance,96707.0,286.335567,3292.193462,-100000.00636,109.89011,202.429156,361.010848,100000.00636
ly,96707.0,1952.241255,10594.701704,4.220994,393.244891,705.627752,1278.431449,326000.020735


### Getting Amag (Absolute Magnitude)

- Amag = Vmag + 5 * log10(Plx + 1)

In [86]:
# getting Absolute Maagnitude
df['Amag'] = df.Vmag + 5 *(np.log10(((df.Plx)))+1)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [87]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96707.0,8.291726,1.247959,-1.44,7.61,8.4,9.07,13.61
Plx,96707.0,0.007108,0.011147,-0.0351,0.00252,0.0046,0.00827,0.77233
e_Plx,96707.0,1.264157,1.15642,0.38,0.87,1.08,1.36,69.07
B_V,96707.0,0.700665,0.490008,-0.4,0.342,0.605,1.072,5.46
Distance,96707.0,286.335567,3292.193462,-100000.00636,109.89011,202.429156,361.010848,100000.00636
ly,96707.0,1952.241255,10594.701704,4.220994,393.244891,705.627752,1278.431449,326000.020735
Amag,93556.0,1.599608,2.326055,-13.31,0.263157,1.511542,3.06466,15.449015


### Getting Temperature Effective

For a perfect black body
- B_V = - 0.72 + (7090 / T)

So
- T = 7090 / (B_V + 0.72)

In [88]:
df['Temperature'] = 7090/(df.B_V + 0.72)

### Getting Luminosity

- Amag = 4.8 - 2.5 * log10(L/Lsun)

So
- L = Lsun * 10 ** ((4.8 - Amag) / 2.5)

Where Lsun is the sun luminosity (3.828e+26)

In [89]:
# getting luminosity by the sun luminosity (3.828e+26)
df['Luminosity'] = df.Amag.map(lambda x: 3.828e+26 * (10**((4.8 - x) / 2.5)))

In [90]:
# considering sun luminosity = 1
df['Luminosity(Sun=1)'] = df.Amag.map(lambda x: 10**((4.8 - x) / 2.5))

### Getting the Stars Mass

- L / Lsun = (M / Msun) ** alpha

Where Msun is the sun mass and alpha = 2/7

In [91]:
# getting mass from sun mass and luminosity = 1 
df['Mass(Sun=1)'] = df['Luminosity(Sun=1)']**2/7

### Getting the Star Radius

- Rsun = 7.35355×10−8 (light-year) / 2.25461×10−8 (parsec)

- Tsun = 5.778 K

- Lsun = 3.828×e+1026 watts

In [92]:
# we can get the star radius from the sun real values:
df['Radius(sun)'] = df.Luminosity* 7.35355e-8*5.778/3.828e+26*df.Temperature

In [93]:
# we can also get the radius from it's luminosity and temperature
df['Radius'] = 1181.57*np.sqrt(df.Luminosity) / df.Temperature

In [94]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Vmag,96707.0,8.291726,1.247959,-1.44,7.61,8.4,9.07,13.61
Plx,96707.0,0.007107773,0.01114689,-0.0351,0.00252,0.0046,0.00827,0.77233
e_Plx,96707.0,1.264157,1.15642,0.38,0.87,1.08,1.36,69.07
B_V,96707.0,0.7006654,0.4900075,-0.4,0.342,0.605,1.072,5.46
Distance,96707.0,286.3356,3292.193,-100000.0,109.8901,202.4292,361.0108,100000.0
ly,96707.0,1952.241,10594.7,4.220994,393.2449,705.6278,1278.431,326000.0
Amag,93556.0,1.599608,2.326055,-13.31,0.2631575,1.511542,3.06466,15.44901
Temperature,96707.0,5685.351,2187.985,1147.249,3956.473,5350.943,6676.083,22156.25
Luminosity,93556.0,8.568881999999999e+29,3.791047e+31,2.105552e+22,1.892813e+27,7.913258e+27,2.49867e+28,6.713855e+33
Luminosity(Sun=1),93556.0,2238.475,99034.67,5.500398e-05,4.944654,20.67204,65.27352,17538810.0


### Getting the stars types

In [95]:
def starType(star):
    dwarf = ['D','VI', 'VII', 'V']
    giant = ['IV', 'III', 'II', 'Ib', 'Ia', 'Ia-O']
    for i in dwarf :
        if i in star:
            return 'Dwarf'
    for i in giant:
        if i in star:
            return 'Giant'
    return 'Other'

In [96]:
df['Star_Type'] = df.SpType.apply(starType)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96707 entries, 0 to 96706
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Vmag               96707 non-null  float32
 1   Plx                96707 non-null  float64
 2   e_Plx              96707 non-null  float32
 3   B_V                96707 non-null  float32
 4   SpType             96707 non-null  object 
 5   Distance           96707 non-null  float64
 6   ly                 96707 non-null  float64
 7   Amag               93556 non-null  float64
 8   Temperature        96707 non-null  float32
 9   Luminosity         93556 non-null  float64
 10  Luminosity(Sun=1)  93556 non-null  float64
 11  Mass(Sun=1)        93556 non-null  float64
 12  Radius(sun)        93556 non-null  float64
 13  Radius             93556 non-null  float64
 14  Star_Type          96707 non-null  object 
dtypes: float32(4), float64(9), object(2)
memory usage: 9.6+ MB


In [98]:
df.Star_Type.value_counts()

Star_Type
Other    49196
Dwarf    27752
Giant    19759
Name: count, dtype: int64

In [99]:
df = df.query('Star_Type != "Other"')
df.Star_Type.value_counts()

Star_Type
Dwarf    27752
Giant    19759
Name: count, dtype: int64

In [100]:
df.isnull().sum()

Vmag                    0
Plx                     0
e_Plx                   0
B_V                     0
SpType                  0
Distance                0
ly                      0
Amag                 1256
Temperature             0
Luminosity           1256
Luminosity(Sun=1)    1256
Mass(Sun=1)          1256
Radius(sun)          1256
Radius               1256
Star_Type               0
dtype: int64

In [101]:
df_clean = df.dropna()

In [102]:
df_clean.isnull().sum()

Vmag                 0
Plx                  0
e_Plx                0
B_V                  0
SpType               0
Distance             0
ly                   0
Amag                 0
Temperature          0
Luminosity           0
Luminosity(Sun=1)    0
Mass(Sun=1)          0
Radius(sun)          0
Radius               0
Star_Type            0
dtype: int64

In [103]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46255 entries, 1 to 96705
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Vmag               46255 non-null  float32
 1   Plx                46255 non-null  float64
 2   e_Plx              46255 non-null  float32
 3   B_V                46255 non-null  float32
 4   SpType             46255 non-null  object 
 5   Distance           46255 non-null  float64
 6   ly                 46255 non-null  float64
 7   Amag               46255 non-null  float64
 8   Temperature        46255 non-null  float32
 9   Luminosity         46255 non-null  float64
 10  Luminosity(Sun=1)  46255 non-null  float64
 11  Mass(Sun=1)        46255 non-null  float64
 12  Radius(sun)        46255 non-null  float64
 13  Radius             46255 non-null  float64
 14  Star_Type          46255 non-null  object 
dtypes: float32(4), float64(9), object(2)
memory usage: 4.9+ MB


### Dropping outliers

In [104]:
summary = df_clean.describe().T

In [105]:
summary['IQR'] = summary['75%'] - summary['25%']
summary.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR
Vmag,46255.0,7.939285,1.313934,-0.62,7.22,8.18,8.85,12.79,1.630001
Plx,46255.0,0.007884,0.012689,1e-05,0.00273,0.00496,0.00905,0.77233,0.00632
e_Plx,46255.0,1.111432,0.770356,0.38,0.8,0.99,1.24,40.630001,0.44
B_V,46255.0,0.691953,0.498072,-0.4,0.323,0.617,1.074,3.44,0.751
Distance,46255.0,459.975018,2468.987387,1.294783,110.49724,201.61291,366.300379,100000.00636,255.803139


In [106]:
summary['cutoff'] = round(summary.IQR*1.6, 3)
summary.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR,cutoff
Vmag,46255.0,7.939285,1.313934,-0.62,7.22,8.18,8.85,12.79,1.630001,2.608
Plx,46255.0,0.007884,0.012689,1e-05,0.00273,0.00496,0.00905,0.77233,0.00632,0.01
e_Plx,46255.0,1.111432,0.770356,0.38,0.8,0.99,1.24,40.630001,0.44,0.704
B_V,46255.0,0.691953,0.498072,-0.4,0.323,0.617,1.074,3.44,0.751,1.202
Distance,46255.0,459.975018,2468.987387,1.294783,110.49724,201.61291,366.300379,100000.00636,255.803139,409.285


In [107]:
summary['lw']=round(summary['25%']-summary.cutoff, 3)
summary['rw']=round(summary['75%']+summary.cutoff, 3)
summary.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,IQR,cutoff,lw,rw
Vmag,46255.0,7.939285,1.313934,-0.62,7.22,8.18,8.85,12.79,1.630001,2.608,4.612,11.458
Plx,46255.0,0.007884,0.012689,1e-05,0.00273,0.00496,0.00905,0.77233,0.00632,0.01,-0.007,0.019
e_Plx,46255.0,1.111432,0.770356,0.38,0.8,0.99,1.24,40.630001,0.44,0.704,0.096,1.944
B_V,46255.0,0.691953,0.498072,-0.4,0.323,0.617,1.074,3.44,0.751,1.202,-0.879,2.276
Distance,46255.0,459.975018,2468.987387,1.294783,110.49724,201.61291,366.300379,100000.00636,255.803139,409.285,-298.788,775.585


In [124]:
# code to drop the outliers

In [114]:
df_clean.shape

(46255, 15)

In [119]:
df_clean.loc[df_clean.B_V <= -0.33, 'Color_Type'] = 'O'
df_clean.loc[(df_clean.B_V > -0.33 ) & (df_clean.B_V < -0.02) , 'Color_Type'] = 'B'
df_clean.loc[(df_clean.B_V >= -0.02) & (df_clean.B_V < 0.3) , 'Color_Type'] = 'A'
df_clean.loc[(df_clean.B_V >= 0.3) & (df_clean.B_V < 0.58) , 'Color_Type'] = 'F'
df_clean.loc[(df_clean.B_V >= 0.58) & (df_clean.B_V < 0.81) , 'Color_Type'] = 'G'
df_clean.loc[(df_clean.B_V >= 0.81) & (df_clean.B_V < 1.4 ) , 'Color_Type'] = 'K'
df_clean.loc[df_clean.B_V >= 1.4,  'Color_Type'] = 'M'

In [123]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46255 entries, 1 to 96705
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Vmag               46255 non-null  float32
 1   Plx                46255 non-null  float64
 2   e_Plx              46255 non-null  float32
 3   B_V                46255 non-null  float32
 4   SpType             46255 non-null  object 
 5   Distance           46255 non-null  float64
 6   ly                 46255 non-null  float64
 7   Amag               46255 non-null  float64
 8   Temperature        46255 non-null  float32
 9   Luminosity         46255 non-null  float64
 10  Luminosity(Sun=1)  46255 non-null  float64
 11  Mass(Sun=1)        46255 non-null  float64
 12  Radius(sun)        46255 non-null  float64
 13  Radius             46255 non-null  float64
 14  Star_Type          46255 non-null  object 
 15  Color_Type         46255 non-null  object 
dtypes: float32(4), float64(9), 