In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [2]:
from catboost import CatBoostClassifier
import pandas as pd

In [11]:
!ls sample_data

anscombe.json  california_housing_test.csv   mnist_test.csv	    README.md
astra.csv      california_housing_train.csv  mnist_train_small.csv  star_classification.csv


In [12]:
df = pd.read_csv('sample_data/star_classification.csv')

In [13]:
df.head()

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79.0,6.543777e+18,GALAXY,0.634794,5812.0,56354.0,171.0
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119.0,1.176014e+19,GALAXY,0.779136,10445.0,58158.0,427.0
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120.0,5.1522e+18,GALAXY,0.644195,4576.0,55592.0,299.0
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214.0,1.030107e+19,GALAXY,0.932346,9149.0,58039.0,775.0
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137.0,6.891865e+18,GALAXY,0.116123,6121.0,56187.0,842.0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

In [None]:
df.nunique ()

obj_ID          78053
alpha           99999
delta           99999
u               93748
g               92651
r               91901
i               92019
z               92007
run_ID            430
rerun_ID            1
cam_col             6
field_ID          856
spec_obj_ID    100000
class               3
redshift        99295
plate            6284
MJD              2180
fiber_ID         1000
dtype: int64

## Background

This dataset contains over 100,000 observations separated into three class types: Galaxies, Quasars, and Stars. In addition to the class,
each observation has 17 other defining features, several of which are different identification types that were irrelevant for the purposes
of this analysis. For completeness, I will list them all where now:

* obj_ID = Object Identifier, the unique value that identifies the object in the image catalog used by the CAS.

* alpha = Right Ascension angle (at J2000 epoch). This is the angle (in degrees) between the Vernal Equinox
and the desired point on the celestial sphere.

* delta = Declination angle (at J2000 epoch). The is the angle (in degrees) between the celestial equator and
the desired point on the celestial sphere.

* u = Ultraviolet filter in the photometric system. 3543 Angstroms (354.3 nm)

* g = Green filter in the photometric system. 4770 Angstroms (477.0 nm)

* r = Red filter in the photometric system. 6231 Angstroms (623.1 nm)

* i = Near Infrared filter in the photometric system. 7625 Angstroms (762.5 nm)

* z = Infrared filter in the photometric system. 9134 Angstroms (913.4 nm)

* run_ID = Run Number used to identify the specific scan.

* rereun_ID = Rerun Number to specify how the image was processed.

* cam_col = Camera column to identify the scanline within the run.

* field_ID = Field number to identify each field.

* spec_obj_ID = Unique ID used for optical spectroscopic objects (this means that 2 different observations with the same spec_obj_ID must share the output class).

* class = Object class (galaxy, star, or quasar object).

* redshift = Redshift value based on the increase in wavelength. The more red shifted light is, the further it has traveled from its
point of origin
Значение красного смещения, основанное на увеличении длины волны. Чем больше смещение света в красный цвет, тем дальше он удалился от своей исходной точки

* plate = Plate ID, identifies each plate in SDSS.

* MJD = Modified Julian Date, used to indicate when a given piece of SDSS data was taken.

* fiber_ID = Fiber ID that identifies the fiber that pointed the light at the focal plane in each observation.
Идентификатор fiber, который идентифицируюет fiber, направившее свет в фокальную плоскость при каждом наблюдении.

As explained earlier, the ID attributes were removed from the table before I began looking for trends.


In [8]:
df.describe()

Unnamed: 0.1,Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,redshift,plate,MJD,fiber_ID
count,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0,6259.0
mean,12522.40965,177.301013,22.803363,22.035703,20.624414,19.66489,19.119157,18.803348,3.536188,0.590081,5139.240454,55589.647228,448.95846
std,7234.712544,99.696984,19.489059,2.208166,2.024459,1.856765,1.76698,1.77907,1.594201,0.77576,2959.534656,1813.947643,267.298548
min,3.0,0.024258,-16.713711,12.2624,10.51139,10.06854,11.29956,10.22551,1.0,-0.004016,266.0,51630.0,1.0
25%,6262.5,124.318935,3.359029,20.36695,19.02363,18.11313,17.72259,17.47115,2.0,0.042061,2528.5,54232.0,226.0
50%,12417.0,178.918109,21.250905,22.10598,21.05844,20.142,19.44065,19.03513,4.0,0.394683,5005.0,55888.0,441.0
75%,18854.5,235.919281,37.979984,23.606445,22.107225,21.07331,20.4632,19.9753,5.0,0.717195,7415.0,56780.0,637.0
max,25033.0,359.99981,82.764421,29.23438,27.26466,27.39709,25.98882,25.33364,6.0,7.011245,12547.0,58932.0,1000.0


In [14]:
df.columns

Index(['obj_ID', 'alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'run_ID',
       'rerun_ID', 'cam_col', 'field_ID', 'spec_obj_ID', 'class', 'redshift',
       'plate', 'MJD', 'fiber_ID'],
      dtype='object')

In [15]:
# уничтожаем так как одно и тоже число rerun_ID
# а та же друге ID
X = df.drop(['rerun_ID', 'field_ID', 'spec_obj_ID', 'obj_ID', 'run_ID'], axis=1)
X.head()

Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,class,redshift,plate,MJD,fiber_ID
0,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,2,GALAXY,0.634794,5812.0,56354.0,171.0
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,5,GALAXY,0.779136,10445.0,58158.0,427.0
2,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,2,GALAXY,0.644195,4576.0,55592.0,299.0
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,3,GALAXY,0.932346,9149.0,58039.0,775.0
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,3,GALAXY,0.116123,6121.0,56187.0,842.0


In [16]:
from sklearn.model_selection import train_test_split

y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75,
                                                    random_state=42)

In [17]:
# X_train.head()
X_test2 = X_test.drop('class', axis=1)
X_test2.columns

Index(['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'cam_col', 'redshift',
       'plate', 'MJD', 'fiber_ID'],
      dtype='object')

In [18]:
y_train.head(3)

62241       QSO
1513        QSO
56064    GALAXY
Name: class, dtype: object

In [19]:
from catboost import FeaturesData



In [20]:
# model = CatBoostClassifier( loss_function='MultiLogloss',
#                             eval_metric='HammingLoss',
#                            random_seed=21,
#                             iterations=500,
#                             class_names=['GALAXY', 'QSO', 'STAR'])
# # Fit model
train_label = ['obj_ID', 'alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'run_ID',
               'cam_col', 'field_ID', 'spec_obj_ID', 'redshift', 'plate', 'MJD', 'fiber_ID']
cat_features = [ 'run_ID',  'cam_col', 'field_ID', 'plate', 'MJD', 'fiber_ID']

# Get predicted probabilities for each class
#preds_proba = model.predict_proba(eval_data)
# Get predicted RawFormulaVal
#preds_raw = model.predict(eval_data, prediction_type='RawFormulaVal')


X_train2.head()

NameError: ignored

In [21]:
X_tarin2 = X_train.drop(['class'], axis=1)
X_tarin2.head(3)


Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,redshift,plate,MJD,fiber_ID
62241,5.809868,-0.078252,23.96439,22.59558,21.69461,21.43943,21.38774,3,3.518984,7864.0,56979.0,34.0
1513,9.155493,26.331388,18.77055,18.31045,17.94533,17.48212,17.3867,1,0.082576,7660.0,57357.0,928.0
56064,227.197354,9.863035,20.43842,18.39553,17.46085,16.98626,16.61003,5,0.107144,1718.0,53850.0,101.0


In [22]:
model = CatBoostClassifier( cat_features=None ) # классификатор
model.fit(X_tarin2, y_train) # обучение классификатора

Learning rate set to 0.097585
0:	learn: 1.1397412	total: 138ms	remaining: 2m 18s
1:	learn: 0.9704558	total: 218ms	remaining: 1m 48s
2:	learn: 0.8390983	total: 297ms	remaining: 1m 38s
3:	learn: 0.7398287	total: 379ms	remaining: 1m 34s
4:	learn: 0.6567347	total: 465ms	remaining: 1m 32s
5:	learn: 0.5868335	total: 551ms	remaining: 1m 31s
6:	learn: 0.5290802	total: 634ms	remaining: 1m 29s
7:	learn: 0.4802144	total: 718ms	remaining: 1m 29s
8:	learn: 0.4374632	total: 805ms	remaining: 1m 28s
9:	learn: 0.4010073	total: 887ms	remaining: 1m 27s
10:	learn: 0.3682557	total: 971ms	remaining: 1m 27s
11:	learn: 0.3407310	total: 1.02s	remaining: 1m 24s
12:	learn: 0.3157022	total: 1.06s	remaining: 1m 20s
13:	learn: 0.2944319	total: 1.1s	remaining: 1m 17s
14:	learn: 0.2752806	total: 1.14s	remaining: 1m 14s
15:	learn: 0.2577089	total: 1.19s	remaining: 1m 13s
16:	learn: 0.2418038	total: 1.23s	remaining: 1m 11s
17:	learn: 0.2286602	total: 1.28s	remaining: 1m 9s
18:	learn: 0.2167276	total: 1.32s	remaining: 1

<catboost.core.CatBoostClassifier at 0x795e1764ccd0>

In [23]:
X_test2.head(3)

Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,redshift,plate,MJD,fiber_ID
15727,239.353964,55.391238,20.1538,19.74234,19.31396,19.29645,19.38161,5,1.143417,8413.0,57897.0,681.0
70184,341.154029,5.727165,22.32511,21.90339,21.75924,21.40544,21.48124,1,0.836712,11305.0,58449.0,574.0
26773,181.324623,49.883392,21.87684,20.41339,19.81394,19.56042,19.30338,3,-0.000173,2919.0,54537.0,195.0


In [24]:
# Get predicted classes
predict = model.predict(X_test2)
pred = pd.DataFrame(predict)
pred.head()

Unnamed: 0,0
0,QSO
1,QSO
2,STAR
3,GALAXY
4,GALAXY


In [25]:
X_test.head()

Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,class,redshift,plate,MJD,fiber_ID
15727,239.353964,55.391238,20.1538,19.74234,19.31396,19.29645,19.38161,5,QSO,1.143417,8413.0,57897.0,681.0
70184,341.154029,5.727165,22.32511,21.90339,21.75924,21.40544,21.48124,1,QSO,0.836712,11305.0,58449.0,574.0
26773,181.324623,49.883392,21.87684,20.41339,19.81394,19.56042,19.30338,3,STAR,-0.000173,2919.0,54537.0,195.0
78435,129.420965,18.60107,19.64541,18.14898,17.4537,17.09736,16.7788,1,GALAXY,0.0951,2275.0,53709.0,68.0
20544,146.375642,19.259911,19.75237,18.76292,18.199,17.85236,17.75901,2,GALAXY,0.149367,2362.0,53759.0,251.0


In [26]:
X_test.head()


Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,class,redshift,plate,MJD,fiber_ID
15727,239.353964,55.391238,20.1538,19.74234,19.31396,19.29645,19.38161,5,QSO,1.143417,8413.0,57897.0,681.0
70184,341.154029,5.727165,22.32511,21.90339,21.75924,21.40544,21.48124,1,QSO,0.836712,11305.0,58449.0,574.0
26773,181.324623,49.883392,21.87684,20.41339,19.81394,19.56042,19.30338,3,STAR,-0.000173,2919.0,54537.0,195.0
78435,129.420965,18.60107,19.64541,18.14898,17.4537,17.09736,16.7788,1,GALAXY,0.0951,2275.0,53709.0,68.0
20544,146.375642,19.259911,19.75237,18.76292,18.199,17.85236,17.75901,2,GALAXY,0.149367,2362.0,53759.0,251.0


In [None]:
X.iloc[[75721,80184,19864,76699,92991]]

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
75721,1.237679e+18,16.95689,3.64613,23.33542,21.95143,20.48149,19.603,19.13094,7712,6,442,4.855017e+18,GALAXY,0.506237,4312,55511,495
80184,1.237662e+18,240.06324,6.134131,17.86033,16.79228,16.43001,16.30923,16.25873,3894,1,243,2.448928e+18,STAR,0.000345,2175,54612,348
19864,1.237679e+18,30.887222,1.18871,18.18911,16.89469,16.42161,16.24627,16.18549,7717,1,536,8.255357e+18,STAR,4e-06,7332,56683,943
76699,1.237668e+18,247.594401,10.88778,24.99961,21.71203,21.47148,21.30532,21.29109,5323,1,134,4.577999e+18,STAR,-0.000291,4066,55444,326
92991,1.237679e+18,18.896451,-5.26133,23.76648,21.79737,20.69543,20.23403,19.97464,7881,3,148,8.910472e+18,STAR,-0.000136,7914,57331,363


In [27]:
from catboost.utils import eval_metric



In [28]:

#res = pd.DataFrame(y_test).join( pd.DataFrame(predict))
# test_pool = Pool( y_test, predict)
metric = eval_metric(y_test, predict, 'Precision')
print(metric)

KeyError: ignored

In [None]:
for metric in ('Precision', 'Recall', 'F1'):
    print(metric)
    values = eval_metric(y_test, predict, metric)
    for model, value in zip(clf.classes_, values):
        print(f'class={cls}: {value:.4f}')
    print()

Precision


KeyError: 0

In [30]:
model.save_model('astra.cbm')

In [31]:
!ls

astra.cbm  astra.json  catboost_info  sample_data


In [32]:
X_test.to_csv('astra.csv')

In [None]:
ff = model.get_feature_importance()
ff

array([ 0.27624387,  0.27778713,  0.1622283 ,  0.10247422,  0.1581494 ,
        0.25616265,  0.21896545,  0.29039967,  0.27757768,  0.35941504,
        0.16894315, 91.3056414 ,  5.55423445,  0.15600785,  0.17002827,
        0.26574147])

In [None]:
nn = model.get_feature_names()
nn

AttributeError: 'CatBoostClassifier' object has no attribute 'get_feature_names'