<a href="https://colab.research.google.com/github/annykay/NoiseInDataImpact/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing
<br>
This notebook presents the data preparation steps (feature selection and modification, outlier removal, etc.) together with the hyperparameter optimization of the ML classification models. The main goal of this section is to generate the dataframes and optimal models that are later used for the noise-introduction simulations.
<br>
<br>
There are 4 different datasets and 5 ML models (DecisionTree, RandomForest, XGB, LogisticRegression, KNearestNeighbors).

In [None]:
# Imports (add any missing imports you need here)

import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor, KNeighborsClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

### Stellar classification Dataset
Source: https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17
<br>
Preparation pipeline is partially based on: https://www.kaggle.com/code/beyzanks/stellar-classification-98-4-acc-100-auc/notebook 

In [None]:
# Load the raw stellar classification table and encode the class labels as integers

# NOTE(review): machine-specific absolute path — point this at your own copy of the CSV.
raw_csv_path = r'C:\\Users\gangs\Downloads\star_classification.csv'
df = pd.read_csv(raw_csv_path, engine='python')

# GALAXY -> 0, STAR -> 1, every other label -> 2
# (presumably only QSO remains in this dataset — verify against the data).
class_codes = {"GALAXY": 0, "STAR": 1}
df["class"] = [class_codes.get(label, 2) for label in df["class"]]

In [None]:
# Deleting outliers
#
# Fit a Local Outlier Factor model (default settings) on every column currently
# in `df` and discard the rows whose negative outlier factor is below the threshold.

clf = LocalOutlierFactor()
clf.fit(df)
x_score = clf.negative_outlier_factor_  # one score per row of `df`, in row order
threshold2 = -1.5

# The scores are aligned with `df` by POSITION, not by index label. Selecting with a
# boolean mask keeps this cell correct even when `df` no longer has a pristine
# 0..n-1 index (e.g. after an earlier filtering step or when the cell is re-run).
is_outlier = x_score < threshold2
outlier_index = df.index[is_outlier].tolist()  # index labels of the removed rows
df = df.loc[~is_outlier]

In [None]:
# Remove the features that are uncorrelated with the target variable

uncorrelated_columns = [
    'obj_ID', 'alpha', 'delta', 'run_ID',
    'rerun_ID', 'cam_col', 'field_ID', 'fiber_ID',
]
df = df.drop(columns=uncorrelated_columns)

In [None]:
# Balance the data by downsampling the major class (label 0)

# One sub-frame per class label.
df_0, df_1, df_2 = (df[df['class'] == label] for label in (0, 1, 2))

# Keep 17000 randomly chosen rows of class 0 (drawn without replacement, fixed seed)
# and stack them with the untouched rows of the other two classes.
df_0_downsampled = resample(df_0, n_samples=17000, replace=False, random_state=123)
df_downsampled = pd.concat([df_0_downsampled, df_1, df_2])

In [None]:
# Separate the feature columns from the target labels
stellar_y = df_downsampled['class'].to_numpy()
stellar_x = df_downsampled.drop(columns='class')

In [None]:
# Train-test splitting and data scaling
#
# Split BEFORE scaling: the scaler must learn its mean / standard deviation from the
# training rows only, otherwise statistics of the test rows leak into the training data
# and the reported test scores become optimistic.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    stellar_x, stellar_y, test_size=0.3, random_state=123
)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Keep `stellar_x` as a scaled array (as before), now using the train-only statistics.
stellar_x = scaler.transform(stellar_x)

In [None]:
# Models optimization

def tune_and_report(title, estimator, param_grid, X_fit, y_fit, X_eval, y_eval):
    """Grid-search one classifier and print its best parameters and F1 scores.

    Parameters
    ----------
    title : str
        Label printed as the first line of the report.
    estimator : classifier
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyperparameter grid to search.
    X_fit, y_fit : array-like
        Data used for the 3-fold cross-validated search; also used for the
        reported "Train score".
    X_eval, y_eval : array-like
        Held-out data used for the reported "Test score".

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(estimator, param_grid, scoring='f1_micro', cv=3, n_jobs=-1)
    search.fit(X_fit, y_fit)
    print('{}\nbest parameters:'.format(title), search.best_params_)
    print('Train score: {}\nTest score: {}'.format(
        f1_score(y_fit, search.predict(X_fit), average='micro'),
        f1_score(y_eval, search.predict(X_eval), average='micro')
    ))
    return search


# (report title, estimator, hyperparameter grid) for every model, in execution order.
# The order of the values inside each grid is kept exactly as it was.
model_specs = [
    (
        'RandomForestClassifier',
        RandomForestClassifier(random_state=0),
        {'max_depth': list(range(12, 15)), 'n_estimators': [80, 90, 100]},
    ),
    (
        'DecisionTree',
        DecisionTreeClassifier(random_state=0),
        {'max_depth': list(range(7, 11))},
    ),
    (
        'KNearestNeigbors',
        KNeighborsClassifier(),
        {'n_neighbors': list(range(1, 5))},
    ),
    (
        'XGBoostClassifier',
        XGBClassifier(random_state=0),
        {'max_depth': [15, 10, 12], 'n_estimators': [100, 80, 120], 'learning_rate': [0.1, 0.2, 0.05]},
    ),
    (
        'Logisticregression',
        LogisticRegression(random_state=0, max_iter=1000),
        {'C': np.arange(1900, 2200, 10)},
    ),
]

fitted_searches = []
for position, (title, estimator, param_grid) in enumerate(model_specs):
    if position > 0:
        print()  # blank line between consecutive reports
    fitted_searches.append(
        tune_and_report(title, estimator, param_grid, X_train, y_train, X_test, y_test)
    )

# Expose the fitted searches under their usual names for the following cells.
RandomForest, DecisionTree, KNN, XGB, LogReg = fitted_searches

RandomForestClassifier
best parameters: {'max_depth': 14, 'n_estimators': 90}
Train score: 0.9925838860437131
Test score: 0.9770812928501469

DecisionTree
best parameters: {'max_depth': 9}
Train score: 0.9817255757982817
Test score: 0.9714005876591577

KNearestNeigbors
best parameters: {'n_neighbors': 3}
Train score: 0.9650463157305572
Test score: 0.9463271302644466

XGBoostClassifier
best parameters: {'learning_rate': 0.1, 'max_depth': 12, 'n_estimators': 100}
Train score: 0.9993563372792656
Test score: 0.9756447926869083

Logisticregression
best parameters: {'C': 2010}
Train score: 0.9584417765091092
Test score: 0.9598432908912831


In [None]:
# NOTE(review): leftover scratch cell — it only prints a placeholder string and can be removed.
print("kek")