In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize
import warnings

In [2]:
data_raw = pd.read_csv("anemia.csv")
data_raw.head(15)

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
0,1,14.9,22.7,29.1,83.7,0
1,0,15.9,25.4,28.3,72.0,0
2,0,9.0,21.5,29.6,71.2,1
3,0,14.9,16.0,31.4,87.5,0
4,1,14.7,22.0,28.2,99.5,0
5,0,11.6,22.3,30.9,74.5,1
6,1,12.7,19.5,28.9,82.9,1
7,1,12.7,28.5,28.2,92.3,1
8,0,14.1,29.7,30.5,75.2,0
9,1,14.9,25.8,31.3,82.9,0


In [3]:
data_raw.info()
data_raw.count()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      1421 non-null   int64  
 1   Hemoglobin  1421 non-null   float64
 2   MCH         1421 non-null   float64
 3   MCHC        1421 non-null   float64
 4   MCV         1421 non-null   float64
 5   Result      1421 non-null   int64  
dtypes: float64(4), int64(2)
memory usage: 66.7 KB


Gender        1421
Hemoglobin    1421
MCH           1421
MCHC          1421
MCV           1421
Result        1421
dtype: int64

In [4]:
print("There is {} duplicated values in data frame".format(data_raw.duplicated().sum()))
print("Data columns with null value: \n{} ".format(data_raw.isnull().sum()))

There is 887 duplicated values in data frame
Data columns with null value: 
Gender        0
Hemoglobin    0
MCH           0
MCHC          0
MCV           0
Result        0
dtype: int64 


In [5]:
duplicated = data_raw[data_raw.duplicated(keep=False)]
duplicated = duplicated.sort_values(by=['Gender', "Hemoglobin", "MCH",'MCHC','MCV',"Result"], ascending= False) 
duplicated.head()

Unnamed: 0,Gender,Hemoglobin,MCH,MCHC,MCV,Result
115,1,16.9,24.2,32.1,92.5,0
352,1,16.9,24.2,32.1,92.5,0
589,1,16.9,24.2,32.1,92.5,0
138,1,16.8,24.3,30.5,90.7,0
375,1,16.8,24.3,30.5,90.7,0


In [6]:
data_raw.drop_duplicates(inplace=True)
print("There is {} duplicated values in data frame".format(data_raw.duplicated().sum()))

print("There is {} missing values in data frame".format(data_raw.isnull().sum().sum()))

There is 0 duplicated values in data frame
There is 0 missing values in data frame


In [7]:
x = data_raw.copy(deep=True)
print(x.describe())

           Gender  Hemoglobin         MCH        MCHC         MCV      Result
count  534.000000  534.000000  534.000000  534.000000  534.000000  534.000000
mean     0.522472   13.287079   22.911985   30.249438   85.647004    0.462547
std      0.499963    2.066276    3.948482    1.412312    9.604934    0.499063
min      0.000000    6.600000   16.000000   27.800000   69.400000    0.000000
25%      0.000000   11.600000   19.500000   29.000000   77.325000    0.000000
50%      1.000000   13.100000   22.750000   30.400000   85.450000    0.000000
75%      1.000000   14.975000   26.100000   31.475000   94.150000    1.000000
max      1.000000   16.900000   30.000000   32.500000  101.600000    1.000000


In [8]:
s_list = ["Gender", "Hemoglobin", "MCH", "MCHC", "MCV"]
res = "Result"
warnings.filterwarnings('ignore')
def standartization(x):
    x_std = x.copy(deep=True)
    for column in s_list:
        x_std[column] = (x_std[column]-x_std[column].mean())/x_std[column].std()
    return x_std 
x_std=standartization(x)
print(x_std.head())

     Gender  Hemoglobin       MCH      MCHC       MCV  Result
0  0.955127    0.780594 -0.053688 -0.813870 -0.202709       0
1 -1.045021    1.264556  0.630119 -1.380317 -1.420833       0
2 -1.045021   -2.074785 -0.357602 -0.459841 -1.504123       1
3 -1.045021    0.780594 -1.750542  0.814666  0.192921       0
4  0.955127    0.683801 -0.230971 -1.451123  1.442279       0


In [9]:
y = x["Result"]
y.shape
x.drop("Result", axis=1,inplace=True)
x.head()
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

models = []

models.append(("Logistic Regression", LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVR', SVC()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('RandomForest', RandomForestClassifier()))
models.append(('GradientBoosting', GradientBoostingClassifier()))
models.append(('XGBoost', XGBClassifier()))

for name,model in models:
    mod = model.fit(x_train,y_train) #trainleri modele fit etmek
    y_pred = mod.predict(x_test) # tahmin
    acc = accuracy_score(y_test, y_pred) #rmse hesabı
    print("Classification Model: ",end=" ")
    print(name) #yazdırılacak kısım
    print("AccuracyScore: ",end=" ")
    print(acc)
    print("----------------------------------------------------------")

Classification Model:  Logistic Regression
AccuracyScore:  0.9813084112149533
----------------------------------------------------------
Classification Model:  KNN
AccuracyScore:  0.822429906542056
----------------------------------------------------------
Classification Model:  SVR
AccuracyScore:  0.8598130841121495
----------------------------------------------------------
Classification Model:  Decision Tree
AccuracyScore:  1.0
----------------------------------------------------------
Classification Model:  RandomForest
AccuracyScore:  1.0
----------------------------------------------------------
Classification Model:  GradientBoosting
AccuracyScore:  1.0
----------------------------------------------------------
Classification Model:  XGBoost
AccuracyScore:  1.0
----------------------------------------------------------


In [11]:
model=XGBClassifier()

mod = model.fit(x_train,y_train) 
y_pred = mod.predict(x_test) 

acc = accuracy_score(y_test, y_pred)
print(acc)

1.0


In [12]:
from sklearn.metrics import precision_score, recall_score
def f2_score(y_true, y_pred, beta=2):
    
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    f2_score = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    
    return f2_score

In [13]:
print("F2 Score:", f2_score(y_test, y_pred))

F2 Score: 1.0


In [14]:
# import pickle
# with open('Anemia_model.pkl', 'wb') as file:
#     pickle.dump(mod, file)
