1. Veri seti okuma

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Load the raw data set and, in the same step, discard the columns that are
# not needed for modelling ('Patient_ID' and 'Diagnosis').
df = (
    pd.read_csv("datasets/thyroid_cancer_risk_data.csv")
    .drop(columns=['Patient_ID', 'Diagnosis'])
)

# Preview the first rows of the remaining columns.
df.head()

Unnamed: 0,Age,Gender,Country,Ethnicity,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,Nodule_Size,Thyroid_Cancer_Risk
0,66,Male,Russia,Caucasian,No,Yes,No,No,No,No,9.37,1.67,6.16,1.08,Low
1,29,Male,Germany,Hispanic,No,Yes,No,No,No,No,1.83,1.73,10.54,4.05,Low
2,86,Male,Nigeria,Caucasian,No,No,No,No,No,No,6.26,2.59,10.57,4.61,Low
3,75,Female,India,Asian,No,No,No,No,No,No,4.1,2.62,11.04,2.46,Medium
4,35,Female,Germany,African,Yes,Yes,No,No,No,No,9.1,2.11,10.71,2.11,High


2. Hedef değişkeni sayısallaştırma

In [8]:
# Ordinal encoding of the target: Low = 0, Medium = 1, High = 2
risk_map = {'Low': 0, 'Medium': 1, 'High': 2}
risk_encoded = df['Thyroid_Cancer_Risk'].map(risk_map)

# Series.map turns every value that is not a key of risk_map into NaN without
# any warning. That happens for unexpected labels and also when this cell is
# run a second time (the column then already holds 0/1/2), which would wipe the
# whole target. Fail loudly before touching df instead.
unmapped_labels = df.loc[risk_encoded.isna(), 'Thyroid_Cancer_Risk'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Thyroid_Cancer_Risk contains values that are not in risk_map "
        f"(missing data, or the column is already encoded): {list(unmapped_labels)}"
    )

df['Thyroid_Cancer_Risk'] = risk_encoded.astype(int)

# Class distribution of the encoded target.
df['Thyroid_Cancer_Risk'].value_counts()

Thyroid_Cancer_Risk
0    108388
1     72400
2     31903
Name: count, dtype: int64

3. Kategorik sütunlara One-Hot ve Label Encoder uygulama

In [9]:
# Columns that get one-hot encoded (many categories, or no natural order)
one_hot_cols = ['Country', 'Ethnicity', 'Gender']

# Two-valued categorical columns that get label-encoded
binary_cols = [
    'Family_History', 'Radiation_Exposure', 'Iodine_Deficiency',
    'Smoking', 'Obesity', 'Diabetes'
]

# Label encoding.
# Fit a separate LabelEncoder per column and keep each one in label_encoders.
# Refitting a single shared encoder would overwrite its learned classes on
# every iteration, leaving only the last column's mapping available for
# inverse_transform or for encoding new data later.
label_encoders = {}
for col in binary_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# One-hot encoding; drop_first=True drops the first dummy of every column so
# the remaining indicator columns are not perfectly collinear.
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

df.head()

Unnamed: 0,Age,Family_History,Radiation_Exposure,Iodine_Deficiency,Smoking,Obesity,Diabetes,TSH_Level,T3_Level,T4_Level,...,Country_Nigeria,Country_Russia,Country_South Korea,Country_UK,Country_USA,Ethnicity_Asian,Ethnicity_Caucasian,Ethnicity_Hispanic,Ethnicity_Middle Eastern,Gender_Male
0,66,0,1,0,0,0,0,9.37,1.67,6.16,...,False,True,False,False,False,False,True,False,False,True
1,29,0,1,0,0,0,0,1.83,1.73,10.54,...,False,False,False,False,False,False,False,True,False,True
2,86,0,0,0,0,0,0,6.26,2.59,10.57,...,True,False,False,False,False,False,True,False,False,True
3,75,0,0,0,0,0,0,4.1,2.62,11.04,...,False,False,False,False,False,True,False,False,False,False
4,35,1,1,0,0,0,0,9.1,2.11,10.71,...,False,False,False,False,False,False,False,False,False,False


4. Özellik ve hedefi ayırma

In [10]:
# The target is the encoded risk column; every other column is a feature.
y = df.loc[:, 'Thyroid_Cancer_Risk']
X = df.drop('Thyroid_Cancer_Risk', axis='columns')

5. Eğitim ve test seti ayırma

In [11]:
# Hold out 20% of the rows as the test set. The three risk classes are
# imbalanced, so stratify on y to keep the same class proportions in the train
# and test splits; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

6. Sayısal verilerin ölçeklendirilmesi (StandardScaler)

In [12]:
# Learn the per-column scaling statistics from the training split only, then
# apply that same fitted scaler to both splits so no test-set information is
# used when fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

7. Ön işlenmiş veriyi kaydetme

In [13]:
# The scaled feature matrices carry no column labels, so rebuild labelled
# frames from X's column names. The target is attached by position (as a plain
# array) because the split targets still carry their original row index.
train_df = pd.DataFrame(X_train_scaled, columns=X.columns).assign(
    Thyroid_Cancer_Risk=y_train.to_numpy()
)
test_df = pd.DataFrame(X_test_scaled, columns=X.columns).assign(
    Thyroid_Cancer_Risk=y_test.to_numpy()
)

# Persist both splits without the positional index column.
train_df.to_csv("datasets/processed_train_data.csv", index=False)
test_df.to_csv("datasets/processed_test_data.csv", index=False)

print("✅ Veri işleme tamamlandı ve dosyalar kaydedildi.")

✅ Veri işleme tamamlandı ve dosyalar kaydedildi.
