In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
# Suppress warnings for the rest of the session: with only the action given,
# the filter matches every warning category.
# NOTE(review): this also hides deprecation and model-fitting warnings — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Show the kernel's working directory; relative data paths resolve against it.
current_dir = os.getcwd()
print("현재 위치: ", current_dir)

현재 위치:  c:\Users\SAMSUNG\Desktop\Thyroid Project\thyroid-project\Notebook


In [3]:
# First guess at the dataset location: a data folder one level above the working directory.
relative_path = '../data/thyroid+disease/new-thyroid.data'
path_found = os.path.exists(relative_path)
print("파일 존재:", path_found)

파일 존재: False


In [4]:
# Candidate locations of the dataset folder, relative to the working directory.
possible_paths = [
    '../data/thyroid+disease/',
    './data/thyroid+disease/',
    'data/thyroid+disease/',
    '../../data/thyroid+disease/'
]
# Report the first candidate that is an existing directory, list its contents,
# and stop searching.
for path in possible_paths:
    # isdir (rather than exists) guarantees os.listdir below gets a directory.
    if os.path.isdir(path):
        print(f"데이터 폴더 찾음: {path}")
        print("내용:")
        # os.listdir returns entries in arbitrary order; sort case-insensitively
        # so the listing is identical on every run and platform.
        for entry_name in sorted(os.listdir(path), key=str.lower):
            print(f"  {entry_name}")
        break
else:
    # Reached only when the loop ends without `break`, i.e. no candidate matched.
    # Say so explicitly instead of finishing silently.
    print("데이터 폴더를 찾을 수 없습니다. 시도한 경로:", possible_paths)

데이터 폴더 찾음: ../../data/thyroid+disease/
내용:
  allbp.data
  allbp.names
  allbp.test
  allhyper.data
  allhyper.names
  allhyper.test
  allhypo.data
  allhypo.names
  allhypo.test
  allrep.data
  allrep.names
  allrep.test
  ann-Readme
  ann-test.data
  ann-thyroid.names
  ann-train.data
  costs
  dis.data
  dis.names
  dis.test
  HELLO
  hypothyroid.data
  hypothyroid.names
  Index
  new-thyroid.data
  new-thyroid.names
  sick-euthyroid.data
  sick-euthyroid.names
  sick.data
  sick.names
  sick.test
  thyroid.theory
  thyroid0387.data
  thyroid0387.names


In [5]:
# Dataset file inside the data folder two levels above the working directory.
data_path = '../../data/thyroid+disease/new-thyroid.data'
data_file_found = os.path.exists(data_path)
print("파일 존재:", data_file_found)  # confirm the file is present before reading
# The file is read without a header row, so the columns get integer labels.
df = pd.read_csv(data_path, header=None)
df.head(5)  # preview the first five rows

파일 존재: True


Unnamed: 0,0,1,2,3,4,5
0,1,107,10.1,2.2,0.9,2.7
1,1,113,9.9,3.1,2.0,5.9
2,1,127,12.9,2.4,1.4,0.6
3,1,109,5.3,1.6,1.4,1.5
4,1,105,7.3,1.5,1.5,-0.1


In [6]:
df.info()  # Show column dtypes and non-null counts (no missing values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       215 non-null    int64  
 1   1       215 non-null    int64  
 2   2       215 non-null    float64
 3   3       215 non-null    float64
 4   4       215 non-null    float64
 5   5       215 non-null    float64
dtypes: float64(4), int64(2)
memory usage: 10.2 KB


In [7]:
# Read the dataset's .names description file and print its contents.
names_file = '../../data/thyroid+disease/new-thyroid.names'

if os.path.exists(names_file):
    # Decode explicitly instead of relying on the platform's locale encoding;
    # undecodable bytes are replaced so the description can still be displayed.
    with open(names_file, 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()

    # First section: only the lines that mention "Class" and contain a colon.
    print("=== 클래스 정보 ===")
    for line in lines:
        if 'Class' in line and ':' in line:
            print(line.strip())

    # Second section: every line of the file, stripped of surrounding whitespace.
    print("\n=== 데이터셋 기본 정보 ===")
    for line in lines:
        print(line.strip())

else:
    print("No such files exist.")

=== 클래스 정보 ===
1:	Class attribute (1 = normal, 2 = hyper, 3 = hypo)
9. Class Distribution: number of instances per class
Class 1: (normal)	150
Class 2: (hyper)	35
Class 3: (hypo)		30

=== 데이터셋 기본 정보 ===

1. Title of Database:
Thyroid gland data. ('normal', hypo and hyper functioning)

2. Sources:
(a) Original owner
Danny Coomans, Dept. of Maths. and Stats., James Cook University,
Townsville 4811, Australia.  email: madhc@manta.jcu.edu.au
(b) Donor of database
Stefan Aeberhard, Dept. of Comp. Science, James Cook University,
Townsville 4811, Australia.  email: stefan@coral.cs.jcu.edu.au
(c) Date received
July, 1992

3. Past Usage:
- (a) Coomans, D., Broeckaert, M. Jonckheer M. and Massart D.L.,
"Comparison of Multivariate Discriminant Techniques for
Clinical Data - Application to the Thyroid Functional State",
Meth. Inform. Med. 22 (1983) pp. 93-101.

(b) The data was used for comparing 16 different discriminant
techniques, each trying to predict the state of the
thyroid gland.

- (a) Co

In [None]:
# Column index in df -> attribute:
# 0: Class attribute (1 = normal, 2 = hyper, 3 = hypo)
# 1: T3-resin uptake test (a percentage)
# 2: Total serum thyroxin as measured by the isotopic
#    displacement method
# 3: Total serum triiodothyronine as measured by
#    radioimmunoassay
# 4: Basal thyroid-stimulating hormone (TSH) as measured by
#    radioimmunoassay
# 5: Maximal absolute difference of TSH value after injection of
#    200 micrograms of thyrotropin-releasing hormone as compared
#    to the basal value

In [8]:
df[0].value_counts()  # Column 0 is the target (y) to classify; count rows per class

0
1    150
2     35
3     30
Name: count, dtype: int64

In [None]:
# Keep the target as a one-column DataFrame (list selector) rather than a Series.
y = df.loc[:, [0]]
y

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
210,3
211,3
212,3
213,3


In [None]:
# Feature matrix: every column except the target (column 0), removed for training.
X = df.drop(columns=[0])
X

Unnamed: 0,1,2,3,4,5
0,107,10.1,2.2,0.9,2.7
1,113,9.9,3.1,2.0,5.9
2,127,12.9,2.4,1.4,0.6
3,109,5.3,1.6,1.4,1.5
4,105,7.3,1.5,1.5,-0.1
...,...,...,...,...,...
210,118,6.5,1.3,1.7,11.5
211,139,4.2,0.7,4.3,6.3
212,103,5.1,1.4,1.2,5.0
213,97,4.7,1.1,2.1,12.6


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
X.describe()

Unnamed: 0,1,2,3,4,5
count,215.0,215.0,215.0,215.0,215.0
mean,109.595349,9.804651,2.050233,2.88,4.19907
std,13.145447,4.697362,1.419486,6.118031,8.070519
min,65.0,0.5,0.2,0.1,-0.7
25%,103.0,7.1,1.35,1.0,0.55
50%,110.0,9.2,1.7,1.3,2.0
75%,117.5,11.3,2.2,1.7,4.1
max,144.0,25.3,10.0,56.4,56.3
