In [2]:
import pandas as pd
import os
import sys
import shutil
from thyroid.utils.exception import customException
import requests
import zipfile
import os

In [4]:
# Show the kernel's starting directory so the relative paths used later can be checked.
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\aakas\Documents\Projects\Thyroid-disease-detection\research


In [5]:
# Move from the notebook folder ("research") up to the project root so that
# relative paths such as 'raw_data' resolve there. The guard makes the cell
# idempotent: re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [6]:
# Display the working directory after the change above.
%pwd

'c:\\Users\\aakas\\Documents\\Projects\\Thyroid-disease-detection'

#### 1. Download and extract the data-set

In [7]:
# Zip archive of the "thyroid disease" dataset (id 102) served by archive.ics.uci.edu.
url = "https://archive.ics.uci.edu/static/public/102/thyroid+disease.zip"

In [7]:
# Directory where the data will be extracted (relative to the current working directory)
directory = 'raw_data'

# Create the directory in one call. exist_ok=True makes the cell safe to re-run
# and avoids the gap between a separate existence check and the creation.
# If the path exists but is not a directory, this raises instead of passing silently.
os.makedirs(directory, exist_ok=True)


In [8]:
# Download the file.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail here on an HTTP error (4xx/5xx) instead of saving an error page as the
# zip and hitting a confusing failure at extraction time.
response.raise_for_status()
file_name = url.split('/')[-1]  # Extracting the file name from the URL
file_path = os.path.join(directory, file_name)

In [9]:
# Write the response body to disk as the zip archive
with open(file_path, 'wb') as archive_file:
    archive_file.write(response.content)

# Unpack every member of the archive into the data directory
with zipfile.ZipFile(file_path, 'r') as archive:
    archive.extractall(path=directory)

# The archive itself is no longer needed once it has been unpacked
os.remove(file_path)

#### 2. Read all the files and count the number of rows in each dataset


In [8]:
# Folder scanned by the cells below; same folder name as `directory` used for extraction.
raw_directory = "raw_data"

In [9]:
def count_rows_in_file(file_path):
    """Return the number of lines in the text file at ``file_path``.

    The file is opened in text mode and read one line at a time, so it is
    never loaded into memory as a whole. An empty file yields 0.
    """
    total = 0
    with open(file_path, "r") as handle:
        for _line in handle:
            total += 1
    return total

Count the number of rows in each `.data` file.

In [10]:
# Report the row count of every file in raw_directory with this extension.
file_extension = ".data"

# os.listdir() returns names in arbitrary order; sorting keeps the report
# identical on every run and platform.
for filename in sorted(os.listdir(raw_directory)):
    if filename.endswith(file_extension):
        file_path = os.path.join(raw_directory, filename)
        row_count = count_rows_in_file(file_path)
        print(f"File: {filename}, Rows: {row_count}")


File: allbp.data, Rows: 2800
File: allhyper.data, Rows: 2800
File: allhypo.data, Rows: 2800
File: allrep.data, Rows: 2800
File: ann-test.data, Rows: 3428
File: ann-train.data, Rows: 3772
File: dis.data, Rows: 2800
File: hypothyroid.data, Rows: 3163
File: new-thyroid.data, Rows: 215
File: sick-euthyroid.data, Rows: 3163
File: sick.data, Rows: 2800
File: thyroid0387.data, Rows: 9172


The `.data` files for allbp, allhyper, allhypo, allrep, dis and sick each contain exactly 2800 rows.

In [11]:
# Report the row count of every file in raw_directory with this extension.
file_extension = ".test"

# os.listdir() returns names in arbitrary order; sorting keeps the report
# identical on every run and platform.
for filename in sorted(os.listdir(raw_directory)):
    if filename.endswith(file_extension):
        file_path = os.path.join(raw_directory, filename)
        row_count = count_rows_in_file(file_path)
        print(f"File: {filename}, Rows: {row_count}")

File: allbp.test, Rows: 972
File: allhyper.test, Rows: 972
File: allhypo.test, Rows: 972
File: allrep.test, Rows: 972
File: dis.test, Rows: 972
File: sick.test, Rows: 972


We can observe that the `.test` files for allbp, allhyper, allhypo, allrep, dis and sick each contain exactly 972 rows.

In [9]:
# One sample record used to prototype the label parsing: comma-separated fields,
# with the last field of the form '<label>.|<number>'.
row = '41,F,f,f,f,f,f,f,  f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,?,SVHC,negative.|3733\n'

In [10]:
# strip() removes the trailing newline first, so splitting on '\n' leaves a one-element list.
r = row.strip().split('\n')
r

['41,F,f,f,f,f,f,f,  f,f,f,f,f,f,f,f,t,1.3,t,2.5,t,125,t,1.14,t,109,f,?,SVHC,negative.|3733']

In [11]:
# Prototype of the label parsing on the sample row: for every comma-separated
# field that contains the '.|' marker, keep the text before the marker in `c`
# and the text after it in `cls`.
c = []
cls = []
for field in row.strip().split(','):
    pieces = field.split('.|')
    # More than one piece means the marker occurred in this field.
    if len(pieces) > 1:
        c.append(pieces[0])
        cls.append(pieces[1])



In [12]:
# Text found before the '.|' marker in the sample row.
c

['negative']

In [13]:
# Text found after the '.|' marker in the sample row.
cls

['3733']

We can observe that the allbp, allhyper, allhypo, allrep, dis and sick `.data` files all have 2800 rows.

Let's check which classes each dataset contains.

In [20]:
def extract_labels_from_file(file_path):
    """Collect the distinct class labels found in a data file.

    Every line is split on commas; each field that contains the ``.|`` marker
    (for example ``negative.|3733``) contributes the text before the marker.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        set: The unique label strings; empty if no field contains ``.|``.
    """
    unique_labels = set()
    with open(file_path, 'r') as file:
        # Iterate the file directly instead of loading every line with readlines().
        for row in file:
            for attribute in row.strip().split(','):
                if '.|' in attribute:
                    unique_labels.add(attribute.split('.|')[0])
    return unique_labels

In [13]:
# Training files whose class labels are inspected.
dataset_filenames = ["allbp.data", "allhyper.data", "allhypo.data", "allrep.data", "dis.data", "sick.data"]

# Print the set of class labels found in each file under raw_directory.
for filename in dataset_filenames:
    file_path = os.path.join(raw_directory, filename)
    if os.path.exists(file_path):
        ul = extract_labels_from_file(file_path)
        print(f"{filename} : {ul}")
    else:
        # Report instead of skipping silently, so a typo or a missing download is visible.
        print(f"{filename} : file not found in {raw_directory}")

    

allbp.data : {'negative', 'decreased binding protein', 'increased binding protein'}
allhyper.data : {'goitre', 'negative', 'T3 toxic', 'hyperthyroid'}
allhypo.data : {'negative', 'secondary hypothyroid', 'primary hypothyroid', 'compensated hypothyroid'}
allrep.data : {'replacement therapy', 'negative', 'underreplacement', 'overreplacement'}
dis.data : {'negative', 'discordant'}
sick.data : {'negative', 'sick'}


In [21]:
# Test files whose class labels are inspected.
dataset_filenames = [ "allhyper.test", "allhypo.test"]

# Print the set of class labels found in each file under raw_directory.
for filename in dataset_filenames:
    file_path = os.path.join(raw_directory, filename)
    if os.path.exists(file_path):
        ul = extract_labels_from_file(file_path)
        print(f"{filename} : {ul}")
    else:
        # Report instead of skipping silently, so a typo or a missing download is visible.
        print(f"{filename} : file not found in {raw_directory}")

    

allhyper.test : {'T3 toxic', 'hyperthyroid', 'secondary toxic', 'negative', 'goitre'}
allhypo.test : {'negative', 'primary hypothyroid', 'compensated hypothyroid'}


From a general inspection of the data, there appear to be 2800 patient IDs, and it is possible that they are the same across all of these files.

The allbp, allrep, dis and sick files all contain therapies, and their data is almost identical to that of the allhyper and allhypo files, so we will keep only the allhyper and allhypo files and discard the rest.


In [14]:
# Datasets that are discarded in favour of allhyper / allhypo.
discarded_datasets = ['allbp', 'allrep', 'dis', 'sick']
# '.name' is kept from the original list. '.names' is added because the UCI
# archive appears to ship the descriptor files as '<dataset>.names', which the
# original list never matched -- NOTE(review): confirm against the extracted folder.
discarded_extensions = ['.data', '.test', '.name', '.names']
files_to_remove = [
    dataset + extension
    for extension in discarded_extensions
    for dataset in discarded_datasets
]
for file_name in files_to_remove:
    file_path = os.path.join(raw_directory, file_name)
    # Only delete what is actually present, so the cell is safe to re-run.
    if os.path.exists(file_path):
        os.remove(file_path)