In [1]:
import pandas as pd
import os
import sys
import shutil
from thyroid.utils.exception import customException

In [2]:
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\hp\Desktop\Thyroid-disease-detection\research


In [3]:
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\hp\\Desktop\\Thyroid-disease-detection'

#### 1 Modifying 'all' datasest accordingly

In [6]:
# define configuration
raw_directory = "raw_data"
artifact_directory = "artifact"
file_names = ["allhyper.data","allhypo.data","allhyper.test","allhypo.test"]
ingested_directory = "ingested_data"


In [7]:
# Create the artifact directory if it doesn't exist
if not os.path.exists(artifact_directory):
    os.makedirs(artifact_directory)

# Create the ingested_data directory inside the artifact directory if it doesn't exist
ingested_data_directory = os.path.join(artifact_directory, ingested_directory)
if not os.path.exists(ingested_data_directory):
    os.makedirs(ingested_data_directory)


In [8]:
import os

def read_data_file(file_path):
    #if file_path.lower().endswith('.data' or '.test'):
    try:
        with open(file_path, 'r') as file:
            data = file.readlines()
            print("read_data_file")
            return data
        
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return "An error occurred: " + str(e)
    else:
        # if the file does not end with .data
        return False


In [9]:
# process the file extension
all_rows = []
for file in file_names:
    print(file)
    file_path = os.path.join(raw_directory, file)
    data_rows = read_data_file(file_path)
    all_rows.extend(data_rows)

allhyper.data


read_data_file
allhypo.data
read_data_file
allhyper.test
read_data_file
allhypo.test
read_data_file


In [10]:
len(all_rows )

7544

In [11]:
import csv
from datetime import datetime

In [12]:
import re
def process_rows(record):
    cleaned_attributes = []
    rows = record.strip().split('\n')  # Split input into rows
    for attribute in record.strip().split(','):
        attribute = re.sub(r'-', 'negative', attribute)  # Replace '-' with 'negative'
        if '.|' in attribute:
            cleaned_attributes.append(attribute.split('.|')[0])
        elif '[' in attribute:
            cleaned_attributes.append(attribute.split('[')[0])
        else:
            cleaned_attributes.append(attribute)
    return cleaned_attributes

In [13]:
id_removed_data = []
if all_rows:
    for row in all_rows:
        id_removed_data.append(process_rows(row))

In [14]:
type(id_removed_data )

list

In [15]:
import yaml
from thyroid.utils.exception import customException

In [16]:
def read_yaml_file(file_path: str) -> dict:
    try:
        with open(file_path, "rb") as yaml_file:
            return yaml.safe_load(yaml_file)
    except Exception as e:
        raise customException(e, sys) from e

In [17]:
yaml_file_path = os.path.join("config", "config.yaml")
config_data = read_yaml_file(yaml_file_path)
expected_column_length = config_data['train_column_length']

In [20]:
column = config_data['columns']
column_names = [list(column.keys())[0] for column in column]



In [21]:
all_df = pd.DataFrame(id_removed_data, columns=column_names)

In [22]:
all_df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,class
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative


In [23]:
all_df['class'].unique()

array(['negative', 'hyperthyroid', 'T3 toxic', 'goitre',
       'compensated hypothyroid', 'primary hypothyroid',
       'secondary hypothyroid', 'secondary toxic'], dtype=object)

In [24]:
# Replacing values in the 'class' column
all_df['class'] = all_df['class'].replace(['goitre', 'T3 toxic', 'hyperthyroid', 'secondary toxic'], 'hyperthyroid')
all_df['class'] = all_df['class'].replace(['primary hypothyroid', 'compensated hypothyroid', 'secondary hypothyroid'], 'hypothyroid')


In [25]:
all_df['class'].unique()

array(['negative', 'hyperthyroid', 'hypothyroid'], dtype=object)

In [26]:
all_df.to_csv(f'{ingested_data_directory}/all-hyper-hypo.csv', index=False)

#### 2 Working on thyrodi0387

In [27]:
file = 'thyroid0387.data'

In [28]:
# process the file extension
all_rows = []
file_path = os.path.join(raw_directory, file)
data_rows = read_data_file(file_path)


read_data_file


In [29]:
id_removed_data = []
if data_rows:
    for row in data_rows:
        id_removed_data.append(process_rows(row))

In [30]:
len(id_removed_data[0])

30

In [31]:
thyroid_df = pd.DataFrame(id_removed_data, columns=column_names)

In [32]:
thyroid_df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,class
0,29,F,f,f,f,f,f,f,f,t,...,f,?,f,?,f,?,f,?,other,negative
1,29,F,f,f,f,f,f,f,f,f,...,t,128,f,?,f,?,f,?,other,negative
2,41,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,11,other,negative
3,36,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,26,other,negative
4,32,F,f,f,f,f,f,f,f,f,...,f,?,f,?,f,?,t,36,other,S


In [33]:
thyroid_df['class'].unique()

array(['negative', 'S', 'F', 'AK', 'R', 'I', 'M', 'N', 'G', 'K', 'A',
       'KJ', 'L', 'MK', 'Q', 'J', 'C|I', 'O', 'LJ', 'H|K', 'D', 'GK',
       'MI', 'P', 'FK', 'B', 'GI', 'C', 'GKJ', 'OI', 'D|R', 'E'],
      dtype=object)

In [34]:
thyroid_df['class'].replace(['A','AK','B','C','C|I','D','D|R'],"hyperthyroid",inplace = True)
thyroid_df['class'].replace(['E','F','FK','G','GK','H','H|K'],"hypothyroid",inplace = True)

for value in set(thyroid_df['class']):
    if(value != 'hypothyroid' and value != 'hyperthyroid'):
        thyroid_df['class'].replace(value,'negative',inplace=True)

In [35]:
thyroid_df['sex'].unique()

array(['F', 'M', '?'], dtype=object)

In [36]:
thyroid_df.to_csv(f'{ingested_data_directory}/thyroid0387.csv', index=False)


#### 3 Working on hypothyroid.data

In [37]:
# process the file extension
file = "hypothyroid.data"
all_rows = []
file_path = os.path.join(raw_directory, file)
data_rows = read_data_file(file_path)


read_data_file


In [38]:
data_list = [item.strip().split(',') for item in data_rows]

In [39]:
with open(f'{raw_directory}/hypothyroid.names', 'r') as name_file:
    columns = [line.split(':')[0].strip() for line in name_file.readlines()]


In [40]:
data_columns = columns[2:-1]

In [41]:
data_columns.insert(0, 'class')

In [42]:
len(data_columns)

26

In [43]:
hypothyroid_df = pd.DataFrame(data_list, columns=data_columns)
hypothyroid_df.head()

Unnamed: 0,class,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,hypothyroid,72,M,f,f,f,f,f,f,f,...,y,0.6,y,15,y,1.48,y,10,n,?
1,hypothyroid,15,F,t,f,f,f,f,f,f,...,y,1.7,y,19,y,1.13,y,17,n,?
2,hypothyroid,24,M,f,f,f,f,f,f,f,...,y,0.2,y,4,y,1.0,y,0,n,?
3,hypothyroid,24,F,f,f,f,f,f,f,f,...,y,0.4,y,6,y,1.04,y,6,n,?
4,hypothyroid,77,M,f,f,f,f,f,f,f,...,y,1.2,y,57,y,1.28,y,44,n,?


In [44]:
hypothyroid_df.to_csv(f'{ingested_data_directory}/hypothyroid.csv', index=False)

#### 4. Working on sick-ethuroid data

In [45]:
# process the file extension
file = "sick-euthyroid.data"
all_rows = []
file_path = os.path.join(raw_directory, file)
data_rows = read_data_file(file_path)

read_data_file


In [46]:
data_list = [item.strip().split(',') for item in data_rows]

In [47]:
with open(f'{raw_directory}/sick-euthyroid.names', 'r') as name_file:
    columns = [line.split(':')[0].strip() for line in name_file.readlines()]

In [48]:
data_columns = columns[2:-1]
data_columns.insert(0, 'class')

In [49]:
len(data_columns)

26

In [50]:
sick_eu_df = pd.DataFrame(data_list, columns=data_columns)
sick_eu_df.head()

Unnamed: 0,class,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
0,sick-euthyroid,72,M,f,f,f,f,f,f,f,...,y,1.0,y,83,y,0.95,y,87,n,?
1,sick-euthyroid,45,F,f,f,f,f,f,f,f,...,y,1.0,y,82,y,0.73,y,112,n,?
2,sick-euthyroid,64,F,f,f,f,f,f,f,f,...,y,1.0,y,101,y,0.82,y,123,n,?
3,sick-euthyroid,56,M,f,f,f,f,f,f,f,...,y,0.8,y,76,y,0.77,y,99,n,?
4,sick-euthyroid,78,F,t,f,f,f,t,f,f,...,y,0.3,y,87,y,0.95,y,91,n,?


In [51]:
sick_eu_df['sex'].unique()

array(['M', 'F', '?'], dtype=object)

here we will extract only negative records

In [52]:
negative_records = sick_eu_df[sick_eu_df['class'] == 'negative']

In [53]:
negative_records.shape 

(2870, 26)

In [54]:
negative_records.head()

Unnamed: 0,class,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,thyroid_surgery,query_hypothyroid,query_hyperthyroid,pregnant,...,T3_measured,T3,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG
293,negative,20,F,f,f,f,f,f,f,f,...,y,1.8,y,68,y,0.99,y,68,n,?
294,negative,29,F,f,f,f,f,f,f,f,...,y,2.5,y,83,y,0.93,y,89,n,?
295,negative,66,F,f,f,f,f,f,t,f,...,y,1.7,y,76,y,0.83,y,92,n,?
296,negative,62,M,f,f,f,f,f,f,f,...,y,2.2,y,103,y,0.99,y,104,n,?
297,negative,72,F,f,f,f,f,f,f,f,...,y,1.5,y,66,y,0.97,y,69,n,?


In [55]:
negative_records['sex'].unique()

array(['F', 'M', '?'], dtype=object)

In [56]:
negative_records.to_csv(f'{ingested_data_directory}/euthyroid-negative.csv', index=False)

#### 5. Working on ANN data

Let's Merge the ann train and test data

In [57]:
file_names = ["ann-test.data","ann-train.data"]
# process the file extension
all_rows = []
for file in file_names:
    print(file)
    file_path = os.path.join(raw_directory, file)
    data_rows = read_data_file(file_path)
    all_rows.extend(data_rows)

ann-test.data


read_data_file
ann-train.data
read_data_file


In [58]:
len(all_rows )

7200

In [59]:
(all_rows[1].strip().split())

['0.32',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0.0013',
 '0.019',
 '0.084',
 '0.078',
 '0.107',
 '3']

In [60]:
data_list = [record.strip().split() for record in all_rows]

Since no columns are given I am creating columns

In [61]:
ann_columns = ["age","sex","on_thyroxine","query_on_thyroxine","on_antithyroid_medication","sick","pregnant","thyroid_surgery","I131_treatment","query_hypothyroid",
"query_hyperthyroid","lithium","goitre","tumor","hypopituitary","psych","TSH","T3","TT4","T4U","FTI","class"]                     

In [62]:
len(ann_columns)

22

In [63]:
# Create a DataFrame from the list of lists
ann_df = pd.DataFrame(data_list, columns=ann_columns)

In [64]:
ann_df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,class
0,0.29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0061,0.028,0.111,0.131,0.085,2
1,0.32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0013,0.019,0.084,0.078,0.107,3
2,0.35,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.031,0.239,0.1,0.239,3
3,0.21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.001,0.018,0.087,0.088,0.099,3
4,0.22,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0.0004,0.022,0.134,0.135,0.099,3


In [65]:
ann_df['class'].value_counts()

class
3    6666
2     368
1     166
Name: count, dtype: int64

- 3 is referring to the 'negative' class
- 2 is referring to the 'hypothyroid' class
- 1 is referring to the 'hyperthyroid' class


In [66]:
ann_df['sex'].value_counts()

sex
0    5009
1    2191
Name: count, dtype: int64

- on this data age, TSH, T3, TT4, T4U, FTI can be multiplied by 100 to ge the orignal numbers
- class : 3-'negative', 2-'hypothyroid', 1-'hyperthyroid'
- since number of females are mor than numbers of males in all data here for sex 0-'female' and 1-'male'
- there are no measured columns for tests to we make it.

In [67]:
ann_df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,goitre,tumor,hypopituitary,psych,TSH,T3,TT4,T4U,FTI,class
0,0.29,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0061,0.028,0.111,0.131,0.085,2
1,0.32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0013,0.019,0.084,0.078,0.107,3
2,0.35,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.031,0.239,0.1,0.239,3
3,0.21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0.001,0.018,0.087,0.088,0.099,3
4,0.22,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0.0004,0.022,0.134,0.135,0.099,3


In [68]:
ann_df = ann_df.apply(pd.to_numeric, errors='coerce')

In [69]:
continuos_attributes = ['age','TSH','T3','TT4','T4U','FTI']

# map all continuous attributes in multiple of 100
ann_df[continuos_attributes] = ann_df[continuos_attributes] * 100

In [70]:
ann_df['sex'] = ann_df['sex'].map({0:'f',1:'m'})
ann_df['class'] = ann_df['class'].map({3:'negative',2:'hypothyroid',1:'hyperthyroid'})



In [71]:
def fillNewAttributes(row,attribute):
    if row[attribute] > 0:
        return 't'
    else:
        return 'f'

ann_df['TSH_measured'] = ann_df.apply(lambda row: fillNewAttributes(row,'TSH'), axis=1)
ann_df['T3_measured'] = ann_df.apply(lambda row: fillNewAttributes(row,'T3'), axis=1)
ann_df['TT4_measured'] = ann_df.apply(lambda row: fillNewAttributes(row,'TT4'), axis=1)
ann_df['T4U_measured'] = ann_df.apply(lambda row: fillNewAttributes(row,'T4U'), axis=1)
ann_df['FTI_measured'] = ann_df.apply(lambda row: fillNewAttributes(row,'FTI'), axis=1)

In [72]:
ann_df.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,T3,TT4,T4U,FTI,class,TSH_measured,T3_measured,TT4_measured,T4U_measured,FTI_measured
0,29.0,f,0,0,0,0,0,0,0,0,...,2.8,11.1,13.1,8.5,hypothyroid,t,t,t,t,t
1,32.0,f,0,0,0,0,0,0,0,0,...,1.9,8.4,7.8,10.7,negative,t,t,t,t,t
2,35.0,f,0,0,0,0,0,0,0,0,...,3.1,23.9,10.0,23.9,negative,f,t,t,t,t
3,21.0,f,0,0,0,0,0,0,0,0,...,1.8,8.7,8.8,9.9,negative,t,t,t,t,t
4,22.0,f,0,0,0,1,0,0,0,0,...,2.2,13.4,13.5,9.9,negative,t,t,t,t,t


In [73]:
ann_df.to_csv(f'{ingested_data_directory}/ann-data.csv', index=False)

In [74]:
file_list = os.listdir(ingested_data_directory)
for file in file_list:
    if file.endswith('.csv'):
        file_path = os.path.join(ingested_data_directory, file)
        df = pd.read_csv(file_path)
        print(f"File: {file}")
        print(f"Columns: {list(df.columns)}")
        print(f"Column Counts: {len(df.columns)}")

File: all-hyper-hypo.csv
Columns: ['age', 'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured', 'TT4', 'T4U_measured', 'T4U', 'FTI_measured', 'FTI', 'TBG_measured', 'TBG', 'referral_source', 'class']
Column Counts: 30
File: ann-data.csv
Columns: ['age', 'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery', 'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid', 'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'class', 'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured', 'FTI_measured']
Column Counts: 27
File: euthyroid-negative.csv
Columns: ['class', 'age', 'sex', 'on_thyroxine', 'query_on_thyroxine', 'on_antithyroid_medication', 'thyroid_su

In [75]:
data = pd.concat([all_df,thyroid_df,hypothyroid_df,negative_records,ann_df], axis=0)

In [76]:
data.head()

Unnamed: 0,age,sex,on_thyroxine,query_on_thyroxine,on_antithyroid_medication,sick,pregnant,thyroid_surgery,I131_treatment,query_hypothyroid,...,TT4_measured,TT4,T4U_measured,T4U,FTI_measured,FTI,TBG_measured,TBG,referral_source,class
0,41,F,f,f,f,f,f,f,f,f,...,t,125,t,1.14,t,109,f,?,SVHC,negative
1,23,F,f,f,f,f,f,f,f,f,...,t,102,f,?,f,?,f,?,other,negative
2,46,M,f,f,f,f,f,f,f,f,...,t,109,t,0.91,t,120,f,?,other,negative
3,70,F,t,f,f,f,f,f,f,f,...,t,175,f,?,f,?,f,?,other,negative
4,70,F,f,f,f,f,f,f,f,f,...,t,61,t,0.87,t,70,f,?,SVI,negative


In [77]:
print(all_df.shape)
print(thyroid_df.shape)
print(hypothyroid_df.shape)
print(negative_records.shape)
print(ann_df.shape)

(7544, 30)
(9172, 30)
(3163, 26)
(2870, 26)
(7200, 27)


In [78]:
data.shape

(29949, 30)

In [79]:
for col in data.columns:
    print(col , data[col].unique())

age ['41' '23' '46' '70' '18' '59' '80' '66' '68' '84' '67' '71' '28' '65'
 '42' '63' '51' '81' '54' '55' '60' '25' '73' '34' '78' '37' '85' '26'
 '58' '64' '44' '48' '61' '35' '83' '21' '87' '53' '77' '27' '69' '74'
 '38' '76' '45' '36' '22' '43' '72' '82' '31' '39' '49' '62' '57' '1' '50'
 '30' '29' '75' '19' '7' '79' '17' '24' '15' '32' '47' '16' '52' '33' '13'
 '10' '89' '56' '20' '90' '40' '88' '14' '86' '94' '12' '4' '11' '8' '5'
 '455' '2' '91' '6' '?' '93' '92' '9' '97' '65511' '95' '65512' '3'
 '65526' '98' 28.999999999999996 32.0 35.0 21.0 22.0 39.0 77.0 23.0 45.0
 65.0 53.0 79.0 56.99999999999999 54.0 61.0 14.000000000000002
 55.00000000000001 59.0 60.0 73.0 56.00000000000001 38.0 42.0 78.0 64.0
 52.0 81.0 75.0 36.0 51.0 76.0 70.0 87.0 63.0 40.0 16.0 28.000000000000004
 26.0 25.0 27.0 69.0 62.0 67.0 71.0 66.0 85.0 43.0 46.0 48.0 24.0 31.0
 68.0 74.0 44.0 52.190000000000005 37.0 33.0 41.0 82.0 20.0 15.0 47.0
 57.99999999999999 91.0 49.0 95.0 72.0 50.0 19.0 18.0 30.0 83.0 86.0

In [80]:
data['class'].unique()

array(['negative', 'hyperthyroid', 'hypothyroid'], dtype=object)

In [94]:
data.to_csv(f'{ingested_data_directory}/data.csv',index=False)

In [86]:
def save_to_csv(data, csv_path):
    try:
        print(len(data))
        with open(csv_path, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            #csv_writer.writerow(column_names)
            for row in data:
                csv_writer.writerow(row)
        return "CSV file saved successfully."
    except Exception as e:
        return "An error occurred while saving CSV: " + str(e)