In [39]:
import pandas as pd
import os
import sys
import shutil
from thyroid.utils.exception import customException

In [40]:
# Show the directory the kernel is running in; the relative paths used in this notebook are resolved against it.
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\aakas\Documents\Projects\Thyroid-disease-detection


In [3]:
# Step up one directory level, relative to the current working directory.
# NOTE(review): not idempotent — every re-run of this cell moves one level higher.
# Confirm the kernel's starting directory before running it.
os.chdir("../")

In [41]:
# Display the kernel's current working directory.
%pwd

'c:\\Users\\aakas\\Documents\\Projects\\Thyroid-disease-detection'

In [115]:
# Pipeline configuration: input location and output folder names.

# Folder holding the raw input and the file to ingest from it.
raw_directory = "raw_data"
file_name = "ann-train.data"

# Root output folder and the names of its two sub-folders.
artifact_directory = "artifact"
ingested_directory = "ingested_data"
bad_directory = "bad_data"


In [81]:
# Output locations inside the artifact directory: one for ingested data, one for bad data.
ingested_data_directory = os.path.join(artifact_directory, ingested_directory)
bad_data_directory = os.path.join(artifact_directory, bad_directory)

# exist_ok=True makes this cell safe to re-run and avoids the race between
# an os.path.exists() check and the subsequent makedirs() call.
for _directory in (artifact_directory, ingested_data_directory, bad_data_directory):
    os.makedirs(_directory, exist_ok=True)


In [82]:
import os

def read_data_file(file_path):
    """Read a ``.data`` or ``.txt`` file and return its lines.

    Args:
        file_path: Path of the file to read. The extension check is
            case-insensitive.

    Returns:
        list[str]: the lines of the file (line endings kept) on success.
        str: ``"File not found."`` or ``"An error occurred: ..."`` when the
            file cannot be read. Note that these messages are truthy, so
            callers must tell them apart from a list of lines.
        bool: ``False`` when the extension is neither ``.data`` nor ``.txt``.
    """
    # str.endswith needs a tuple to test several suffixes. The previous
    # expression ``'.data' or '.txt'`` collapsed to just '.data', so .txt
    # files were always rejected.
    if not file_path.lower().endswith(('.data', '.txt')):
        return False

    try:
        with open(file_path, 'r') as file:
            data = file.readlines()
            print("read_data_file")
            return data
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return "An error occurred: " + str(e)


In [83]:
import csv
from datetime import datetime

In [84]:
def save_to_csv(data, csv_path, column_names):
    """Write a header row followed by ``data`` to a CSV file.

    Args:
        data: Sized iterable of rows; each row is an iterable of fields.
        csv_path: Destination path; an existing file is overwritten.
        column_names: Field names written as the first row.

    Returns:
        str: a success message, or an error message containing the exception
        text if anything fails (exceptions are not propagated).
    """
    try:
        row_count = len(data)
        print(row_count)
        with open(csv_path, 'w', newline='') as out_handle:
            writer = csv.writer(out_handle)
            writer.writerow(column_names)
            writer.writerows(data)
    except Exception as error:
        return "An error occurred while saving CSV: " + str(error)
    return "CSV file saved successfully."

In [117]:
import re
def process_rows(record):
    """Split one comma-separated record into cleaned attribute strings.

    Every ``-`` in an attribute is replaced with ``negative``. After that,
    everything from the first ``.|`` marker onward is dropped; if there is no
    ``.|``, everything from the first ``[`` onward is dropped instead.

    Args:
        record: One raw line of text; surrounding whitespace is stripped.

    Returns:
        list[str]: cleaned attributes in their original order.
    """
    # The former ``rows = input_data.strip().split('\n')`` line read an
    # undefined global (NameError on a fresh kernel) and its result was
    # never used, so it has been removed.
    # NOTE(review): only ',' is treated as a separator; a whitespace-separated
    # record comes back as a single attribute — confirm this matches the input file.
    cleaned_attributes = []
    for attribute in record.strip().split(','):
        attribute = re.sub(r'-', 'negative', attribute)  # Replace '-' with 'negative'
        if '.|' in attribute:
            cleaned_attributes.append(attribute.split('.|')[0])
        elif '[' in attribute:
            cleaned_attributes.append(attribute.split('[')[0])
        else:
            cleaned_attributes.append(attribute)
    return cleaned_attributes

In [119]:
# Build the path to the raw input file and read it with read_data_file.
file_path = os.path.join(raw_directory, file_name)
data_rows = read_data_file(file_path)

read_data_file


In [120]:
# Inspect what read_data_file returned.
# NOTE(review): this displays the entire value; consider previewing a slice such as data_rows[:5].
data_rows

['0.73 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0.0006 0.015 0.12 0.082 0.146 3  \n',
 '0.24 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00025 0.03 0.143 0.133 0.108 3  \n',
 '0.47 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0019 0.024 0.102 0.131 0.078 3  \n',
 '0.64 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0009 0.017 0.077 0.09 0.085 3  \n',
 '0.23 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00025 0.026 0.139 0.09 0.153 3  \n',
 '0.69 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00025 0.016 0.086 0.07 0.123 3  \n',
 '0.85 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00025 0.023 0.128 0.104 0.121 3  \n',
 '0.48 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.00208 0.02 0.086 0.078 0.11 3  \n',
 '0.67 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0013 0.024 0.087 0.109 0.08 3  \n',
 '0.76 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0001 0.029 0.124 0.128 0.097 3  \n',
 '0.62 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0.011 0.008 0.073 0.074 0.098 2  \n',
 '0.18 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0.0001 0.023 0.098 0.085 0.115 3  \n',
 '0.59 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.0008 0.023 0.094 0.099 0.09475 3  \n',
 '0.49 0 1 0 0

In [88]:
import yaml

In [89]:
def read_yaml_file(file_path: str) -> dict:
    """Parse the YAML file at ``file_path`` with ``yaml.safe_load``.

    Args:
        file_path: Path of the YAML file; it is opened in binary mode.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.

    Raises:
        customException: wraps any exception raised while opening or
            parsing the file, with the original chained as the cause.
    """
    try:
        with open(file_path, "rb") as stream:
            parsed = yaml.safe_load(stream)
    except Exception as error:
        raise customException(error, sys) from error
    else:
        return parsed

In [90]:
# Load config/config.yaml and use the first key of each entry under 'columns' as a column name.
yaml_file_path = os.path.join("config", "config.yaml")
columns = read_yaml_file(yaml_file_path)
column_entries = columns['columns']
column_names = [list(entry.keys())[0] for entry in column_entries]


In [91]:
# Number of column names read from the configuration.
len(column_names)

30

In [92]:
from datetime import datetime

In [121]:
processed_data = []
if isinstance(data_rows, str):
    # read_data_file reports read failures as message strings. A non-empty
    # string is truthy, so without this branch it would be iterated character
    # by character as if it were file content.
    print(data_rows)
elif data_rows:
    print("Processing:", file_name)
    processed_data = [process_rows(row) for row in data_rows]

    # Save the processed rows as a timestamped CSV in the ingested-data directory.
    current_datetime = datetime.now().strftime('%Y%m%d_%H%M%S')
    csv_file_name = f"thyroid_{current_datetime}.csv"
    csv_path = os.path.join(ingested_data_directory, csv_file_name)
    save_result = save_to_csv(processed_data, csv_path, column_names)
    # save_to_csv returns its outcome as a message instead of raising, so show it.
    print(save_result)
else:
    # read_data_file returned False (unsupported extension) or no lines:
    # move the input file into the bad-data directory.
    bad_file_path = os.path.join(bad_data_directory, file_name)
    shutil.move(file_path, bad_file_path)
    print("Moved to bad_data:", file_name)

Processing: ann-train.data
3772
