In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load dataset
exasens_path = 'C:\Platform_bases_computing\Final_project\Exasens.csv'  # Replace with your file path
exasens_df = pd.read_csv(exasens_path)

# Select only numeric columns and fill missing values with their mean
numeric_cols = exasens_df.select_dtypes(include=['float64', 'int64']).columns
exasens_df[numeric_cols] = exasens_df[numeric_cols].fillna(exasens_df[numeric_cols].mean())

# Alternatively, drop rows with missing values
# exasens_df.dropna(inplace=True)


# Encode categorical disease labels
label_encoder = LabelEncoder()
exasens_df['Diagnosis'] = label_encoder.fit_transform(exasens_df['Diagnosis'])

# Normalize numerical features
scaler = StandardScaler()
numerical_cols = exasens_df.select_dtypes(include=['float64', 'int64']).columns
exasens_df[numerical_cols] = scaler.fit_transform(exasens_df[numerical_cols])

# Split into features and target
X_exasens = exasens_df.drop('Diagnosis', axis=1)
y_exasens = exasens_df['Diagnosis']

# Train-test split
X_train_exasens, X_test_exasens, y_train_exasens, y_test_exasens = train_test_split(X_exasens, y_exasens, test_size=0.3, random_state=42)


  exasens_path = 'C:\Platform_bases_computing\Final_project\Exasens.csv'  # Replace with your file path
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [None]:
# Loading the dataset

acute_path = 'C:\Platform_bases_computing\Final_project\diagnosis.data' 
acute_df = pd.read_csv(acute_path, sep='\t', header=None, encoding='ISO-8859-1') 

# Assign meaningful column names if necessary
acute_df.columns = ['temperature', 'nausea', 'lumbar_pain', 'urine_pushing', 'micturition_pains', 'burning_urethra', 'bladder_inflammation', 'nephritis']

# Encode yes/no values to 1/0
for col in ['nausea', 'lumbar_pain', 'urine_pushing', 'micturition_pains', 'burning_urethra', 'bladder_inflammation', 'nephritis']:
    acute_df[col] = acute_df[col].map({'yes': 1, 'no': 0})

# Split into features and targets
X_acute = acute_df.drop(['bladder_inflammation', 'nephritis'], axis=1)
y_acute_bl = acute_df['bladder_inflammation']
y_acute_neph = acute_df['nephritis']

# No scaling needed due to binary features
X_train_acute, X_test_acute, y_train_acute_bl, y_test_acute_bl = train_test_split(X_acute, y_acute_bl, test_size=0.3, random_state=42)
X_train_acute, X_test_acute, y_train_acute_neph, y_test_acute_neph = train_test_split(X_acute, y_acute_neph, test_size=0.3, random_state=42)


  acute_path = 'C:\Platform_bases_computing\Final_project\diagnosis.data'  # Make sure the path is correct


In [20]:
#checking the datasets 
import pandas as pd

# Load and fix Diagnosis Data
diagnosis_path = r'C:\Platform_bases_computing\Final_project\diagnosis.data'
exasens_path = r'C:\Platform_bases_computing\Final_project\Exasens.csv'

try:
    # Specify UTF-16 encoding to handle ÿþ characters
    diagnosis_df = pd.read_csv(diagnosis_path, sep='\t', header=None, encoding='utf-16')
    exasens_df = pd.read_csv(exasens_path, encoding='ISO-8859-1')
    
    # Replace commas with dots and convert to numeric
    diagnosis_df[0] = diagnosis_df[0].str.replace(',', '.').astype(float)

# Convert 'yes'/'no' to binary values (1/0)
    diagnosis_df = diagnosis_df.applymap(lambda x: 1 if x == 'yes' else (0 if x == 'no' else x))

    print("Cleaned Diagnosis Data:")
    print(diagnosis_df.head())

    print("Diagnosis Data Preview:")
    print(diagnosis_df.head())
    
    print("\nExasens Data Preview:")
    print(exasens_df.head())
    
except Exception as e:
    print(f"An error occurred: {e}")


Cleaned Diagnosis Data:
      0  1  2  3  4  5  6  7
0  35.5  0  1  0  0  0  0  0
1  35.9  0  0  1  1  1  1  0
2  35.9  0  1  0  0  0  0  0
3  36.0  0  0  1  1  1  1  0
4  36.0  0  1  0  0  0  0  0
Diagnosis Data Preview:
      0  1  2  3  4  5  6  7
0  35.5  0  1  0  0  0  0  0
1  35.9  0  0  1  1  1  1  0
2  35.9  0  1  0  0  0  0  0
3  36.0  0  0  1  1  1  1  0
4  36.0  0  1  0  0  0  0  0

Exasens Data Preview:
  Diagnosis     ID Imaginary Part    Unnamed: 3 Real Part    Unnamed: 5  \
0       NaN    NaN            NaN           NaN       NaN           NaN   
1       NaN    NaN           Min          Avg.       Min          Avg.    
2      COPD  301-4        -320.61  -300.5635307   -495.26  -464.1719907   
3      COPD  302-3        -325.39  -314.7503595   -473.73  -469.2631404   
4      COPD  303-3           -323  -317.4360556   -476.12  -471.8976667   

   Gender   Age  Smoking  Unnamed: 9  Unnamed: 10 Unnamed: 11      Unnamed: 12  
0     NaN   NaN      NaN         NaN          NaN

  diagnosis_df = diagnosis_df.applymap(lambda x: 1 if x == 'yes' else (0 if x == 'no' else x))


In [21]:
# Drop empty rows and rows containing header-like data
exasens_df.dropna(how='all', inplace=True)
exasens_df = exasens_df[~exasens_df['ID'].isin(['Min', 'Avg.'])]

# Rename columns for clarity
exasens_df.columns = ['Diagnosis', 'ID', 'Imaginary_Min', 'Imaginary_Avg', 
                      'Real_Min', 'Real_Avg', 'Gender', 'Age', 'Smoking', 
                      'Unnamed_9', 'Unnamed_10', 'Unnamed_11', 'Unnamed_12']

# Drop unnecessary columns
exasens_df.drop(columns=['Unnamed_9', 'Unnamed_10', 'Unnamed_11', 'Unnamed_12'], inplace=True)

# Convert Gender and Smoking to appropriate numerical values
exasens_df['Gender'] = exasens_df['Gender'].replace({'Male=1': 1, 'Female=0': 0}).astype(float)
exasens_df['Smoking'] = exasens_df['Smoking'].replace({'Non-smoker=1': 1, 'Ex-smoker=2': 2, 'Active-smoker=3': 3}).astype(float)

# Display the cleaned Exasens Data
print("Cleaned Exasens Data:")
print(exasens_df.head())


Cleaned Exasens Data:
  Diagnosis     ID Imaginary_Min Imaginary_Avg Real_Min      Real_Avg  Gender  \
1       NaN    NaN          Min          Avg.      Min          Avg.      NaN   
2      COPD  301-4       -320.61  -300.5635307  -495.26  -464.1719907     1.0   
3      COPD  302-3       -325.39  -314.7503595  -473.73  -469.2631404     0.0   
4      COPD  303-3          -323  -317.4360556  -476.12  -471.8976667     1.0   
5      COPD  304-4       -327.78  -317.3996698  -473.73   -468.856388     1.0   

    Age  Smoking  
1   NaN      NaN  
2  77.0      2.0  
3  72.0      2.0  
4  73.0      3.0  
5  76.0      2.0  


In [None]:
#Model training 
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Input for Diagnosis data
input_diag = Input(shape=(diagnosis_df.shape[1],))

# Input for Exasens data
input_exasens = Input(shape=(exasens_df.shape[1],))

# Define a simple model that processes the inputs separately
x1 = Dense(64, activation='relu')(input_diag)
x2 = Dense(64, activation='relu')(input_exasens)

# Combine the outputs from both inputs (if necessary)
combined = concatenate([x1, x2])

# Further layers and output
output = Dense(1, activation='sigmoid')(combined)

model = Model(inputs=[input_diag, input_exasens], outputs=output)


# Feature engineering on Diagnosis Data
X_diag = diagnosis_df.drop(columns=['Diagnosis'])

# Feature engineering on Exasens Data
X_exasens = exasens_df.drop(columns=['Diagnosis', 'ID'])



NameError: name 'X' is not defined