In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [49]:
# data = pd.read_csv('training_dataset/raw_dataset.csv')
# data = pd.read_csv('training_dataset/raw_scores_pvallog.csv')
# data = pd.read_csv('training_dataset/ionocyte_raw_dataset.csv')
data = pd.read_csv('training_dataset/ionocyte_scores_pvallog.csv')

In [50]:
# data = pd.read_csv('ionocyte_raw_dataset.csv')
data['disease_ontology_label'] = (data['disease_ontology_label'] == 'COVID-19').astype(int)
X = data.drop(['NAME', 'disease_ontology_label'], axis=1)
y = data['disease_ontology_label']

# print("Class sizes:")
# print(y.value_counts())

In [51]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# # Impute missing values
# imputer = SimpleImputer(strategy='median')
# X_train = imputer.fit_transform(X_train)
# X_test = imputer.transform(X_test)


# # Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# # Feature selection using SelectKBest with f_classif
selector = SelectKBest(score_func=f_classif, k=50)  # Select top 100 features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)


# Display the number of features after feature selection
print(f"Total number of features after selection: {X_train.shape[1]}")

# Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # Increased from 64 to 256 units
    Dropout(0.3),  # Increased dropout rate to handle more complex model
    Dense(256, activation='relu'),  # Added an additional Dense layer
    Dropout(0.2),  # Maintain dropout to prevent overfitting
    Dense(64, activation='relu'),  # Further layer to increase model depth
    Dropout(0.2),  # Consistent dropout rate for complexity
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])
# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=40, batch_size=10, validation_split=0.2, verbose=1)

Total number of features after selection: 50
Epoch 1/40


 1780 1781 1782 1785 1786 1788 1789 1790 1791 1792 1796 1798 1800 1801
 1804 1805 1806 1807 1810 1812 1813 1815 1816 1817 1818 1819 1822 1823
 1824 1826 1828 1832 1833 1835 1836 1837 1838 1841 1842 1843 1844 1849
 1851 1852 1853 1856 1857 1860 1861 1863 1868 1869 1870 1876 1877 1878
 1879 1880 1881 1882 1883 1884 1885 1888 1889 1890 1891 1892 1894 1895
 1896 1898 1901 1903 1905 1906 1907 1909 1910 1911 1912 1913 1914 1915
 1916 1918 1919 1920 1921 1922 1923 1924 1927 1929 1930 1931 1933 1934
 1937 1940 1942 1943 1944 1948 1951 1953 1956 1959 1962 1963 1966 1968
 1969 1972 1973 1978 1980 1982 1983 1984 1985 1986 1990 1991 1994 1996
 1997 1999 2000 2001 2003 2005 2006 2011 2012 2013 2014 2015 2017 2019
 2022 2023 2026 2028 2029 2030 2032 2034 2036 2039 2040 2043 2044 2045
 2046 2048 2050 2053 2054 2055 2056 2059 2062 2063 2065 2066 2068 2073
 2078 2081 2086 2088 2092 2093 2094 2096 2098 2099 2101 2103 2104] are constant.
  f = msb / msw


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [52]:
# Evaluate the model on the test data
results = model.evaluate(X_test, y_test)
print("Test Loss, Test Accuracy:", results)

Test Loss, Test Accuracy: [1.786791205406189, 0.752136766910553]
