Part A: Feature Extraction and Clustering

An auto-encoder (AE) is a neural network that learns to copy its input to its output. One of the main uses of AEs is dimensionality reduction: extracting meaningful features in the latent space (the code layer). Representing data in a lower-dimensional space can improve performance on different tasks, such as classification and clustering.
1. First, you should construct an AE with 3 neurons in the latent space and train the network with the given dataset.
2. Then, feed the data to the network and extract features from the latent space.
3. Implement the k-means (with k=3 and k=4) and Gaussian mixture model (GMM) clustering methods, and compare the results according to the Davies–Bouldin index (DBI) criterion.
4. Repeat the steps above for a network with 5 neurons in the latent space.

In [None]:
import csv
import numpy as np
import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import davies_bouldin_score as DBI

# Load the Spellman CSV into memory.  The `with` block guarantees the file
# handle is closed as soon as the rows are read (previously it stayed open for
# the life of the kernel); newline='' is the mode the csv module documents for
# files handed to csv.reader.
with open('/kaggle/input/gene-expression-bioinformatics-dataset/Spellman.csv',
          mode='r', newline='') as spellman_file:
    spellman_csv = csv.reader(spellman_file)
    spellman = list(spellman_csv)

# Note (example of writing a file): F = open('file', mode='w')
# Note (example of writing a file): F.write(str(spellman))

# Drop the first row and the first column (presumably the header and the
# gene-name column -- confirm against the CSV), then parse every remaining
# cell as a float.
spellman = np.array(spellman)[1:, 1:].astype(float)

# Note (pure-Python alternative to the NumPy slice): spellman = [x[1:] for x in spellman[1:]]
# Note (pure-Python alternative to the NumPy slice): print(spellman)

# Hold out 20% of the rows for validation; the split is shuffled and unseeded,
# so it differs from run to run.
spell_train, spell_test = train_test_split(spellman, test_size=0.2)

# Width of the bottleneck (code) layer.
latent_units = 3
# Number of features per row, taken from the loaded data instead of repeating
# the literal 23 in two places (the original hard-coded 23 both here and in
# the Input shape), so the input and reconstruction widths cannot drift apart.
orig_dim = spellman.shape[1]

# Single-hidden-layer autoencoder: input -> latent (ReLU) -> reconstruction.
input_gene = keras.Input(shape=(orig_dim,))
encoded = keras.layers.Dense(latent_units, activation='relu')(input_gene)
decoded = keras.layers.Dense(orig_dim, activation='sigmoid')(encoded)
# Note (sub-module spelling): layers.Dense(...) is equivalent to keras.layers.Dense(...)
autoencoder = keras.Model(input_gene, decoded)

# Encoder shares the input and the first Dense layer with the autoencoder, so
# training the autoencoder also trains the encoder.
encoder = keras.Model(input_gene, encoded)

# Stand-alone decoder that reuses the autoencoder's last (output) layer.
encoded_input = keras.Input(shape=(latent_units,))
decoder_layer = autoencoder.layers[-1]
decoder = keras.Model(encoded_input, decoder_layer(encoded_input))

# NOTE(review): a sigmoid output with binary cross-entropy presumes targets in
# [0, 1]; no scaling is applied to the data before this point -- confirm the
# value range or switch to a linear output with an MSE loss.
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# The network is trained to reproduce its own input; the held-out split is
# used only to report validation loss.
autoencoder.fit(spell_train, spell_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(spell_test, spell_test))

# Project every row of the dataset into the latent space and show the result.
encoded_genes = encoder.predict(spellman)
print(encoded_genes)

# --- k-means on the latent features (k = 3, fixed seed) ---
print("**************************************************************************")
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(encoded_genes)
print(kmeans.labels_)

# Davies-Bouldin index of the k-means partition (lower is better).
score = DBI(encoded_genes, kmeans.labels_)
print(f'score: {score!s}')

# --- Gaussian mixture model on the same features (3 components, fixed seed) ---
print("##########################################################################")
gmm = GaussianMixture(n_components=3, random_state=0)
labels = gmm.fit_predict(encoded_genes)
print(labels)

# Davies-Bouldin index of the GMM partition, for comparison with k-means.
score = DBI(encoded_genes, labels)
print(f'score: {score!s}')


# Convert kmeans.labels_ to a string before displaying it.
# Decide on the desired layout, the selection, and the output format.
# Work out how the data should be arranged.
# F = open('file end', mode='w')
# Before printing, arrange the data according to the desired format, and only then write it out.
# F.write(str(kmeans.labels_))

Part B: Gene Ontology (GO)

According to the DBI criterion, select the best result from Part A and determine what information can be extracted from each cluster. Use gene ontology (GO) for each identified cluster:
• Go to the g:Profiler website, http://biit.cs.ut.ee/gprofiler/gost. In the left box, enter the gene names of your cluster (whitespace-separated).
• In the options, choose Saccharomyces cerevisiae from the organism box.
• Click the Run query button.
• The results are sorted by p-values in ascending order. Draw a table to list the top 3 GO categories, showing the Term-name, Term-ID, and p-value in each column.

In [None]:
import csv
import numpy as np
import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# Read all CSV rows inside a context manager so the file handle is released
# immediately after loading (it was previously never closed).  newline='' is
# the mode the csv module documents for files handed to csv.reader.
with open('/kaggle/input/gene-expression-bioinformatics-dataset/Spellman.csv',
          mode='r', newline='') as spellman_file:
    spellman_csv = csv.reader(spellman_file)
    spellman = list(spellman_csv)

# Discard the first row and the first column (presumably the header and the
# gene-name column -- confirm against the CSV) and convert the rest to floats.
spellman = np.array(spellman)[1:, 1:].astype(float)

# Note (pure-Python alternative to the NumPy slice): spellman = [x[1:] for x in spellman[1:]]
# Note: print(spellman)

# 80/20 train/validation split; shuffled and unseeded, so it varies per run.
spell_train, spell_test = train_test_split(spellman, test_size=0.2)

# Size of the latent (code) layer.
latent_units = 3
# Feature count read from the data itself; the original repeated the literal
# 23 both here and in the Input shape, which invites the two drifting apart.
orig_dim = spellman.shape[1]

# Autoencoder: input -> Dense(latent_units, ReLU) -> Dense(orig_dim, sigmoid).
input_gene = keras.Input(shape=(orig_dim,))
encoded = keras.layers.Dense(latent_units, activation='relu')(input_gene)
decoded = keras.layers.Dense(orig_dim, activation='sigmoid')(encoded)
# Note: the same layers can be written with the sub-module import, e.g.
# layers.Dense(latent_units, activation='relu')(input_gene)
autoencoder = keras.Model(input_gene, decoded)

# The encoder reuses the autoencoder's input and bottleneck layer, so it is
# trained implicitly when the autoencoder is fitted.
encoder = keras.Model(input_gene, encoded)

# Separate decoder model wrapping the autoencoder's final layer.
encoded_input = keras.Input(shape=(latent_units,))
decoder_layer = autoencoder.layers[-1]
decoder = keras.Model(encoded_input, decoder_layer(encoded_input))

# NOTE(review): binary cross-entropy with a sigmoid output presumes targets in
# [0, 1]; the data is not rescaled before training -- confirm its value range
# or use a linear output with an MSE loss instead.
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
# Input and target are the same array: the model learns to reconstruct it.
autoencoder.fit(spell_train, spell_train,
                epochs=50,
                batch_size=256,
                shuffle=True,
                validation_data=(spell_test, spell_test))

# Encode the full dataset rather than only the held-out split.  `spell_test`
# is a shuffled 20% subset, so clustering it labels only a fraction of the
# rows and the labels cannot be traced back to rows of the CSV.  Encoding
# `spellman` keeps the labels in the same order as the data rows of the file
# and matches what Part A clusters.
encoded_genes = encoder.predict(spellman)
print(encoded_genes)

# k-means with 3 clusters and a fixed seed; labels_[i] is the cluster assigned
# to row i of `spellman`.
kmeans = KMeans(n_clusters=3, random_state=0).fit(encoded_genes)
print(kmeans.labels_)