In [1]:
import multiprocessing as mp
import os

import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
#import joblib

<h3>Changing dtype to reduce memory footprint, shifting X and Y coordinates to (0,0) and getting picture dimensions

In [None]:
def _downcast_checked(column: pd.Series, dtype: type) -> pd.Series:
    """Cast a numeric column to a smaller integer dtype, refusing to wrap around.

    Casting to a narrower integer type with ``astype`` can silently wrap
    values that do not fit, which would corrupt IDs, counts and coordinates
    without any warning. The value range is therefore checked first.

    Raises:
        ValueError: if the column's minimum or maximum lies outside the range
            representable by ``dtype``.
    """
    limits = np.iinfo(dtype)
    lowest, highest = column.min(), column.max()
    if lowest < limits.min or highest > limits.max:
        raise ValueError(
            f"column '{column.name}' has range [{lowest}, {highest}], "
            f"which does not fit in {np.dtype(dtype).name}"
        )
    return column.astype(dtype)


# The first CSV column is read as the index and then discarded in favour of a
# fresh 0..n-1 index.
data = pd.read_csv('./dataset.csv', index_col=0).reset_index(drop=True)

# Shrink the numeric columns to reduce the memory footprint.
data['CellID'] = _downcast_checked(data['CellID'], np.uint16)
data['ExonCount'] = _downcast_checked(data['ExonCount'], np.uint8)
data['MIDCount'] = _downcast_checked(data['MIDCount'], np.uint8)
data['x'] = _downcast_checked(data['x'], np.uint16)
data['y'] = _downcast_checked(data['y'], np.uint16)

unique_cells = data['CellID'].unique()
unique_genes = data['geneID'].unique()
print(f'{len(unique_cells)} unique cells and {len(unique_genes)} unique genes')

# Shift the coordinates so that the smallest x and y become 0.
xmin, ymin = data['x'].min(), data['y'].min()
data['x'], data['y'] = data['x'] - xmin, data['y'] - ymin

# Picture dimensions (largest coordinate + 1). Converted to plain Python ints
# so the "+ 1" cannot overflow the uint16 scalar returned by max().
xmax, ymax = int(data['x'].max()) + 1, int(data['y'].max()) + 1

<h3>Generating picture of cells

In [None]:
def generate_picture() -> np.ndarray:
    """Rasterize every (x, y) record of ``data`` into a binary image, show it and save it.

    Reads the module-level ``data`` (columns 'x' and 'y'), ``xmax`` and ``ymax``.
    The image is written to './cells.tiff' only if 'cells.tiff' does not exist
    yet, so an existing file is never overwritten.

    Returns:
        uint8 array of shape (xmax, ymax) holding 1 at every (x, y) present in
        ``data`` and 0 everywhere else.
    """
    picture = np.zeros(shape=(xmax, ymax), dtype=np.uint8)

    # One fancy-indexed assignment replaces a per-row Python loop; coordinates
    # that occur several times simply set the same pixel to 1 again.
    picture[data['x'].to_numpy(), data['y'].to_numpy()] = 1

    plt.figure(figsize=(xmax/100, ymax/100))
    plt.imshow(picture, cmap='gray')
    plt.axis('off')
    plt.show()
    if not os.path.exists('cells.tiff'):
        plt.imsave('./cells.tiff', picture, cmap='gray')

    return picture

picture = generate_picture()

<h3>Creating a new cells × genes matrix and filling it with the MIDCount values</h3>

In [None]:
def midcount_filling() -> int:
    """Build the cells x genes MIDCount matrix and save it to 'unloged_matrix.csv'.

    Reads the module-level ``data``, ``unique_cells`` and ``unique_genes``.
    Rows are labelled with ``unique_cells`` and columns with ``unique_genes``.

    Returns:
        Number of rows (cells) of the saved matrix.
    """
    matrix = np.zeros((len(unique_cells), len(unique_genes)), dtype=np.uint8)

    # factorize() numbers values in order of first appearance, the same order
    # Series.unique() uses, so the codes line up with unique_cells/unique_genes.
    # The codes are used as returned: squeezing them into uint16 would wrap
    # around as soon as there are more than 65535 distinct values.
    cell_indices = pd.factorize(data['CellID'])[0]
    gene_indices = pd.factorize(data['geneID'])[0]
    midcount = data['MIDCount'].to_numpy(dtype=np.uint8)

    # NOTE(review): when the same (cell, gene) pair occurs in several records,
    # this assignment keeps a single value instead of accumulating them.
    # Confirm that overwriting (rather than summing) is intended.
    matrix[cell_indices, gene_indices] = midcount

    unloged_matrix = pd.DataFrame(matrix, index=unique_cells, columns=unique_genes, dtype=np.uint8)
    unloged_matrix.to_csv('unloged_matrix.csv')

    unloged_matrix.info()
    # shape[0] is the row count; last_valid_index() would return the label of
    # the last row (a CellID) instead.
    return unloged_matrix.shape[0]

unloged_matrix_number_of_rows = midcount_filling()

half_rows = unloged_matrix_number_of_rows // 2
second_half_rows = unloged_matrix_number_of_rows - half_rows

<h3>Separate loader to reduce memory usage</h3>

In [2]:
def load_csv_data(file_name : str, option: str, dtype : "str | type | None" = None) -> "pd.DataFrame | np.ndarray":
    '''
    Load a CSV file (its first column becomes the index) in the requested form.

    Type of options:\n
    pddf -> returns loaded data as pd.DataFrame\n
    pddft -> returns loaded data as pd.DataFrame, transposed\n
    npnd -> returns loaded data as np.ndarray of dtype\n

    dtype is only used by the npnd option; None lets NumPy pick the dtype.

    Raises ValueError for any other option, before the file is read, instead
    of silently returning None.
    '''
    if option not in ('pddf', 'pddft', 'npnd'):
        raise ValueError(f"unknown option '{option}', expected 'pddf', 'pddft' or 'npnd'")

    data = pd.read_csv(file_name, index_col=0)

    if option == 'pddf':
        return data
    if option == 'pddft':
        return data.T
    return np.array(data.values, dtype=dtype)

<h3>Comparing gene similarity across all cells: if two different genes have a similarity at or above the threshold, the second one is marked for dropping.
After the first transformation the number of genes may be drastically reduced; it depends on the similarity percentage that needs to be matched.<br>
The point is to reduce the number of features with minimal loss of meaning.</h3>


In [None]:
cnt = 1

def transformationR(similarity : float, file : str | pd.DataFrame) -> pd.DataFrame:
    rows_to_drop : list = []
    if isinstance(file, str):
        transponed_data : pd.DataFrame = load_csv_data(file, option='pddft')
    if isinstance(file, pd.DataFrame):
        transponed_data = file.T

    global cnt

    print(cnt)
    print(transponed_data.shape)
    
    #if cnt >= file.shape[0]:
    #    reduced.to_csv('reduced_matrix.csv')
    #    return transponed_data.T 

    row = transponed_data.index[cnt-1]

    current_row = transponed_data.loc[row].values

    for next_row in transponed_data.index[cnt:]: 
        second_row = transponed_data.loc[next_row].values

        match = (current_row == second_row).sum() / second_row.__len__()
        
        if match >= similarity:
            if match not in rows_to_drop:
                rows_to_drop.append(next_row)
    
    transponed_data.drop(rows_to_drop, inplace=True)
    cnt += 1

    return transponed_data.T 

reduced = transformationR(0.999, 'unloged_matrix.csv')

# Each call uses one more gene as the reference (tracked by the global cnt).
# At most 2000 further passes are run, but the loop stops early once cnt points
# past the last remaining gene, because then there is no reference gene left.
for i in range(2000):
    if cnt > reduced.shape[1]:
        break
    reduced = transformationR(0.999, reduced)

reduced.to_csv('reduced_matrix.csv')

<h3>Log-transforming the cells/genes matrix</h3>

In [None]:
def loging_matrix() -> None:
    """Log-transform 'reduced_matrix.csv' and save it as 'reduced_loged_matrix.csv'.

    Positive entries are replaced by their natural logarithm (so a value of 1
    becomes 0); entries that are not positive, i.e. the zeros of the count
    matrix, are stored as -1. Index and column labels are kept.
    """
    data_chunk = load_csv_data('reduced_matrix.csv', 'pddf', None)
    matrix = np.array(data_chunk.values, dtype=np.float32)

    # Start from the -1 sentinel and take the logarithm only where it is
    # defined; this avoids evaluating log on non-positive values (and the
    # RuntimeWarning that comes with it).
    loged_values = np.full_like(matrix, -1)
    np.log(matrix, out=loged_values, where=matrix > 0)

    loged_matrix = pd.DataFrame(loged_values, index=data_chunk.index, columns=data_chunk.columns, dtype=np.float32)
    loged_matrix.to_csv('reduced_loged_matrix.csv')

loging_matrix()

<h3>PCA transformation

In [None]:
def pca_(data : pd.DataFrame) -> np.ndarray:
    """Project ``data`` onto its principal components.

    A PCA model with default settings is fitted on ``data`` and the
    transformed data is returned.

    NOTE(review): the original body only constructed the PCA object and
    returned nothing; returning the projection is the presumed intent.
    Confirm before relying on it.
    """
    pca = PCA()
    return pca.fit_transform(data)

<h3>Clustering

In [None]:
def clustering(n_clusters : int = 14, file_name : str = 'reduced_loged_matrix.csv') -> np.ndarray:
    """Cluster the rows of a matrix CSV with Birch and print the silhouette score.

    n_clusters: number of clusters requested from Birch (default 14).
    file_name: CSV readable by load_csv_data (default 'reduced_loged_matrix.csv').

    Returns the cluster label of every row, in row order.
    """
    X = load_csv_data(file_name, option='pddf')
    model = Birch(n_clusters=n_clusters)
    model.fit(X)

    # Silhouette score of the labels on the same data the model was fitted on.
    print(silhouette_score(X, model.labels_))

    return model.labels_

labels = clustering()

<h3>Measuring the silhouette score; values range from -1 to 1.</h3>

In [None]:
def score(labels : list) -> None:
    """Print the silhouette score of ``labels`` on the matrix in 'reduced_matrix.csv'.

    labels: one cluster label per row of that matrix.

    NOTE(review): the score is computed on 'reduced_matrix.csv', while the
    clustering cell fits on 'reduced_loged_matrix.csv' - confirm that scoring
    on the un-logged matrix is intended.
    """
    features = load_csv_data('reduced_matrix.csv', option='pddf')
    print(silhouette_score(features, labels))

score(labels)

<h3>Creating new picture of clustered cells

In [None]:
def generate_clustered_picture() -> np.ndarray:
    """Paint every (x, y) record of ``data`` with the cluster of its cell, show and save the image.

    Reads the module-level ``data`` (columns 'x', 'y', 'CellID'), ``labels``
    (one cluster label per entry of ``unique_cells``, in the same order),
    ``unique_cells``, ``xmax`` and ``ymax``. The image is written to
    'clustered_cells.tiff'.

    Returns:
        uint8 array of shape (xmax, ymax). 0 marks background; a pixel that
        belongs to cluster k holds k + 1, so that cluster 0 stays
        distinguishable from the background.
    """
    cluster_of_cell = pd.Series(np.asarray(labels), index=unique_cells)
    n_clusters = int(cluster_of_cell.max()) + 1

    # Look up the cluster of every record at once instead of row by row.
    # Value 0 is reserved for background, hence the + 1.
    pixel_values = cluster_of_cell.loc[data['CellID'].to_numpy()].to_numpy() + 1

    picture = np.zeros(shape=(xmax, ymax), dtype=np.uint8)
    picture[data['x'].to_numpy(), data['y'].to_numpy()] = pixel_values

    # Black for background, then one tab20 colour per cluster (the palette is
    # reused cyclically if there are more clusters than colours).
    base_colors = plt.get_cmap('tab20').colors
    palette = [(0, 0, 0)] + [base_colors[k % len(base_colors)] for k in range(n_clusters)]
    cmap = mcolors.ListedColormap(palette)
    # Half-integer limits make every integer pixel value v map exactly to palette[v].
    vmin, vmax = -0.5, n_clusters + 0.5

    plt.figure(figsize=(xmax/100, ymax/100))
    # 'nearest' keeps the labels categorical; interpolating would blend label values.
    plt.imshow(picture, cmap=cmap, vmin=vmin, vmax=vmax, interpolation='nearest')
    plt.axis('off')
    colorbar = plt.colorbar(ticks=np.arange(n_clusters + 1))
    colorbar.set_ticklabels(['background'] + [str(k) for k in range(n_clusters)])
    plt.show()
    plt.imsave('clustered_cells.tiff', picture, cmap=cmap, vmin=vmin, vmax=vmax)

    return picture

clustered_picture = generate_clustered_picture()
