In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Dataset to analyse and the input files derived from its name.
dataset = 'kumar-8-hard'
data_path = f"./dataset/{dataset}-filtered/10X/"
labels_path = f"./dataset/{dataset}-filtered/labels.csv"
markers_path = f"./results/aggregate/{dataset}/markers.csv"

In [None]:
# Load the 10X count matrix from `data_path`, naming variables by gene symbol
# and bypassing scanpy's on-disk cache.
adata = sc.read_10x_mtx(data_path, cache=False, var_names='gene_symbols')

In [None]:
# Read the label table; the first CSV column becomes the index.
y_df = pd.read_csv(filepath_or_buffer=labels_path, index_col=0)
y_df

In [None]:
# Align the labels with the cell order of `adata`: one row per cell, looked up
# in the label table by cell name (a left join, so cells without a label get
# NaN). The label table is re-read here instead of being taken from `y_df`, so
# re-running this cell does not try to join the already-joined frame onto
# itself (which fails with overlapping columns).
y_df = pd.DataFrame(adata.obs_names, columns=["cell"]).join(
    pd.read_csv(labels_path, index_col=0), on="cell"
)
y_df

In [None]:
# Boolean mask over cells: True where a cluster id is present (not NaN).
mask = np.logical_not(np.isnan(y_df['cluster.ids'].to_numpy()).ravel())
# Show the mask entries of the unlabelled cells, if any.
mask[~mask]

In [None]:
# Cluster ids of the labelled cells only.
y_df.loc[mask, 'cluster.ids']

In [None]:
# Label vector for the labelled cells, as a standalone NumPy array.
y = y_df.loc[mask, 'cluster.ids'].to_numpy(copy=True)

In [None]:
# Distinct cluster ids (sorted) and the number of labelled cells in each.
clusters, counts = np.unique(y, return_counts=True)
(clusters, counts)

In [None]:
# Cluster sizes ordered by ascending cluster id; used as weights when
# averaging the per-cluster F1 scores.
weights = np.take(counts, np.argsort(clusters))

In [None]:
def apply_classifier(X, y, cv=5, random_state=None):
    """Cross-validated F1 score of a random forest on a binary problem.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Binary labels. ``f1_score`` is called with its default averaging,
        which scores the positive class only, so multiclass labels are
        rejected.
    cv : int, default 5
        Passed straight to ``cross_val_predict``.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the previous unseeded behaviour;
        pass an integer to make the score repeatable between runs.

    Returns
    -------
    float
        F1 score of the out-of-fold predictions against ``y``.
    """
    clf = RandomForestClassifier(random_state=random_state)
    y_pred = cross_val_predict(clf, X, y, cv=cv)
    return f1_score(y, y_pred)

markers_df = pd.read_csv(markers_path)
tools = markers_df.tool.unique()


def _weighted_mean_f1(f1_per_cluster):
    """Average per-cluster F1 scores weighted by `weights`, rounded to 3 decimals."""
    return round((weights*np.array(f1_per_cluster)).sum()/weights.sum(), 3)


# One-vs-rest evaluation per cluster, using only the genes each tool reported
# as markers for that cluster.
f1_markers = {}
for tool in tools:
    f1_markers_tool = []
    for cluster in clusters:
        y_bin = np.array(y==cluster, dtype=int)
        markers = markers_df[
            (markers_df['cluster']==cluster) & (markers_df['tool']==tool)
        ].gene.unique()
        X_markers = adata[mask, markers].X.toarray()
        f1_markers_tool.append(apply_classifier(X_markers, y_bin))
    f1_markers[tool] = _weighted_mean_f1(f1_markers_tool)

# Baseline: the same one-vs-rest evaluation on every gene. The dense matrix
# does not depend on the cluster, so it is built once instead of per iteration.
X_all = adata[mask, :].X.toarray()
f1_all = []
for cluster in clusters:
    y_bin = np.array(y==cluster, dtype=int)
    f1_all.append(apply_classifier(X_all, y_bin))
f1_weighted = _weighted_mean_f1(f1_all)

print("F1 weighted when training on markers")
print(f1_markers)
print("F1 weighted when training on all genes")
print(f1_weighted)

In [66]:
# -------- train with increasing # of features taken from markers rank --------
scores = {}
step = 1
tools = ['scvi', 'seurat', 'scanpy', 'monocle']
for tool in tools:
    print('Tool: ', tool)
    scores[tool] = {}
    for cluster in clusters:
        print('Cluster: ', cluster)
        # Markers that *this* tool reported for *this* cluster. Counting the
        # rows of every tool for the cluster made the sweep longer than the
        # tool's own ranking (assuming ranks run 1..k within each
        # tool/cluster -- TODO confirm), so the tail of the sweep re-trained
        # on an unchanged gene set.
        tool_cluster_markers = markers_df[
            (markers_df['cluster']==cluster) & (markers_df['tool']==tool)
        ]
        n_markers = len(tool_cluster_markers)
        y_bin = np.array(y==cluster, dtype=int)
        cluster_scores = []
        for i in range(step, n_markers+step, step):
            # Keep only the markers ranked i or better.
            markers = tool_cluster_markers[
                tool_cluster_markers['rank']<=i
            ].gene.unique()
            X = adata[mask, markers].X.toarray()
            cluster_scores.append(apply_classifier(X, y_bin))
        scores[tool][cluster] = {
            'scores': cluster_scores,
            # mean for each cluster; stored as a one-element list so the
            # layout of `scores` is unchanged. Empty when the tool has no
            # markers for the cluster.
            'mean': [np.mean(cluster_scores)] if cluster_scores else [],
        }

# Mean F1 across clusters, weighted by cluster size, for each sweep position.
# Positions are indexed directly (not via `i-1`) so `step` values other than 1
# work, and the sweep length is taken from the stored scores instead of
# whatever `n_markers` was left over from the last tool/cluster above. When
# clusters have sweeps of different lengths only the common prefix is averaged.
total_count = np.sum(counts)
for tool in tools:
    per_cluster_scores = [scores[tool][cluster]['scores'] for cluster in clusters]
    n_steps = min((len(s) for s in per_cluster_scores), default=0)
    scores[tool]['TotalMean'] = [
        sum(s[k] * counts[j] for j, s in enumerate(per_cluster_scores)) / total_count
        for k in range(n_steps)
    ]

# x_ticks = [i for i in range(step, n_markers+step, step)]
# # 4 axes plot
# fig, ax = plt.subplots(2, 2, figsize=(10, 10))
# plt.xticks(x_ticks, x_ticks)
# plt.plot(x_ticks, scores, marker='o')
# plt.ylabel("f1 weighted")
# plt.xlabel("# top features from each tool")
# plt.grid()
# plt.tight_layout()
# plt.show()
# plt.savefig(out_path+"score.eps")

# pd.DataFrame(scores, columns=['f1 weighted']).to_csv(out_path+"clf_score.csv")

Tool:  scvi
Cluster:  1
Cluster:  2
Cluster:  3
Cluster:  4
Cluster:  5


{'scvi': {1: {'scores': [0.819277108433735,
    0.8670520231213872,
    0.898876404494382,
    0.898876404494382,
    0.9050279329608939,
    0.893854748603352,
    0.8876404494382023,
    0.8876404494382023,
    0.8729281767955801,
    0.8633879781420767],
   'mean': [0.8794561675922192]},
  2: {'scores': [0.9666666666666667,
    0.9833333333333333,
    0.9836065573770492,
    0.9917355371900827,
    0.9917355371900827,
    0.9917355371900827,
    0.9917355371900827,
    0.9833333333333333,
    0.9917355371900827,
    0.9833333333333333],
   'mean': [0.9858950909994129]},
  3: {'scores': [0.9838709677419355,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561,
    0.975609756097561],
   'mean': [0.9764358772619985]},
  4: {'scores': [0.9364161849710984,
    0.9824561403508771,
    0.9824561403508771,
    0.9854227405247813,
    0.9854227405247813,
    0.98

In [67]:
# Cluster-size-weighted mean F1 of the 'scvi' markers, one value per step of
# the rank sweep.
scores['scvi']['TotalMean']

[0.9396118242197339,
 0.9576839930532023,
 0.9634519372267307,
 0.9643278224590667,
 0.9668472169346177,
 0.9717606314674869,
 0.9715910147866155,
 0.9699346241384056,
 0.9700591049238534,
 0.9660825298320517]

In [None]:
# -------- train on all features and on markers --------

def _report_and_importances(X, y):
    """Return ``(report, importances)`` for a random forest on ``X`` / ``y``.

    ``report`` is the dict form of scikit-learn's classification report,
    computed on 5-fold cross-validated predictions. ``importances`` are the
    ``feature_importances_`` of a forest fitted on all of ``X``.
    """
    y_pred = cross_val_predict(RandomForestClassifier(), X, y, cv=5)
    report = classification_report(y, y_pred, output_dict=True)
    importances = RandomForestClassifier().fit(X, y).feature_importances_
    return report, importances


# apply_classifier() returns a single binary F1 score, so its result cannot be
# unpacked into (report, importances), and it rejects the multiclass labels in
# `y`. This cell therefore uses the multiclass helper above.
X_all = adata[mask, :].X.toarray()
report_all, feature_importance = _report_and_importances(X_all, y)
# NOTE(review): `markers` is whatever the last cell that assigned it left
# behind (in the visible cells: the markers of one tool for one cluster) --
# confirm which gene set is intended here.
report_markers, _ = _report_and_importances(adata[mask, markers].X.toarray(), y)

# NOTE(review): `out_path` is not assigned in any cell visible here -- confirm
# it is defined before this cell runs.
pd.DataFrame(report_all).transpose().to_csv(out_path+"clf_report_all.csv")
pd.DataFrame(report_markers).transpose().to_csv(out_path+"clf_report_markers.csv")

# Genes ordered from most to least important according to the forest.
sorted_idx = (-feature_importance).argsort()
rf_features_sorted = adata.var_names[sorted_idx]
importaces_sorted = feature_importance[sorted_idx]
pd.DataFrame(
    {'genes' : rf_features_sorted, 'importaces' : importaces_sorted}
    ).to_csv(out_path+"importances.csv")

In [None]:
# -------- select n_markers*n_clusters features with RFE and RF --------

# `n_clusters` is not assigned in any visible cell, so the number of clusters
# is taken from `clusters` directly.
# NOTE(review): `n_markers` is the value left over from the rank-sweep cell and
# `out_path` is not assigned in any visible cell -- confirm both before running.
selector = RFE(
    RandomForestClassifier(),
    n_features_to_select=n_markers*len(clusters),
    step=0.5,
)
selector.fit(X_all, y)
# Genes ordered by RFE rank (selected features have rank 1).
sorted_idx = (selector.ranking_).argsort()
rfe_features_sorted = adata.var_names[sorted_idx]
pd.DataFrame(
    {'genes' : rfe_features_sorted}
    ).to_csv(out_path+"rfe_ranking.csv")

# automatically choose the number of features
# rfe = RFECV(estimator=RandomForestClassifier())
# rfe.fit(X_all, y)
# rfe.show()

# TODO:
# - evaluate the intersection
# - assess the quality of the ranking by training with markers further down the ranking?

# important_features = features_sorted[0:120]
# important_features = [f for f, i in zip(features_sorted, importaces_sorted) if i >= importaces_sorted[119]]
# intersection = set(markers).intersection(set(important_features))

#        random forest rank                    tool rank
# gene1         100 *                         * 1 - (0-20)
# gene2         1 *                            
# gene3         1 *                            
# gene4         0 *                            