In [1]:
import pickle

In [18]:
import sys
import numpy as np
import pandas as pd
#import Pyreadstat
import seaborn as sns
import matplotlib.pyplot as plt
from ete3 import ClusterTree
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.model_selection import train_test_split

In [None]:
# https://stackoverflow.com/questions/31033835/newick-tree-representation-to-scipy-cluster-hierarchy-linkage-matrix-format

def newick_to_linkage(newick: str) -> (np.ndarray, [str]):
    """
    Convert newick tree into scipy linkage matrix

    :param newick: newick string, e.g. '(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);'
    :returns: linkage matrix and list of labels
    """
    # newick string -> cophenetic_matrix
    tree = ClusterTree(newick)
    cophenetic_matrix, newick_labels = tree.cophenetic_matrix()
    cophenetic_matrix = pd.DataFrame(
        cophenetic_matrix, columns=newick_labels, index=newick_labels)

    # reduce square distance matrix to condensed distance matrices
    pairwise_distances = pdist(cophenetic_matrix)
    
    # return linkage matrix and labels
    return linkage(pairwise_distances), list(cophenetic_matrix.columns)


def readNewick(nwk):
    with open(nwk) as fh:
        return fh.readline()

In [None]:
cols = ([
    'id', 'aspA', 'glnA', 'gltA', 'glyA', 'pgm', 'tkt', 
    'uncA', 'ST (MLST)', 'clonal_complex (MLST)'
])
df = pd.read_excel('../../CC353_Analysis/10359_Dataframe.xlsx')[cols]
df['id'] = df['id'].astype(str)
df = df.set_index('id')

In [None]:
nwk = readNewick('../../CC353_Analysis/10359_grapetree/BIGSdb_024808_1423149961_99700_tree.nwk')

In [None]:
linkageMatrix, labels = newick_to_linkage(newick=nwk)

In [3]:
ngroup = 35

In [None]:
clusters = fcluster(linkageMatrix, t=ngroup, criterion='maxclust', depth=2, R=None, monocrit=None)
labelsID = pd.DataFrame(clusters, labels, columns=[f'fullNG-{ngroup}'])

In [None]:
df = pd.merge(df, labelsID, left_index=True, right_index=True).fillna('Unknown')

In [None]:
y = list(range(len(df)))
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [None]:
with open('../../Data/training_0.8idname.txt','w') as fh:
    for idname in X_train.index:
        print(idname, file=fh)

In [None]:
nwk = readNewick('../../Data/8284isolates_training_0.8_isolate_grapetree/BIGSdb_008959_0106465177_07730_tree.nwk')
linkageMatrix, labels = newick_to_linkage(newick=nwk)
clusters = fcluster(linkageMatrix, t=ngroup, criterion='maxclust', depth=2, R=None, monocrit=None)
labelsID = pd.DataFrame(clusters, labels, columns=[f'trainNG-{ngroup}'])

X_train = pd.merge(X_train, labelsID, left_index=True, right_index=True).fillna('Unknown')

In [None]:
X_train.to_excel("../../Data/X_train.xlsx")
X_test.to_excel("../../Data/X_test.xlsx")

In [11]:
X_trainAll = pd.read_excel("../../Data/X_train.xlsx")
X_testAll = pd.read_excel("../../Data/X_test.xlsx")

In [5]:
import catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
target = f'trainNG-{ngroup}'

y_train = X_trainAll.pop(target)
X_train = X_trainAll[['aspA', 'glnA', 'gltA', 'glyA', 'pgm', 'tkt', 'uncA']].copy().astype('category')
X_test = X_testAll[['aspA', 'glnA', 'gltA', 'glyA', 'pgm', 'tkt', 'uncA']].copy().astype('category')

In [7]:
#If you want to run the model this is the one
model = CatBoostClassifier(verbose=0)
model = model.fit(X_train, y_train, cat_features=list(X_train.columns))

In [12]:
X_testAll['prediction'] = model.predict(X_test)
X_testAll['predictionProb'] = model.predict_proba(X_test).max(axis=1)

In [32]:
sub = X_testAll.loc[X_testAll['predictionProb'] > 0.99].copy()
adjusted_rand_score(sub['prediction'], sub[f'fullNG-{ngroup}'])

0.9127698701382765

In [31]:
sub

Unnamed: 0.1,Unnamed: 0,aspA,glnA,gltA,glyA,pgm,tkt,uncA,ST (MLST),clonal_complex (MLST),fullNG-35,prediction,predictionProb
0,40796,8,10,2,2,11,12,6,354,ST-354 complex,1,22,0.999942
1,63360,1,4,2,2,6,3,17,61,ST-61 complex,31,32,0.999910
4,42089,2,4,1,2,7,1,5,48,ST-48 complex,18,17,0.999950
5,59481,2,1,12,687,2,1,5,8358,ST-21 complex,29,31,0.999953
6,76877,4,7,10,4,42,7,1,137,ST-45 complex,10,7,0.999974
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,41380,4,7,40,4,42,51,1,267,ST-283 complex,10,7,0.999973
2061,50348,4,7,10,4,1,7,1,45,ST-45 complex,10,7,0.999974
2065,24611,4,7,10,4,1,7,1,45,ST-45 complex,10,7,0.999974
2066,63838,1,4,2,2,6,3,17,61,ST-61 complex,31,32,0.999910


In [15]:
adjusted_rand_score(X_testAll['prediction'], X_testAll[f'fullNG-{ngroup}'])

0.879744301475557

In [None]:
labelledData['Prediction_Catboost'] = model.predict(X)
labelledData.to_excel("labelledCatBoost.xlsx") 

In [None]:
#Opening the model file 
model = pd.read_pickle(r'model.sav')

In [None]:
predicted_test = model.predict(X_test)

In [None]:
adjusted_rand_score(y_test,predicted_test.flatten())

In [None]:
labelledData = pd.read_excel("labelledCatBoost.xlsx")
# How accurately does the prediction match the tree?

In [None]:
labelledData.head()

In [None]:
# How accurately does the CC label match the tree?
adjusted_rand_score(labelledData['clonal_complex (MLST)'], labelledData['Group'])

In [None]:
# How accurately does the CC label match the tree?
adjusted_rand_score(y_pred.flatten(), y_test)

In [None]:
#This saved the model into pickle file 
#filename = 'model.sav'
#pickle.dump(model, open(filename, 'wb'))

In [None]:
with open('model.sav', 'rb') as file:
      
    # Call load method to deserialze
    model = pickle.load(file)


In [None]:
labelledData.groupby('Prediction_Catboost')['clonal_complex (MLST)'].agg(pd.Series.mode)

In [None]:
labelledData[labelledData['Prediction_Catboost'] == '8']

In [None]:
labelledData['Group'].unique()

In [None]:
len(labelledData['Prediction_Catboost'].unique())