In [None]:
import re
import warnings
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from glob import glob
from itertools import count
from pathlib import Path

import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import metrics as sk_metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.neighbors import NeighborhoodComponentsAnalysis

In [None]:
def lang_from_path(path):
    """Return the language code for a corpus file, i.e. its name without directory or `.xml`.

    Uses `Path.stem`, so it works with whichever path separator `glob`
    returns on the current platform.
    """
    return Path(path).stem


# Sorted so the language order (later used as the DataFrame index) does not
# depend on the order in which the filesystem happens to list the files.
langs = sorted(lang_from_path(path) for path in glob('xml/*.xml'))
# Book ids looked up as `b.<ID>` in the XML export cell.
books = ['MAT', 'MAR', 'LUK', 'JOH']

In [None]:
# Extract the plain text of the selected books from each XML corpus into
# txt/<lang>.txt, one <seg> per line.
# The loop is disabled: iterate over `langs` instead of `[]` to regenerate the files.
for lang in []:  # langs:
    print(lang)
    # Read through a context manager so the input handle is closed promptly.
    with open(f'xml/{lang}.xml', encoding='utf-8') as src:
        root = ET.fromstring(src.read())
    with open(f'txt/{lang}.txt', 'w', encoding='utf-8') as out:
        for book in books:
            # NOTE(review): `/*seg` is not standard XPath; confirm ElementTree
            # selects the intended <seg> elements with this expression.
            for seg in root.findall(f'.//div[@id="b.{book}"]/*seg'):
                # Join all nested text, then trim leading/trailing digits,
                # dashes and spaces before trimming remaining whitespace.
                out.write("".join(seg.itertext()).strip(" 1234567890-").strip() + '\n')

In [None]:
def line_to_words(line):
    """Split one line of text into lower-cased word tokens.

    `&quot;` entities are removed, then every character that is neither a
    word character nor a space is dropped. Splitting on runs of whitespace
    means blank lines and doubled spaces never produce an empty-string token.
    """
    cleaned = re.sub(r'[^\w ]', '', line.lower().replace('&quot;', ''))
    return cleaned.split()


def adjacency_edges(words):
    """Build the undirected word-adjacency edge set for a token sequence.

    Returns `(ids, edges)`: `ids` maps each word to an integer id assigned in
    order of first appearance, and `edges` holds one `(low_id, high_id)` tuple
    per pair of words that occur next to each other at least once.
    """
    ids = defaultdict(count().__next__)
    edges = set()
    for first, second in zip(words, words[1:]):
        source, target = ids[first], ids[second]
        # Store each unordered pair in one canonical orientation so that
        # (a, b) and (b, a) collapse into a single undirected edge.
        edges.add((min(source, target), max(source, target)))
    return dict(ids), edges


# Turn each txt/<lang>.txt into a word-adjacency graph saved as net/<lang>.net.
# The loop is disabled: iterate over `langs` instead of `[]` to regenerate the files.
for lang in []:  # langs:
    print(lang)
    words = []
    with open(f'txt/{lang}.txt', 'r', encoding='utf-8') as f:
        for line in f:
            words.extend(line_to_words(line))
    ids, edges = adjacency_edges(words)
    # Create the vertices together with the edges; an empty Graph() has no
    # vertices for the edge endpoints to refer to.
    g = ig.Graph(n=len(ids), edges=sorted(edges))
    g.save(f'net/{lang}.net')

In [None]:
# Unbound `ig.Graph` methods, keyed by the feature/column name they produce.
# Entries in `metrics` are stored as returned.
metrics = dict(
    vcount=ig.Graph.vcount,
    ecount=ig.Graph.ecount,
    density=ig.Graph.density,
    transitivity=ig.Graph.transitivity_undirected,
    assortativity_degree=ig.Graph.assortativity_degree,
    transitivity_avglocal=ig.Graph.transitivity_avglocal_undirected,
    # Left out because it is slow:
    #   ig.Graph.average_path_length  (9s)
)

# Entries in `meanMetrics` are reduced with `np.mean` by the metric-table cell.
meanMetrics = dict(
    degree=ig.Graph.degree,
    pagerank=ig.Graph.pagerank,
    coreness=ig.Graph.coreness,
    hub_score=ig.Graph.hub_score,
    constraint=ig.Graph.constraint,
    # NOTE(review): feedback_arc_set presumably returns edge ids rather than a
    # per-vertex score, so its mean may not be a meaningful feature - confirm.
    feedback_arc_set=ig.Graph.feedback_arc_set,
    # Left out as duplicates of the above:
    #   ig.Graph.strength, ig.Graph.authority_score,
    #   ig.Graph.personalized_pagerank, ig.Graph.eigenvector_centrality
    # Left out because they are slow:
    #   ig.Graph.closeness, ig.Graph.betweenness, ig.Graph.eccentricity,
    #   ig.Graph.similarity_dice, ig.Graph.edge_betweenness,
    #   ig.Graph.similarity_jaccard, ig.Graph.similarity_inverse_log_weighted
)

In [None]:
# Compute one feature row per language from its saved network.
# The loop is disabled: iterate over `langs` instead of `[]` to recompute.
rows = []
for lang in []:  # langs:
    print(lang)
    # Forward slash matches the path used when the networks are saved and
    # works on every platform (the old 'net\\' form was Windows-only).
    g = ig.load(f'net/{lang}.net')
    row = {name: metric(g) for name, metric in metrics.items()}
    # These metrics are averaged down to a single number per graph.
    row.update((name, np.mean(metric(g))) for name, metric in meanMetrics.items())
    rows.append(row)
# Rows are appended in `langs` order, so `langs` can serve as the index.
df = pd.DataFrame(rows, langs)
# df.to_csv('langs.csv')

In [None]:
# Load the saved feature table; the first CSV column becomes the index.
FEATURES_CSV = 'langs.csv'
df = pd.read_csv(FEATURES_CSV, index_col=0)
df

In [None]:
#df['subgenus'] = df['genus']+df['subgenus']
# Feature matrix: every column except the identifier and label columns.
X = df.drop(columns=['abbreviation', 'genus', 'subgenus'])

# Report class frequencies and the label -> integer-code mapping for each
# label column in turn. `y` is rebound on every pass, so after the loop it
# holds the 'subgenus' codes.
for label_column in ('genus', 'subgenus'):
    labels = df[label_column]
    print('Frequency', dict(Counter(labels).most_common()))
    y = labels.astype('category').cat.codes.values
    print('Mapping', dict(zip(labels, y)))
    if label_column == 'genus':
        print()
#y = (df['genus']+df['subgenus']).astype('category').cat.codes.values
#dict(zip(y, df['genus']))

In [None]:
# Project the features with LDA and plot the first two discriminant axes,
# coloured by the current label codes in `y`.
# The sklearn metrics module is available as `sk_metrics` (imported in the
# first cell); importing it here as `metrics` would overwrite the notebook's
# own `metrics` dict of igraph callables.
lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
X2 = lda.fit_transform(X, y)
plt.scatter(X2[:, 0], X2[:, 1], c=y)
plt.show()

#print(sk_metrics.silhouette_score(X2, y, random_state=0)) # higher is better
#print(sk_metrics.calinski_harabasz_score(X2, y)) # higher is better
#print(sk_metrics.davies_bouldin_score(X2, y)) # lower is better

In [None]:
#lda.explained_variance_ratio_
#lda.coef_
#lda.means_
#lda.priors_

In [None]:
# Project the features onto two NCA components and plot them, coloured by
# the current label codes in `y`.
# `NeighborhoodComponentsAnalysis` is imported in the first cell with the
# other dependencies.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=0)
X2 = nca.fit_transform(X, y)
plt.scatter(X2[:, 0], X2[:, 1], c=y)
plt.show()

# Optional cluster-quality diagnostics; `sk_metrics` is the sklearn metrics
# module (the bare name `metrics` is the notebook's dict of igraph callables).
#print(sk_metrics.silhouette_score(X2, y, random_state=0)) # higher is better
#print(sk_metrics.calinski_harabasz_score(X2, y)) # higher is better
#print(sk_metrics.davies_bouldin_score(X2, y)) # lower is better

In [None]:
# Leave-one-out evaluation of genus prediction: oversample the training fold
# with SMOTE, fit LDA on it and let LDA classify the held-out language.
y = df['genus'].astype('category').cat.codes.values
y_pred = []
for train_index, test_index in LeaveOneOut().split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Resample the training fold only, so the held-out sample never influences
    # the synthetic points. `n_jobs` is not passed: it is deprecated in
    # imbalanced-learn and does not affect the result.
    X_train, y_train = SMOTE(random_state=1, k_neighbors=1).fit_resample(X_train, y_train)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        dimRed = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto')
        X_train = dimRed.fit_transform(X_train, y_train)
        predicted = dimRed.predict(X_test)
        #X_test = dimRed.transform(X_test)
    # `predicted` is a length-1 array; extend keeps `y_pred` a flat list of labels.
    y_pred.extend(predicted)
    # Debug aid for misclassified samples (needs the `transform` line above
    # enabled so that X_test is in the projected space):
    # if predicted != y_test:
    #     print(X_test, y_test, predicted)
    #     plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
    #     plt.scatter(X_test[:, 0], X_test[:, 1], c='red')
    #     plt.show()
y_pred = np.asarray(y_pred)
# Accuracy: fraction of held-out samples whose predicted code equals the true one.
print(np.mean(y == y_pred))
cm = confusion_matrix(y, y_pred)
print(cm)

In [None]:
# Leave-one-out evaluation of subgenus prediction: reduce each training fold
# to 4 LDA components, then classify the held-out language with extra trees.
y = df['subgenus'].astype('category').cat.codes.values
y_pred = []
for train_index, test_index in LeaveOneOut().split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        dimRed = LinearDiscriminantAnalysis(solver='eigen', shrinkage='auto', n_components=4)
        # Fit the projection on the training fold only, then apply it to the
        # held-out sample.
        X_train = dimRed.fit_transform(X_train, y_train)
        X_test = dimRed.transform(X_test)
    clf = ExtraTreesClassifier(random_state=42, n_jobs=-1)
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    # `predicted` is a length-1 array; extend keeps `y_pred` a flat list of labels.
    y_pred.extend(predicted)
y_pred = np.asarray(y_pred)
# Accuracy: fraction of held-out samples whose predicted code equals the true one.
print(np.mean(y == y_pred))
cm = confusion_matrix(y, y_pred)
print(cm)