# Classify the documents using the graph of words

In [6]:
import pandas as pd
import networkx as nx
import itertools

## Load the graph

In [3]:
# Load the previously pickled graph from 'graph_w15.pkl' (path relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
G = nx.read_gpickle('graph_w15.pkl')

In [5]:
# Report how large the loaded graph is (Graph.size() with no weight is the edge count).
edge_count, node_count = G.size(), len(G.nodes())
print('Nb of edges: {}, nb of nodes: {}'.format(edge_count, node_count))

Nb of edges: 1901537, nb of nodes: 206222


In [11]:
# Build the document graph GD: two documents are linked when they both appear
# under the 'paths' attribute of the same node of G, and the edge weight counts
# how many nodes of G they share in this way.
GD = nx.Graph()
for _, attrs in G.nodes(data=True):
    # Nodes without a 'paths' attribute carry no document information.
    if 'paths' not in attrs.keys():
        continue
    doc_ids = attrs['paths'].keys()
    # A single document cannot form a pair.
    if len(doc_ids) <= 1:
        continue
    for doc_a, doc_b in itertools.combinations(doc_ids, 2):
        if GD.has_edge(doc_a, doc_b):
            GD[doc_a][doc_b]['weight'] += 1
        else:
            GD.add_edge(doc_a, doc_b, weight=1)

In [13]:
# Report how large the document graph is (Graph.size() with no weight is the edge count).
edge_count, node_count = GD.size(), len(GD.nodes())
print('Nb of edges: {}, nb of nodes: {}'.format(edge_count, node_count))

Nb of edges: 3684444, nb of nodes: 3665


In [15]:
import os
import sys
# Location of the local Grevia checkout. The original absolute path is kept as
# the default so existing setups behave the same; set the GREVIA_PATH
# environment variable to run the notebook on another machine.
dossier = os.environ.get('GREVIA_PATH', '/home/benjamin/Documents/eviacybernetics/Projets/Grevia')
# Re-running the cell must not pile up duplicate entries in sys.path.
if dossier not in sys.path:
    sys.path.append(dossier)
import grevia

In [20]:
# Minimum edge weight to keep.
# NOTE(review): whether the comparison inside remove_weak_links is strict is
# not visible from here -- confirm in grevia.
threshold = 5
GD = grevia.remove_weak_links(GD,threshold,weight='weight')
# Materialise the isolates before removing them: on networkx >= 2,
# nx.isolates() is a lazy generator over the graph, and removing nodes while it
# is being consumed raises "dictionary changed size during iteration".
GD.remove_nodes_from(list(nx.isolates(GD)))

Initial size: 3684444
Final size: 2053793


In [24]:
# Count the connected components of the document graph GD.
nx.number_connected_components(GD)

11

In [17]:
# Show the edges of GD ranked by their 'weight' attribute.
# NOTE(review): presumably the highest weights, as in the recorded output -- confirm in grevia.top_values.
grevia.top_values(GD,'edge','weight')

Unnamed: 0,node1,node2,weight
2539744,2984,2983,709
1419842,3475,337,692
208455,405,159,656
208974,405,158,656
2714907,159,158,656
2956067,3986,39,627
1516240,50,51,580
208594,405,1916,544
2714559,159,1916,544
2895466,1916,158,544


## Community detection

In [25]:
import community
# First compute the best partition: a dict mapping each node to its community label.
# NOTE(review): the partition is presumably not deterministic from run to run
# unless the algorithm is seeded -- TODO confirm and fix a seed if available.
clusterDic = community.best_partition(GD)
# Store the label on each node. Keyword arguments are used because the
# positional order of (name, values) was swapped between networkx 1.x and 2.x;
# the keywords work with both.
nx.set_node_attributes(GD, name='cluster', values=clusterDic)

In [31]:
# Number of distinct community labels in the partition.
community_labels = set(clusterDic.values())
nb_communities = len(community_labels)
print('Nb of cummunities: {}'.format(nb_communities))

Nb of cummunities: 25


### Exploration of the communities

In [29]:
# Load the pickled document table; later cells read its 'filename' column by integer index.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle('texts3.pkl')

In [38]:
# Group the document ids by community label in a single pass over the
# partition, instead of rescanning the whole dict once per community.
# Every label in range(nb_communities) gets an entry, possibly empty, and ids
# keep the iteration order of clusterDic.
doc_class = {c_i: [] for c_i in range(nb_communities)}
for idx, value in clusterDic.items():
    # Labels outside range(nb_communities) are ignored.
    if value in doc_class:
        doc_class[value].append(idx)

In [47]:
# Spot check: filename stored for the document with index 1078.
df.loc[1078,'filename']

'coptfont'

In [55]:
# Translate each community's document ids into filenames. The ids are cast to
# int before the lookup in df.
doc_class_name = {
    cluster_id: [df.loc[int(doc_id), 'filename'] for doc_id in doc_ids]
    for cluster_id, doc_ids in doc_class.items()
}

In [66]:
# Display the community -> filenames mapping.
doc_class_name

{0: ['coptfont',
  'tabu',
  'ntgclass',
  'gmutils',
  'pstricks-doc',
  'euenc',
  'labbook',
  'fibeamer',
  'pst-eucl-doc',
  'showexpl-test',
  'xcolor',
  'pst-news08',
  'pedigree',
  'arrayjobx',
  'Mathmode',
  'nag',
  'glossaries-code',
  'papermas-example',
  'monofill',
  'fixme',
  'demodoc',
  'selectp-doc',
  'pst-calendar-docDE',
  'simplified-intro',
  'VisuelPSTricks',
  'glossaries-irish',
  'aobs-tikz',
  'brief',
  'acro_en',
  'text2',
  'vh_set_example',
  'DE19717679C1',
  'minipage-marginpar',
  'mlist',
  'forarray',
  'a4wide',
  'testA-xetex-sol',
  'ifvtex',
  'colorwebuser',
  'asymptote',
  'nonfloat',
  'pst-soroban-doc',
  'pst-news06',
  'multienum',
  'pageslts',
  'scrjura',
  'noitcrul',
  'ulineno',
  'docgerman',
  'shadetest',
  'pst-circ-doc',
  'pgfplotsexample',
  'tikz-palattice_documentation',
  'bitset',
  'pgfmanual',
  'symbols-letter',
  'kpathsea',
  'examdoc',
  'psmatrix-docDE',
  'DE19719196A1',
  'changes.ngerman',
  'dt-sampl',
  

Le cluster 4 regroupe tous les documents en français.
On analyse maintenant le cluster 4.

In [67]:
# Work on a copy so that GD keeps all of its nodes when GDfr is filtered.
GDfr = GD.copy()

In [69]:
# Keep only the nodes labelled as cluster 4. The nodes to drop are collected
# first and removed afterwards: removing nodes while iterating over
# GDfr.nodes(data=True) raises "dictionary changed size during iteration" on
# networkx >= 2, where nodes() is a live view rather than a list.
nodes_outside_cluster = [node for node, data in GDfr.nodes(data=True) if data['cluster'] != 4]
GDfr.remove_nodes_from(nodes_outside_cluster)

In [70]:
# Number of edges left in the filtered graph.
GDfr.size()

19473

In [71]:
# Second-level partition, computed on the cluster-4 subgraph only.
# NOTE: this rebinds clusterDic, so the partition of the full graph GD is no
# longer available under that name after this cell.
clusterDic = community.best_partition(GDfr)
# Keyword arguments are used because the positional order of (name, values)
# was swapped between networkx 1.x and 2.x; the keywords work with both.
nx.set_node_attributes(GDfr, name='cluster', values=clusterDic)

In [72]:
# Number of distinct community labels in the current partition.
community_labels = set(clusterDic.values())
nb_communities = len(community_labels)
print('Nb of cummunities: {}'.format(nb_communities))

Nb of cummunities: 8


In [73]:
# Group the document ids by community label in a single pass over the
# partition, instead of rescanning the whole dict once per community.
# Every label in range(nb_communities) gets an entry, possibly empty, and ids
# keep the iteration order of clusterDic.
doc_class_fr = {c_i: [] for c_i in range(nb_communities)}
for idx, value in clusterDic.items():
    # Labels outside range(nb_communities) are ignored.
    if value in doc_class_fr:
        doc_class_fr[value].append(idx)

In [74]:
# Translate each sub-community's document ids into filenames. The ids are cast
# to int before the lookup in df.
doc_class_name_fr = {
    cluster_id: [df.loc[int(doc_id), 'filename'] for doc_id in doc_ids]
    for cluster_id, doc_ids in doc_class_fr.items()
}

In [76]:
# Display the sub-community -> filenames mapping.
doc_class_name_fr

{0: ['contrats_de_travail',
  'Epson_08102016171329',
  'page1',
  'sfr-facture-09-B516-013324644',
  'CAFpaimentsuisse',
  'TF-Avis-PrimTIP-2016-1674429086270',
  'Epson_08102016170933',
  'dossierchomageEPFL',
  'fintravaux',
  'page3',
  'facture-29542025',
  'contrat2',
  'IR-Avis-PrimTIP-2016-16740621488132',
  'JOAssociationAPEM1998',
  'carte_AVS',
  'compteversevianLahille2',
  'attestation(1)',
  'accordassignation',
  'TH-Avis-PrimTIP-2016-16740451328150',
  'duplicataCG',
  'taxefonciere012',
  'contrat2015002',
  'RIBlahille',
  'Epson_16032014190240',
  'Banquemandat',
  'facture-2241',
  'IR-Avis-1TIP-2014-14740541348065',
  'contrat1415003',
  'reponseavocat',
  'vaccinRebeccaLahille',
  'compteversevianLahille',
  'facture-31377176',
  'avisimpots2013',
  'loyernovembre001',
  'RIBRicaudThonon',
  'contrat1',
  'demandeCG2CV',
  'attestation',
  'droitchomage',
  'declarationperteCG',
  'compteMarie2',
  'fintravaux2',
  'liasse',
  'Epson_08102016171211',
  'compteimpo

### Classification using the toolbox

In [245]:
import importlib
# Reload the toolbox in the running kernel (presumably after editing its source).
# The graph_structure submodule is reloaded first, then the package itself.
importlib.reload(grevia.graph_structure)
importlib.reload(grevia)

<module 'grevia' from '/home/benjamin/Documents/eviacybernetics/Projets/Grevia/grevia/__init__.py'>

In [80]:
# Toolbox call for community detection: returns a graph and the node -> community dict.
# NOTE: this rebinds both GD and clusterDic.
GD,clusterDic = grevia.find_communities(GD)

Nb of communities found: 25


In [83]:
# `df` is the document table read with pd.read_pickle('texts3.pkl'); reload it
# here if this section is run on its own.
# Map each community of clusterDic to the filenames of its documents.
cluster_dic_name = grevia.get_filenames_in_clusters(clusterDic,df)

In [98]:
# Extract community 24 of GD as its own subgraph (the helper prints the edge and node counts).
G_sub0 = grevia.extract_cluster_as_subgraph(GD,cluster_id=24)

Nb of edges of the subgraph: 1, nb of nodes: 2


In [99]:
# Inspect the nodes of the extracted subgraph together with their attributes.
G_sub0.nodes(data=True)

[('1728', {'cluster': 24}), ('1727', {'cluster': 24})]

### Iterative community detection

In [120]:
def cluster_graph(graph_in, max_size=20):
    """Recursively split a graph into communities until the pieces are small.

    The function works on a copy of ``graph_in``. If the copy's ``size()``
    (the edge count for a networkx graph) is at most ``max_size``, or if
    community detection finds a single community, the copy is returned as a
    leaf. Otherwise each community is extracted as a subgraph and split again.

    Parameters
    ----------
    graph_in : graph
        Graph to split. It must provide ``copy()`` and ``size()`` and be
        accepted by ``grevia.find_communities``.
    max_size : int, optional
        Graphs whose ``size()`` does not exceed this value are not split any
        further. Defaults to 20, the value that used to be hard-coded.

    Returns
    -------
    list
        Flat list of the leaf graphs, in depth-first order.
    """
    graph = graph_in.copy()
    # Small enough: stop the recursion here.
    if graph.size() <= max_size:
        return [graph]

    graph, clusterDic = grevia.find_communities(graph)
    # Iterate over the labels that actually occur rather than assuming they
    # are exactly 0..k-1; sorting keeps the original order when they are.
    labels = sorted(set(clusterDic.values()))
    # A single community cannot be split further.
    if len(labels) <= 1:
        return [graph]

    subgraph_list = []
    for c_i in labels:
        G_sub = grevia.extract_cluster_as_subgraph(graph, cluster_id=c_i)
        subgraph_list.extend(cluster_graph(G_sub, max_size))
    return subgraph_list

In [132]:
# Run the toolbox implementation of the recursive clustering on GD.
# NOTE(review): 20 is presumably the same size threshold as in the notebook version above -- confirm in grevia.
subgraph_list = grevia.cluster_graph(GD,20)

Nb of communities found: 26
Nb of edges of the subgraph: 122316, nb of nodes: 754
Nb of communities found: 6
Nb of edges of the subgraph: 10919, nb of nodes: 212
Nb of communities found: 6
Nb of edges of the subgraph: 1682, nb of nodes: 73
Nb of communities found: 4
Nb of edges of the subgraph: 560, nb of nodes: 42
Nb of communities found: 3
Nb of edges of the subgraph: 133, nb of nodes: 19
Nb of edges of the subgraph: 75, nb of nodes: 14
Nb of edges of the subgraph: 19, nb of nodes: 9
Nb of edges of the subgraph: 76, nb of nodes: 16
Nb of edges of the subgraph: 10, nb of nodes: 5
Nb of edges of the subgraph: 33, nb of nodes: 10
Nb of edges of the subgraph: 1240, nb of nodes: 65
Nb of communities found: 4
Nb of edges of the subgraph: 74, nb of nodes: 16
Nb of edges of the subgraph: 88, nb of nodes: 17
Nb of edges of the subgraph: 312, nb of nodes: 27
Nb of communities found: 2
Nb of edges of the subgraph: 104, nb of nodes: 15
Nb of edges of the subgraph: 54, nb of nodes: 12
Nb of edges

### Cluster graph with hierarchy

In [246]:
# Recursive clustering that keeps the hierarchy; per the recorded repr, the result is a nested dict whose leaves are graphs.
# NOTE(review): 20 is presumably the size threshold -- confirm in grevia.
subgraph_dic = grevia.cluster_graph_with_hierarchy(GD,20)

Copying graph...
Graph copied.
Nb of communities found: 26
Nb of edges of the subgraph: 122316, nb of nodes: 754
Copying graph...
Graph copied.
Nb of communities found: 6
Nb of edges of the subgraph: 10919, nb of nodes: 212
Copying graph...
Graph copied.
Nb of communities found: 6
Nb of edges of the subgraph: 1682, nb of nodes: 73
Copying graph...
Graph copied.
Nb of communities found: 4
Nb of edges of the subgraph: 560, nb of nodes: 42
Copying graph...
Graph copied.
Nb of communities found: 3
Nb of edges of the subgraph: 133, nb of nodes: 19
Copying graph...
Graph copied.
Nb of edges of the subgraph: 75, nb of nodes: 14
Copying graph...
Graph copied.
Nb of edges of the subgraph: 19, nb of nodes: 9
Copying graph...
Graph copied.
Nb of edges of the subgraph: 76, nb of nodes: 16
Copying graph...
Graph copied.
Nb of edges of the subgraph: 10, nb of nodes: 5
Copying graph...
Graph copied.
Nb of edges of the subgraph: 33, nb of nodes: 10
Copying graph...
Graph copied.
Nb of edges of the sub

In [247]:
# Display the nested dict of subgraphs.
subgraph_dic

{0: {0: {0: {0: {0: <networkx.classes.graph.Graph at 0x7fa035629c50>,
     1: <networkx.classes.graph.Graph at 0x7fa035656cc0>,
     2: <networkx.classes.graph.Graph at 0x7fa03a87e668>},
    1: <networkx.classes.graph.Graph at 0x7fa0356293c8>,
    2: <networkx.classes.graph.Graph at 0x7fa0c027f1d0>,
    3: <networkx.classes.graph.Graph at 0x7fa032365358>},
   1: {0: <networkx.classes.graph.Graph at 0x7fa0c027f860>,
    1: <networkx.classes.graph.Graph at 0x7fa035629940>,
    2: {0: <networkx.classes.graph.Graph at 0x7fa09caa6a20>,
     1: <networkx.classes.graph.Graph at 0x7fa08d135f28>},
    3: <networkx.classes.graph.Graph at 0x7fa032750780>},
   2: {0: <networkx.classes.graph.Graph at 0x7fa0381b2198>,
    1: <networkx.classes.graph.Graph at 0x7fa0381b2710>,
    2: <networkx.classes.graph.Graph at 0x7fa0c18c0f28>},
   3: {0: <networkx.classes.graph.Graph at 0x7fa0381d2f60>,
    1: <networkx.classes.graph.Graph at 0x7fa032750fd0>,
    2: <networkx.classes.graph.Graph at 0x7fa0381d2978

In [248]:
import copy
# Deep copy, so that replacing the leaves in place (done by walk()) leaves subgraph_dic untouched.
subgraph_dic2 = copy.deepcopy(subgraph_dic)

In [249]:
def walk(node):
    """Replace every leaf of a nested dict, in place, by its filenames.

    Values that are dicts (including dict subclasses) are descended into.
    Every other value is replaced by
    ``grevia.subgraph_to_filenames(value, df)``, where ``df`` is the
    notebook-level document table.

    Parameters
    ----------
    node : dict
        Nested dict to convert. It is modified in place.

    Returns
    -------
    None
    """
    for key, item in node.items():
        if isinstance(item, dict):
            walk(item)
        else:
            # Re-assigning an existing key keeps the dict size unchanged, so
            # this is safe while iterating over node.items().
            node[key] = grevia.subgraph_to_filenames(item, df)

In [257]:
def walk_2(node,c_id):
    """Flatten a nested dict into ``[id, leaf, id, leaf, ...]``.

    Each leaf gets a dotted identifier built from ``c_id`` followed by the
    keys on the path to it, e.g. ``'0.1.2'``. Values that are dicts
    (including dict subclasses) are descended into; every other value is
    treated as a leaf. The identifier of each leaf is also printed.

    Parameters
    ----------
    node : dict
        Nested dict to flatten.
    c_id : object
        Prefix of the identifiers; it is converted with ``str()``.

    Returns
    -------
    list
        Flat list alternating identifier strings and leaf values, in
        depth-first order.
    """
    list1 = []
    for key, item in node.items():
        c_id2 = str(c_id)+'.'+str(key)
        if isinstance(item, dict):
            list1.extend(walk_2(item,c_id2))
        else:
            print(c_id2)
            list1.extend([c_id2,item])
    return list1

In [251]:
# Convert the copy in place: each leaf is replaced by the result of grevia.subgraph_to_filenames.
walk(subgraph_dic2)

In [258]:
# Flatten the hierarchy into [id, names, id, names, ...], with 0 as the root prefix of the dotted ids.
list2 = walk_2(subgraph_dic2,0)

0.0.0.0.0.0
0.0.0.0.0.1
0.0.0.0.0.2
0.0.0.0.1
0.0.0.0.2
0.0.0.0.3
0.0.0.1.0
0.0.0.1.1
0.0.0.1.2.0
0.0.0.1.2.1
0.0.0.1.3
0.0.0.2.0
0.0.0.2.1
0.0.0.2.2
0.0.0.3.0
0.0.0.3.1
0.0.0.3.2
0.0.0.3.3
0.0.0.3.4
0.0.0.4
0.0.0.5
0.0.1.0
0.0.1.1
0.0.1.2.0
0.0.1.2.1
0.0.1.2.2
0.0.1.3.0.0
0.0.1.3.0.1
0.0.1.3.0.2
0.0.1.3.1.0
0.0.1.3.1.1
0.0.1.3.1.2
0.0.2.0.0.0
0.0.2.0.0.1
0.0.2.0.1.0
0.0.2.0.1.1
0.0.2.0.1.2
0.0.2.0.1.3
0.0.2.0.2.0
0.0.2.0.2.1
0.0.2.1.0.0
0.0.2.1.0.1
0.0.2.1.0.2
0.0.2.1.1.0
0.0.2.1.1.1.0
0.0.2.1.1.1.1
0.0.2.1.1.1.2
0.0.2.1.2
0.0.2.1.3
0.0.2.2.0.0
0.0.2.2.0.1
0.0.2.2.0.2
0.0.2.2.1
0.0.2.2.2.0
0.0.2.2.2.1
0.0.2.2.2.2
0.0.2.3.0
0.0.2.3.1
0.0.2.3.2
0.0.2.3.3
0.0.2.3.4
0.0.2.4
0.0.3.0
0.0.3.1.0
0.0.3.1.1
0.0.3.1.2
0.0.3.2
0.0.3.3
0.0.3.4
0.0.3.5
0.0.4
0.0.5.0
0.0.5.1
0.0.5.2
0.0.5.3
0.1.0.0.0
0.1.0.0.1.0
0.1.0.0.1.1
0.1.0.0.2.0
0.1.0.0.2.1
0.1.0.0.2.2
0.1.0.0.3.0
0.1.0.0.3.1
0.1.0.0.3.2
0.1.0.0.4
0.1.0.1.0
0.1.0.1.1.0
0.1.0.1.1.1
0.1.0.1.2.0
0.1.0.1.2.1
0.1.0.1.2.2
0.1.0.1.2.3
0.1.0.1.3
0.1.

In [259]:
# Display the flattened list of ids and filename lists.
list2

['0.0.0.0.0.0',
 ['coptfont',
  'esami-doc-it',
  'isoman',
  'asymptote',
  'marvosym-doc',
  'subfig',
  'minitoc',
  'pstricks-doc',
  'babel',
  'NamedGraphs',
  'latexcourse-rug',
  'qpdf-manual',
  'simplified-intro',
  'fontinstallationguide',
  'forest',
  'pst-optic-doc',
  'mandi',
  'hyperref',
  'pst-solides3d-doc'],
 '0.0.0.0.0.1',
 ['lstdrvrs',
  'fontspec',
  'mathsPICmanual',
  'interfaces',
  'gentle',
  'datatool-user',
  'Vienna-Airport-line-timetable',
  'artdoc',
  'visualFAQ',
  'mtc-cri',
  'upmethodology-doc',
  'AroundTheBend',
  'sampleEqPg',
  'amsldoc'],
 '0.0.0.0.0.2',
 ['cartegrise406014',
  'pgfplotsexample',
  'pgfplotsexample-plain',
  'tkz-fct-screen',
  'pandas',
  'networkx_reference',
  'nestlemeetingMarch',
  'imm6614',
  'tcolorbox'],
 '0.0.0.0.1',
 ['changes.english.withcode',
  'test-pst',
  'examdoc',
  'blog',
  'deliverable_template',
  'changes.english',
  'menukeys',
  'texlinks',
  'skb',
  'rhodocyb',
  'ifnextok',
  'PSSsigned001',
  'hp

In [206]:
# Attempt to tabulate the hierarchy: one DataFrame per top-level value, concatenated under the top-level keys.
# NOTE(review): in the recorded output the deeper levels stay as nested dicts inside the cells, so this does not flatten the tree.
pd.concat(map(pd.DataFrame, subgraph_dic2.values()), keys=subgraph_dic2.keys()).stack().unstack(0)

Unnamed: 0,Unnamed: 1,0
0,0,{0: {0: {0: {0: {0: {0: {0: {0: {0: ['coptfont...
1,1,"{0: {0: {0: {0: {0: {0: {0: ['sample-article',..."
2,2,"{0: {0: {0: ['beamerugouterthemeshadow', 'beam..."
3,3,"{0: {0: {0: {0: {0: ['US4296405A', 'US4339974A..."
4,4,{0: {0: {0: {0: {0: {0: {0: ['RIBRicaudThonon'...
5,5,"{0: {0: {0: ['juillet16', 'juin15', 'octobre15..."
6,6,"{0: ['custcol_4', 'cmby_2', 'align', 'custcol_..."
7,7,"{0: ['lecturepresentation', 'lecturehandout']}"
8,8,"{0: ['EventCollection_plot__set_linewidth', 'E..."
9,9,"{0: ['sample-pressrelease4', 'sample-pressrele..."


In [210]:
# NOTE(review): this attempt raised KeyError: '0' in the recorded run -- TODO fix or remove this cell.
pd.io.json.json_normalize(subgraph_dic2)

KeyError: '0'

In [None]:
# NOTE(review): `user_dict` is not defined anywhere in the visible notebook and
# this cell has no execution count, so it would raise NameError as written.
# Presumably `subgraph_dic2` was intended -- TODO confirm or delete this cell.
# The expression flattens only the first two levels of a nested dict into
# (i, j)-keyed rows.
pd.DataFrame.from_dict({(i,j): user_dict[i][j] 
                           for i in user_dict.keys() 
                           for j in user_dict[i].keys()},
                       orient='index')


End of hierarchical clustering

In [179]:
# Print summary statistics for the list of subgraphs (count, mean/min/max size in the recorded output).
grevia.clusters_info(subgraph_list)

Nb of communities: 321
Community mean size: 10.10, min size: 2, max size: 47


In [134]:
# Sanity check of the recursive split: the subgraphs together must contain
# exactly as many nodes as the graph they were cut from.
nb_total_nodes = sum(len(graph.nodes()) for graph in subgraph_list)
print('Nb of nodes in the graph {}, nb of nodes in the subgraphs {}.'.format(len(GD.nodes()),nb_total_nodes))

Nb of nodes in the graph 3242, nb of nodes in the subgraphs 3242.


In [160]:
# One list per subgraph: the filenames of its nodes, followed by the subgraph's
# density as a trailing (numeric) element.
cluster_name_list = [
    [df.loc[int(node), 'filename'] for node in graph] + [nx.density(graph)]
    for graph in subgraph_list
]

In [166]:
# Build the per-cluster filename lists with the toolbox helper.
# NOTE: this rebinds cluster_name_list.
cluster_name_list = grevia.subgraphs_to_filenames(subgraph_list,df)

In [167]:
# Display the per-cluster filename lists.
cluster_name_list

[['coptfont',
  'esami-doc-it',
  'isoman',
  'fontinstallationguide',
  'simplified-intro',
  'pst-solides3d-doc',
  'marvosym-doc',
  'subfig',
  'asymptote',
  'babel',
  'latexcourse-rug',
  'qpdf-manual',
  'pstricks-doc',
  'hyperref',
  'forest',
  'pst-optic-doc',
  'mandi',
  'NamedGraphs',
  'minitoc'],
 ['lstdrvrs',
  'fontspec',
  'mathsPICmanual',
  'interfaces',
  'gentle',
  'datatool-user',
  'Vienna-Airport-line-timetable',
  'artdoc',
  'visualFAQ',
  'mtc-cri',
  'upmethodology-doc',
  'AroundTheBend',
  'sampleEqPg',
  'amsldoc'],
 ['tcolorbox',
  'pgfplotsexample-plain',
  'tkz-fct-screen',
  'pgfplotsexample',
  'pandas',
  'networkx_reference',
  'nestlemeetingMarch',
  'imm6614',
  'cartegrise406014'],
 ['changes.english.withcode',
  'test-pst',
  'examdoc',
  'blog',
  'deliverable_template',
  'changes.english',
  'menukeys',
  'skb',
  'ifnextok',
  'rhodocyb',
  'changes.ngerman',
  'PSSsigned001',
  'hpsdiss',
  'pythontex',
  'texlinks',
  'cntperchap_exam

In [145]:
import numpy as np
# The cluster sizes are computed once and reused for the three statistics.
cluster_sizes = [len(flist) for flist in cluster_name_list]
print('Nb of clusters:',len(cluster_name_list))
print('Cluster mean size:',np.mean(cluster_sizes))
print('Max size:',np.max(cluster_sizes))
print('Min size:',np.min(cluster_sizes))

Nb of clusters: 321
Cluster mean size: 10.0996884735
Max size: 47
Min size: 2


In [171]:
# Print every cluster that contains at least one filename with 'juin' in it.
# any() prints a matching cluster once, instead of once per matching filename.
# Non-string entries are skipped so that the substring test cannot raise
# TypeError.
for listx in cluster_name_list:
    if any(isinstance(item, str) and 'juin' in item for item in listx):
        print(listx)

['tuner', 'tp3', 'propagationentremedias', 'xpl-fr', 'verb', 'echographie', 'tp2', 'tp1', 'TSIG2', 'AntillesESjuin2006', 'node-js-livre-debutant']
['juillet16', 'juin15', 'janvier16', 'mai15', 'fevrier16', 'avril15', 'aout15', 'septembre15', 'juin16', 'juillet15', 'octobre15']
['juillet16', 'juin15', 'janvier16', 'mai15', 'fevrier16', 'avril15', 'aout15', 'septembre15', 'juin16', 'juillet15', 'octobre15']
['juin14', 'avril14', 'janvier14', 'decembre14', 'novembre14', 'juillet14', 'Form', 'aout14', 'mars14', 'salaire0414', 'fevrier14', 'mai14', 'octobre14', 'septembre14']


### Save as a table

In [163]:
# One column per cluster, padded with NaN down to the longest cluster.
# The per-cluster frames are built first and concatenated once; growing the
# table with pd.concat inside the loop copies it on every iteration.
cluster_columns = [pd.DataFrame(name_list) for name_list in cluster_name_list]
if cluster_columns:
    clusters_table = pd.concat(cluster_columns, ignore_index=True, axis=1)
else:
    # pd.concat refuses an empty list; keep the empty table the loop would have produced.
    clusters_table = pd.DataFrame()

In [164]:
# Display the table (one column per cluster).
clusters_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,311,312,313,314,315,316,317,318,319,320
0,coptfont,lstdrvrs,tcolorbox,changes.english.withcode,tlbuild,classes,datetime2-sample-babel,rusnat-ex2-ru,zref,dvips,...,fig_atom_mod2,cal2016-2033_DK,cm-crop,tstlmts1,pgf_pdflatex,transparent-init,EventCollection_plot__extend_positions,scr,fig_sensor_heat_kernel2,EventCollection_plot__set_orientation
1,esami-doc-it,fontspec,pgfplotsexample-plain,test-pst,kpathsea,refman,etextools-examples,rusnat-ex1-ru,fixme,ffuserguide,...,fig_sensor_high_kernel,cal2016-2033_EN,pamath-crop,tstlmqx,pgf_xelatex,example_2,EventCollection_plot__set_positions,idc,fig_sensor_heat_kernel,EventCollection_plot__switch_orientation
2,isoman,mathsPICmanual,tkz-fct-screen,examdoc,latex2e,brief,doc_aq,mtc-ocf,minutes,glossaries-user,...,fig_sensor_high_kernel2,cal2016-2033_DE,cmbright-crop,tstlmot1,1,1,1,1,1,1
3,fontinstallationguide,interfaces,pgfplotsexample,blog,fftw3,letter,corrige,rusnat-doc-ru,uml,dt-sampl,...,1,1,pazo-crop,tstlmot4,,,,,,
4,simplified-intro,gentle,pandas,deliverable_template,web2c,chletter,datetime2-sample-hyperref,altverse-ex1-ru,colorwebfull,NumericPlots,...,,,lm-crop,tstlmt1,,,,,,
5,pst-solides3d-doc,datatool-user,networkx_reference,changes.english,1,a4wide,TeXbyTopic,pgf-umlcd-manual,matrixcookbook,circuitikzmanual,...,,,1,1,,,,,,
6,marvosym-doc,Vienna-Airport-line-timetable,nestlemeetingMarch,menukeys,,isoe,fsps,mtc-bk,latex4wp,glossaries-extra-code,...,,,,,,,,,,
7,subfig,artdoc,imm6614,skb,,ntgclass,Mathmode,assoccnt_example,texshade,datatool-code,...,,,,,,,,,,
8,asymptote,visualFAQ,cartegrise406014,ifnextok,,stepe,Malva,makecell-rus,filedate,manDCPiCpt,...,,,,,,,,,,
9,babel,mtc-cri,0.527778,rhodocyb,,guit,phil,tablists-rus,luatexref-t,sampleEq,...,,,,,,,,,,


In [165]:
# Write the table to 'cluster_table.csv' (path relative to the working directory).
clusters_table.to_csv('cluster_table.csv')

In [170]:
# Toolbox helper for the same step; per the recorded output it saves to 'cluster_table.csv'.
# NOTE: this rebinds clusters_table.
clusters_table = grevia.output_filename_classification(cluster_name_list,'cluster_table.csv')

Save to file cluster_table.csv
