In [1]:
%matplotlib
import logging
from importlib import reload
import pylab as pl
import numpy as np
from utils import scatter

Using matplotlib backend: Qt5Agg


In [2]:
from pickle import load, dump
import os
import bz2

def parseMails():
    from tokenizer import iterTokenizedSentences
    from emailparser import mailLoaderGen
    
    mails = []
    for mail in mailLoaderGen():
        mails.append(mail)
        mail.sents = list(iterTokenizedSentences(mail.description))
    return mails

dbfile = "tokmail.pk.bz2"
if os.path.exists(dbfile):
    with bz2.open(dbfile, "rb") as f:
        mails = load(f)
else:        
    mails = parseMails()
    with bz2.open(dbfile, "wb") as f:
        dump(mails, f)

In [3]:
from Stemmer import Stemmer
from collections import Counter

# Premier compatage du vocabulaire
vocabCount = Counter(w for mail in mails 
                       for sent in mail.sents
                       for w in sent)

# Création du stemmer avec ce vocabulaire
stem = Stemmer({w for w,count in vocabCount.items() if count >= 3})

# Applique le stemmer sur l'ensemble des phrases:
for mail in mails:
    mail.words = [w for s in mail.sents for w in s]
    mail.stemSents = [[stem.stem(w) for w in s] for s in mail.sents]
    mail.stemWords = [w for s in mail.stemSents for w in s]
    
# Second comptage du compatage du vocabulaire
vocabCount = Counter(w for mail in mails for w in mail.words)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.manifold import TSNE
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, AffinityPropagation, DBSCAN

tf = TfidfVectorizer(analyzer=lambda mail: mail.stemWords, min_df=3, max_df=0.7)
svd = TruncatedSVD(n_components=500)
lsa = make_pipeline(tf, svd)

X = lsa.fit_transform(mails)

Y = TSNE().fit_transform(X)

In [5]:
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=20)
km.fit(X)
pl.scatter(Y.T[0], Y.T[1], c =km.labels_);

In [6]:
mailLogLength = np.log([len(mail.words) for mail in mails])
sujets = [mail.sujet for mail in mails]
scatter(Y, sujets, km.labels_, 0.1*mailLogLength)

In [18]:
for cl in range(km.labels_.max()+1):
    ct = Counter()
    n = 0
    for i in np.where(km.labels_==cl)[0]:
        ct.update(mails[i].tags)
        n += 1
    print(cl, n, ct)
        

0 98 Counter({'CDD': 7, 'CDD Ingénieur': 4, 'Post-doc / IR': 3, 'Emploi': 1, "CMSR@ECCB'14": 1, 'Fwd:  Formation ENVOL 2014 : Tests et Validation': 1, 'MdC': 1, 'PostDoc': 1, 'CDI': 1})
1 69 Counter({'CDD': 19, 'Post-doc / IR': 19, 'Thèse': 7, 'Stage': 3, 'Gtpb': 1})
2 120 Counter({'CDD': 56, 'CDD Ingénieur': 37, 'CDI': 24, 'CDI autre': 15, 'CDD autre': 10, 'Post-doc / IR': 8, 'Stage': 5, 'IE': 4, 'PR': 3, 'IR': 2, 'Emploi': 2})
3 4 Counter()
4 6 Counter({'CDI': 1, 'CDI autre': 1})
5 185 Counter({'CDD': 80, 'Post-doc / IR': 67, 'Thèse': 9, 'CDD Ingénieur': 8, 'CDD autre': 5, 'CDI autre': 5, 'CDI': 5, 'Stage': 4, 'Emploi': 2, 'Call for Participations': 1, 'CDD/Ingénieur': 1})
6 11 Counter({'CDI': 3, 'MdC': 3})
7 3 Counter()
8 46 Counter({'Stage': 7, 'Thèse': 6})
9 71 Counter({"CMSR@ECCB'14": 1, "ECCB'14": 1})
10 26 Counter({'CDI': 8, 'CDD': 5, 'Stage': 4, 'IR': 3, 'CDD Ingénieur': 3, 'CDI autre': 3, 'PR': 1, 'Post-doc / IR': 1, 'CDD autre': 1, 'IE': 1, 'Thèse': 1})
11 122 Counter({'Post

In [19]:
 for i in np.where(km.labels_==9)[0]:
        print(i)
        print(mails[i].sujet)

7
3rd call for peer-reviewed short papers, aSSB'15 Thematic school "Advances in Systems & Synthetic Biology"
30
AlCoB 2015: 2nd call for papers
39
CFP: The International Technology Management Conference (ITMC2015)
46
Deadline December 19 for peer-reviewed short papers, aSSB'15 Thematic school "Advances in Systems & Synthetic Biology"
55
Computational Methods in Systems Biology 2015, Nantes, September 16-18 2015
57
Call for papers - MLJ Special issue on Dynamic Networks & Knowledge Discovery
84
AlCoB 2015: 1st call for papers
89
CFP: IJCNN 2015 Special Session on "Multiple Clusterings"
151
international conference 'Perspectives in Environmental and Systems Biology' (Grenoble, France, April 13-15, 2015) - First announcement
153
CFP 8th Workshop on  Biomedical and Bioinformatics Challenges for Computer,Science (BBC 2015)
167
Announcement: FMMB 2014
193
2nd CFP  - Deadline extension - Special issue of BMC Bioinformatics on BioNLP Shared Task 2013
199
ESWC 2014 Final Call for PhD Symposium


In [9]:
from gensim.models import Word2Vec
from numpy.random import shuffle
import os

sents = [sent for mail in mails for sent in mail.stemSents]

m = Word2Vec(min_count=10, workers = 4)
m.build_vocab(sents)

for i in range(100):
    if i%5 == 0:
        print(i, ' ', end='')

    shuffle(sents)
    m.train(sents)

m.init_sims(replace=True)

05101520253035404550556065707580859095

In [10]:
wordSNE = TSNE().fit_transform(m.syn0norm[:1000])

In [15]:
wordLabels = AffinityPropagation().fit_predict(m.syn0norm[:1000])

In [16]:
freqs = Counter(w for sent in sents for w in sent)
wordsLogFreqs = np.array([np.log(freqs[w]) for w in m.index2word[:1000]])/np.log(max(freqs.values()))

scatter(wordSNE, m.index2word, wordLabels, wordsLogFreqs)

In [17]:
words = np.array(m.index2word)
for l in range(max(wordLabels)):
    lwords = words[wordLabels == l]
    print("cluster#{} (len:{})".format(l, len(lwords)), ' '.join(lwords))

cluster#0 (len:18) de ' et le la un à en du pour dans sur par grand sous dédié atelier actuellement
cluster#1 (len:12) and of in for with or including between database level provide technical
cluster#2 (len:5) d l L s aide
cluster#3 (len:12) data analysis genomic process integration dataset high-throughput control resource quality proteomic visualization
cluster#4 (len:5) at [ ] fr com
cluster#5 (len:14) vous votre présent merci faire si intéresser cordialement êtes entrepris inviter nombreux avez chargé
cluster#6 (len:12) connaissance technique système bon maîtrise anglais pratique général solid ingénierie serait apprentissage
cluster#7 (len:11) équipe projet développement travail cadre programme objectif context activité appel thématique
cluster#8 (len:14) au est sont être tout seront ouvert notamment disponible tous adresse ayant particulièrement lié
cluster#9 (len:17) outil méthode utiliser logiciel solution adapter visualiser permettant nécessaire pipeline nouveau améliorer implém