https://openclassrooms.com/fr/courses/4470541-analysez-vos-donnees-textuelles/4854971-nettoyez-et-normalisez-les-donnees

<div style="display: flex; background-color: Blue; padding: 15px;" >

# Entraînez-vous à classifier du texte
</div>

<div style="display: flex; background-color: Green; padding: 7px;" >

### Names : nltk.NaiveBayesClassifier
</div>

https://www.nltk.org/book/ch06.html

In [43]:
import nltk
from nltk.corpus import names
import random

In [46]:
def gender_features(word):
    """Return a one-entry feature dict holding the final character of *word*."""
    final_char = word[-1]
    return dict(last_letter=final_char)

In [61]:
gender_features('Shrek')

{'last_letter': 'k'}

In [45]:
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.shuffle(labeled_names)

In [48]:
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [50]:
print(classifier.classify(gender_features('Neo')))
print(classifier.classify(gender_features('Trinity')))
print(nltk.classify.accuracy(classifier, test_set))

male
female
0.754


In [51]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     33.4 : 1.0
             last_letter = 'k'              male : female =     32.4 : 1.0
             last_letter = 'p'              male : female =     17.4 : 1.0
             last_letter = 'f'              male : female =     15.8 : 1.0
             last_letter = 'd'              male : female =     10.5 : 1.0


In [53]:
print(classifier.classify(gender_features('Yann')))

male


<div style="display: flex; background-color: indigo;" >

### apply_features
</div>

When working with large corpora, constructing a single list that contains the features of every instance can use up a large amount of memory. In these cases, use the function `nltk.classify.apply_features`, which returns an object that acts like a list but does not store all the feature sets in memory:

In [54]:
from nltk.classify import apply_features

In [55]:
train_set = apply_features(gender_features, labeled_names[500:])
test_set = apply_features(gender_features, labeled_names[:500])

In [56]:
print(train_set)

[({'last_letter': 'e'}, 'female'), ({'last_letter': 'b'}, 'male'), ...]


<div style="display: flex; background-color: indigo;" >

### Choosing The Right Features
</div>

In [None]:
def gender_features2(name):
    """Build a rich letter-based feature dict for *name*.

    The result contains the lower-cased first and last characters, plus,
    for every ASCII letter, how many times it occurs (``count(x)``) and
    whether it occurs at all (``has(x)``) in the lower-cased name.

    Raises:
        IndexError: if *name* is empty.
    """
    # Lower-case once; the original recomputed this 52 times per name.
    lowered = name.lower()
    features = {
        "first_letter": name[0].lower(),
        "last_letter": name[-1].lower(),
    }
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features

In [59]:
print(gender_features2('John'))

{'first_letter': 'j', 'last_letter': 'n', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 1, 'has(j)': True, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [62]:
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.748


In [63]:
train_names = labeled_names[1500:]
devtest_names = labeled_names[500:1500]
test_names = labeled_names[:500]

In [66]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
test_set = [(gender_features(n), gender) for (n, gender) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.746


In [67]:
errors = []
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [71]:
for (tag, guess, name) in sorted(errors):
    print('correct={:<8} guess={:<8s} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Abigail                       
correct=female   guess=male     name=Aigneis                       
correct=female   guess=male     name=Alex                          
correct=female   guess=male     name=Alexis                        
correct=female   guess=male     name=Alleen                        
correct=female   guess=male     name=Allyn                         
correct=female   guess=male     name=Amabel                        
correct=female   guess=male     name=Amber                         
correct=female   guess=male     name=Anne-Mar                      
correct=female   guess=male     name=April                         
correct=female   guess=male     name=Ardeen                        
correct=female   guess=male     name=Arleen                        
correct=female   guess=male     name=Aryn                          
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Beatriz    

In [72]:
def gender_features(word):
    """Return the last one and last two characters of *word* as features."""
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    return features

In [73]:
train_set = [(gender_features(n), gender) for (n, gender) in train_names]
devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.77


<div style="display: flex; background-color: indigo;" >

### Document Classification
</div>

In [74]:
from nltk.corpus import movie_reviews

In [77]:
documents = [(list(movie_reviews.words(fileid)), category)
                    for category in movie_reviews.categories()
                    for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [78]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    """Map each entry of the module-level ``word_features`` to a presence flag.

    Returns a dict keyed ``'contains(<word>)'`` whose value tells whether
    that word occurs in *document* (an iterable of tokens).
    """
    present = set(document)
    return {
        'contains({})'.format(candidate): (candidate in present)
        for candidate in word_features
    }

In [81]:
document_features(movie_reviews.words('pos/cv957_8737.txt'))

{'contains(,)': True,
 'contains(the)': True,
 'contains(.)': True,
 'contains(a)': True,
 'contains(and)': True,
 'contains(of)': True,
 'contains(to)': True,
 "contains(')": True,
 'contains(is)': True,
 'contains(in)': True,
 'contains(s)': True,
 'contains(")': True,
 'contains(it)': True,
 'contains(that)': True,
 'contains(-)': True,
 'contains())': True,
 'contains(()': True,
 'contains(as)': True,
 'contains(with)': True,
 'contains(for)': True,
 'contains(his)': True,
 'contains(this)': True,
 'contains(film)': False,
 'contains(i)': False,
 'contains(he)': True,
 'contains(but)': True,
 'contains(on)': True,
 'contains(are)': True,
 'contains(t)': False,
 'contains(by)': True,
 'contains(be)': True,
 'contains(one)': True,
 'contains(movie)': True,
 'contains(an)': True,
 'contains(who)': True,
 'contains(not)': True,
 'contains(you)': True,
 'contains(from)': True,
 'contains(at)': False,
 'contains(was)': False,
 'contains(have)': True,
 'contains(they)': True,
 'contains(h

In [80]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [96]:
train_set

[({'endswith(e)': False,
   'endswith(,)': False,
   'endswith(.)': False,
   'endswith(s)': False,
   'endswith(d)': False,
   'endswith(t)': False,
   'endswith(he)': False,
   'endswith(n)': False,
   'endswith(a)': False,
   'endswith(of)': False,
   'endswith(the)': False,
   'endswith(y)': False,
   'endswith(r)': True,
   'endswith(to)': False,
   'endswith(in)': False,
   'endswith(f)': False,
   'endswith(o)': False,
   'endswith(ed)': False,
   'endswith(nd)': False,
   'endswith(is)': False,
   'endswith(on)': False,
   'endswith(l)': False,
   'endswith(g)': False,
   'endswith(and)': False,
   'endswith(ng)': False,
   'endswith(er)': False,
   'endswith(as)': False,
   'endswith(ing)': False,
   'endswith(h)': False,
   'endswith(at)': False,
   'endswith(es)': False,
   'endswith(or)': False,
   'endswith(re)': False,
   'endswith(it)': False,
   'endswith(``)': False,
   'endswith(an)': False,
   "endswith('')": False,
   'endswith(m)': False,
   'endswith(;)': False,
 

In [83]:
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features()

0.77
Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.3 : 1.0
         contains(mulan) = True              pos : neg    =      9.2 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
         contains(damon) = True              pos : neg    =      7.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.7 : 1.0
          contains(lame) = True              neg : pos    =      6.6 : 1.0
         contains(flynt) = True              pos : neg    =      5.8 : 1.0
         contains(awful) = True              neg : pos    =      5.5 : 1.0
         contains(waste) = True              neg : pos    =      5.3 : 1.0
        contains(poorly) = True              neg : pos    =      5.2 : 1.0


<div style="display: flex; background-color: indigo;" >

### Part-of-Speech Tagging
</div>

In [85]:
from nltk.corpus import brown

In [86]:
suffix_fdist = nltk.FreqDist()
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [87]:
common_suffixes = [suffix for (suffix, count) in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [88]:
def pos_features(word):
    """Return suffix-presence features for *word*.

    For every entry of the module-level ``common_suffixes`` the result holds
    an ``'endswith(<suffix>)'`` key telling whether the lower-cased word
    ends with that suffix.
    """
    # Lower-case once rather than once per suffix: this function is called
    # for every tagged word, so the repeated work adds up.
    lowered = word.lower()
    return {
        'endswith({})'.format(suffix): lowered.endswith(suffix)
        for suffix in common_suffixes
    }

In [89]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

In [95]:
featuresets

[({'endswith(e)': True,
   'endswith(,)': False,
   'endswith(.)': False,
   'endswith(s)': False,
   'endswith(d)': False,
   'endswith(t)': False,
   'endswith(he)': True,
   'endswith(n)': False,
   'endswith(a)': False,
   'endswith(of)': False,
   'endswith(the)': True,
   'endswith(y)': False,
   'endswith(r)': False,
   'endswith(to)': False,
   'endswith(in)': False,
   'endswith(f)': False,
   'endswith(o)': False,
   'endswith(ed)': False,
   'endswith(nd)': False,
   'endswith(is)': False,
   'endswith(on)': False,
   'endswith(l)': False,
   'endswith(g)': False,
   'endswith(and)': False,
   'endswith(ng)': False,
   'endswith(er)': False,
   'endswith(as)': False,
   'endswith(ing)': False,
   'endswith(h)': False,
   'endswith(at)': False,
   'endswith(es)': False,
   'endswith(or)': False,
   'endswith(re)': False,
   'endswith(it)': False,
   'endswith(``)': False,
   'endswith(an)': False,
   "endswith('')": False,
   'endswith(m)': False,
   'endswith(;)': False,
   

In [90]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [91]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [92]:
classifier.classify(pos_features('cats'))

'NNS'

In [94]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



<div style="display: flex; background-color: Green; padding: 7px;" >

### RCV1 : nltk.NaiveBayesClassifier
</div>

- [ ] Charger les données
- [ ] Créer différents classifieurs (au moins 3)
- [ ] Effectuer une validation croisée sur les différents classifieurs
- [ ] Afficher les différentes performances

In [22]:
from sklearn.datasets import fetch_rcv1
from nltk.classify import NaiveBayesClassifier, DecisionTreeClassifier, rte_classifier, accuracy

In [19]:
train = fetch_rcv1(subset='train')
test = fetch_rcv1(subset='test')

In [25]:
X_train = train.data
X_test = test.data

y_train = train.target
y_test = test.target

X_panel = X_train[0:5000]
y_panel = y_train[0:5000]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(X_panel.shape, y_panel.shape)

(23149, 47236) (23149, 103)
(781265, 47236) (781265, 103)
(5000, 47236) (5000, 103)


In [38]:
print(train.target_names[:3].tolist())
print(test.target_names[:3].tolist())

['C11', 'C12', 'C13']
['C11', 'C12', 'C13']


In [33]:
type(y_panel)

scipy.sparse._csr.csr_matrix

In [35]:
y_panel[0:5,:]

<5x103 sparse matrix of type '<class 'numpy.uint8'>'
	with 20 stored elements in Compressed Sparse Row format>

In [32]:
type(X_panel)

scipy.sparse._csr.csr_matrix

In [27]:
train_data = []  # type: list
train_data.extend(zip(X_panel, y_panel))

In [None]:
# featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]
# train_set, test_set = featuresets[500:], featuresets[:500]

In [30]:
classifier = NaiveBayesClassifier.train(train_data)

TypeError: unhashable type: 'csr_matrix'

In [None]:
classifier.show_most_informative_features(n=25)
print(accuracy(classifier, test))

<div style="display: flex; background-color: Blue; padding: 15px;" >

# Exercices du cours
</div>

In [1]:
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 20

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

In [4]:
# Créer le modèle LDA
lda = LatentDirichletAllocation(
        n_components=n_topics, 
        max_iter=5, 
        learning_method='online', 
        learning_offset=50.,
        random_state=0)

# Fitter sur les données
lda.fit(tf)

LatentDirichletAllocation(learning_method='online', learning_offset=50.0,
                          max_iter=5, n_components=20, random_state=0)

In [5]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic of *model* followed by its top-weighted feature names.

    For every row of ``model.components_`` a ``Topic <i>:`` header is
    printed, then the ``no_top_words`` feature names with the largest
    weights, heaviest first, separated by spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        header = "Topic {}:".format(topic_idx)
        print(header)
        # argsort is ascending, so walk it backwards from the end.
        ranked = weights.argsort()[:-no_top_words - 1:-1]
        print(" ".join(feature_names[pos] for pos in ranked))

In [10]:
no_top_words = 10
tf_feature_names = tf_vectorizer.get_feature_names_out()
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
people gun state control right guns crime states law police
Topic 1:
time question book years did like don space answer just
Topic 2:
mr line rules science stephanopoulos title current define int yes
Topic 3:
key chip keys clipper encryption number des algorithm use bit
Topic 4:
edu com cs vs w7 cx mail uk 17 send
Topic 5:
use does window problem way used point different case value
Topic 6:
windows thanks know help db does dos problem like using
Topic 7:
bike water effect road design media dod paper like turn
Topic 8:
don just like think know people good ve going say
Topic 9:
car new price good power used air sale offer ground
Topic 10:
file available program edu ftp information files use image version
Topic 11:
ax max b8f g9v a86 145 pl 1d9 0t 34u
Topic 12:
government law privacy security legal encryption court fbi technology information
Topic 13:
card bit memory output video color data mode monitor 16
Topic 14:
drive scsi disk mac hard apple drives controller software port
T

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() is no longer available in recent scikit-learn;
# use the same accessor as the LDA cell (get_feature_names_out).
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

no_topics = 20

# Run NMF
# NOTE(review): `alpha` was deprecated in scikit-learn 1.0 in favour of
# `alpha_W` / `alpha_H` -- confirm against the installed version. The
# replacements appear to be scaled differently, so the value may need
# retuning rather than a plain rename.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd')
nmf.fit(tfidf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)




Topic 0:
people time right did good said say make way government
Topic 1:
window problem using server application screen display motif manager running
Topic 2:
god jesus bible christ faith believe christian christians sin church
Topic 3:
game team year games season players play hockey win league
Topic 4:
new 00 sale 10 price offer shipping condition 20 15
Topic 5:
thanks mail advance hi looking info help information address appreciated
Topic 6:
windows file files dos program version ftp ms directory running
Topic 7:
edu soon cs university ftp internet article email pub david
Topic 8:
key chip clipper encryption keys escrow government public algorithm nsa
Topic 9:
drive scsi drives hard disk ide floppy controller cd mac
Topic 10:
just ll thought tell oh little fine work wanted mean
Topic 11:
does know anybody mean work say doesn help exist program
Topic 12:
card video monitor cards drivers bus vga driver color memory
Topic 13:
like sounds looks look bike sound lot things really thing
To

<div style="display: flex; background-color: Blue; padding: 15px;" >

# NaiveBayesClassifier
</div>

In [None]:
import nltk
import os
from tools import ap

def format_sentence(sent):
    """Turn a sentence into a bag-of-words feature dict.

    Args:
        sent: the sentence, either UTF-8 encoded ``bytes`` or already
            decoded text.

    Returns:
        A dict mapping every token produced by ``nltk.word_tokenize`` to
        ``True``.
    """
    # Only bytes need decoding; a Python 3 str has no .decode() method.
    if isinstance(sent, bytes):
        sent = sent.decode('utf-8')
    return {word: True for word in nltk.word_tokenize(sent)}


def load_training_set():
    """Load the labelled IMDB training reviews as classifier input.

    Every line of every file under ``aclImdb/train/pos`` and then
    ``aclImdb/train/neg`` (resolved through ``ap``) is passed to
    ``format_sentence``.

    Returns:
        A list of ``[features, label]`` pairs, where *label* is ``'pos'``
        or ``'neg'``; all positive examples come first.
    """
    training = []

    for label in ('pos', 'neg'):
        directory = ap('aclImdb/train/{}'.format(label))
        for filename in os.listdir(directory):
            example = '{}/{}'.format(directory, filename)
            # Read raw bytes so that UTF-8 decoding happens explicitly in
            # format_sentence instead of depending on the platform's
            # default text encoding.
            with open(example, 'rb') as review:
                for line in review:
                    training.append([format_sentence(line), label])

    return training

training = load_training_set()

In [None]:
training[:3]

In [None]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training)

In [None]:
classifier.show_most_informative_features(n=25)

In [None]:
print(accuracy(classifier, test))

<div style="display: flex; background-color: Blue; padding: 15px;" >

# ANNEXES
</div>

In [None]:
db = load_all_sentences();
print('chargement de {} vers dans la db'.format(len(db.keys())))

In [None]:
from collections import defaultdict

# Index sentence ids by artist.
base_artistes = defaultdict(set)
for sentence_id, record in db.items():
    base_artistes[record['artistes']].add(sentence_id)

# Keep only artists with more than 200 sentences. The filter must read
# from base_artistes: `artistes` does not exist before this line.
artistes = {k: v for k, v in base_artistes.items() if len(v) > 200}
print('{} artistes'.format(len(artistes)))

In [1]:
import nltk
test = "Bonjour, je suis un texte d'exemple pour le cours d'Openclassrooms. Soyez attentifs à ce cours !"

nltk.word_tokenize(test)

['Bonjour',
 ',',
 'je',
 'suis',
 'un',
 'texte',
 "d'exemple",
 'pour',
 'le',
 'cours',
 "d'Openclassrooms",
 '.',
 'Soyez',
 'attentifs',
 'à',
 'ce',
 'cours',
 '!']

In [2]:
tokenizer = nltk.RegexpTokenizer(r'\w+')
tokenizer.tokenize(test)

['Bonjour',
 'je',
 'suis',
 'un',
 'texte',
 'd',
 'exemple',
 'pour',
 'le',
 'cours',
 'd',
 'Openclassrooms',
 'Soyez',
 'attentifs',
 'à',
 'ce',
 'cours']

In [None]:
tokenizer = nltk.RegexpTokenizer(r'\w+')

def freq_stats_corpora():
    """Build per-artist token corpora and their frequency statistics.

    Reads the module-level ``artistes`` (artist -> sentence ids), ``db``
    (sentence id -> record with a ``'text'`` entry) and ``tokenizer``.

    Returns:
        A ``(freq, stats, corpora)`` tuple of mappings keyed by artist:
        ``freq`` holds an ``nltk.FreqDist`` of the artist's tokens,
        ``stats`` holds ``{'total': <number of tokens>}`` and ``corpora``
        holds the list of lower-cased tokens.
    """
    corpora = defaultdict(list)

    # Build one token corpus per artist.
    for artiste, sentence_ids in artistes.items():
        for sentence_id in sentence_ids:
            text = db[sentence_id]['text']
            # Only bytes need decoding; a Python 3 str has no .decode().
            if isinstance(text, bytes):
                text = text.decode('utf-8')
            corpora[artiste] += tokenizer.tokenize(text.lower())

    stats, freq = dict(), dict()

    for artiste, tokens in corpora.items():
        freq[artiste] = nltk.FreqDist(tokens)
        stats[artiste] = {'total': len(tokens)}

    return (freq, stats, corpora)


import pandas as pd

# Retrieve the counts
freq, stats, corpora = freq_stats_corpora()
df = pd.DataFrame.from_dict(stats, orient='index')

# Display the frequencies, largest first.
# DataFrame.sort() no longer exists; sort_values() returns a new frame, so
# the result has to be assigned for the plot to be ordered.
df = df.sort_values(by='total', ascending=False)
df.plot(kind='bar', color="#f56900", title='Top 50 Rappeurs par nombre de mots')

In [None]:
def get_corpus_word_frequence(diccle_texte, dic_cle_fre):
    """Sum the per-key word frequencies over the whole corpus.

    Args:
        diccle_texte: mapping whose keys select which entries to include
            (its values are not used).
        dic_cle_fre: mapping from the same keys to a ``Counter``-like
            frequency table (e.g. ``nltk.FreqDist``).

    Returns:
        A ``collections.Counter`` holding the total count of each word.

    Raises:
        KeyError: if a key of *diccle_texte* is missing from *dic_cle_fre*.
    """
    from collections import Counter

    # Accumulate the total frequency of every word over all keys.
    freq_totale = Counter()
    for key in diccle_texte:
        freq_totale += dic_cle_fre[key]

    return freq_totale

In [None]:
def get_most_frequence(freq_totale, limit=100):
    """Return the *limit* most frequent words of *freq_totale*.

    Args:
        freq_totale: a ``Counter``-like frequency table exposing
            ``most_common``.
        limit: how many of the most frequent words to return; the cut-off
            is a somewhat arbitrary heuristic.

    Returns:
        A tuple of words, most frequent first (empty for an empty table).
    """
    # Read from the argument, not the global `freq2`, and avoid indexing a
    # zip object, which fails on Python 3 and on empty input.
    return tuple(word for word, _count in freq_totale.most_common(limit))

In [None]:
from collections import Counter

# First, accumulate the total frequency of each word over the whole
# corpus of artists.
freq_totale = Counter()
for artiste in corpora:
    freq_totale += freq[artiste]

# Second, pick (somewhat arbitrarily) how many of the most frequent words to
# drop. Plotting the word counts would give a better heuristic.
# The ranking reads freq_totale: `freq2` can only be built once `sw` exists.
most_freq = tuple(word for word, _count in freq_totale.most_common(100))

# Build the final stop-word set: the 100 most frequent words of the corpus
# plus the default French stop words shipped with NLTK.
sw = set()
sw.update(most_freq)
sw.update(tuple(nltk.corpus.stopwords.words('french')))


In [None]:
def freq_stats_corpora2(lookup_table=None):
    """Build per-artist corpora with stop words removed, plus statistics.

    Reads the module-level ``lt_artists`` (artist -> block ids), ``db_flat``
    (block id -> record with a ``'text'`` entry), ``tokenizer`` and the
    stop-word set ``sw``.

    Args:
        lookup_table: accepted for backward compatibility; not used.

    Returns:
        A ``(freq, stats, corpora)`` tuple of mappings keyed by artist:
        ``freq`` holds an ``nltk.FreqDist`` of the kept tokens, ``stats``
        holds ``{'total': <token count>, 'unique': <distinct tokens>}`` and
        ``corpora`` holds the list of kept tokens.
    """
    corpora = defaultdict(list)
    for artist, block_ids in lt_artists.items():
        for block_id in block_ids:
            text = db_flat[block_id]['text']
            # Only bytes need decoding; a Python 3 str has no .decode().
            if isinstance(text, bytes):
                text = text.decode('utf-8')
            # Test membership on the set itself; the original rebuilt a
            # list from it for every single token.
            corpora[artist] += [w for w in tokenizer.tokenize(text) if w not in sw]

    stats, freq = dict(), dict()

    for artist, tokens in corpora.items():
        freq[artist] = fq = nltk.FreqDist(tokens)
        stats[artist] = {'total': len(tokens), 'unique': len(fq)}
    return (freq, stats, corpora)

freq2, stats2, corpora2 = freq_stats_corpora2()

<div style="display: flex; background-color: Green; padding: 7px;" >

### RCV1 : keras
</div>

In [None]:
from sklearn.datasets import fetch_rcv1
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM


train = fetch_rcv1(subset='train')
test = fetch_rcv1(subset='test')

X_train = train.data
X_test = test.data


y_train = train.target
y_test = test.target

# Multi-label model: Embedding -> LSTM(350) -> Dense(103) -> sigmoid.
# 47236 and 103 match the feature and target widths printed for RCV1 earlier
# in this notebook.
model = Sequential()
# NOTE(review): `input_dim` is given a tuple and `dropout` is passed to
# Embedding -- confirm both against the installed Keras API.
# NOTE(review): X_train comes from fetch_rcv1(...).data, shown earlier as a
# scipy sparse matrix; an Embedding layer presumably expects integer token
# indices -- verify the input representation before training.
model.add(Embedding(input_dim=(None,47236),output_dim=300,dropout=0.25))
# NOTE(review): `dropout_W` / `dropout_U` look like Keras 1 argument names --
# confirm against the installed version.
model.add(LSTM(350, dropout_W=0.4, dropout_U=0.4))  
model.add(Dense(103))
model.add(Activation('sigmoid'))


# Sigmoid outputs with binary cross-entropy: each of the 103 outputs is
# scored independently.
model.compile(loss='binary_crossentropy',
          optimizer='adam',
          metrics=['accuracy'])

print('Train...')
# NOTE(review): `nb_epoch` looks like the Keras 1 spelling of `epochs` --
# confirm. The test split is also used as validation data here.
model.fit(X_train, y_train, batch_size=32, nb_epoch=15,
      validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                        batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)