In [1]:
import sys
sys.path.append('../')
import pandas as pd
import numpy as np

In [2]:
import json
from nltk import word_tokenize

# Load the party metadata; each entry under 'partijen' provides the list number
# ('lijst') and the party name ('naam') used below.
with open('../data/partijen-metadata.json', 'r') as f:
    meta = json.load(f)

# Map the zero-padded, two-digit list number to the UTF-8 encoded party name.
party_dict = {
    "{:02d}".format(entry['lijst']): entry['naam'].encode('utf-8')
    for entry in meta['partijen']
}

# Replace list numbers in the file names by party names (regex replacement),
# keep the second '-'-separated field and expose that column as 'party'.
raw = pd.read_csv('../data/processed/dataframe.csv')
raw['file'] = (
    raw['file']
    .replace(to_replace=party_dict, regex=True)
    .apply(lambda name: name.split('-')[1])
)
raw = raw.rename(columns={'file': 'party'})

# One label per row of `raw`; the number of rows is used as the number of parties.
party_labels = raw['party']
n_parties = len(party_labels)

def chunk_string(string, length):
    """Cut `string` into consecutive pieces of at most `length` characters.

    The pieces are returned in order as a list; only the last piece can be
    shorter than `length`. An empty `string` gives an empty list.
    """
    pieces = []
    for start in range(0, len(string), length):
        pieces.append(string[start:start + length])
    return pieces

# Cut every document in raw['text'] into 600-character snippets. Stacking the
# per-document Series gives one row per snippet; after reset_index the outer
# index level (the row label in `raw`) is named 'party' and the inner one 'snippet'.
df = (
    raw['text']
    .apply(lambda document: pd.Series(chunk_string(document, length=600)))
    .stack()
    .reset_index()
    .rename(columns={0: 'text', 'level_0': 'party', 'level_1': 'snippet'})
)

# Corpus statistics. A single parenthesised argument prints the same text under
# the Python 2 print statement and the Python 3 print function.
print("{:d} text snippets".format(len(df)))
print("{:d} charachters in corpus".format(df['text'].apply(len).sum()))
print("{:d} words in corpus".format(
    df['text'].apply(lambda snippet: len(word_tokenize(snippet, language='dutch'))).sum()))
df.head()

4897 text snippets
2931367 charachters in corpus
418936 words in corpus


Unnamed: 0,party,snippet,text
0,0,0,n e d n o b r e v n ee g n i v e l sa m e n ve...
1,0,1,twerp paul pollmann inhoudsopgave essay ee...
2,0,2,rismebestrijding privacy justitie en rechtstaa...
3,0,3,ing werk en gezin pensioenen werkende ouderen ...
4,0,4,iedereen publieke waarden centraal van marktw...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline

from nltk.corpus import stopwords
from nltk.stem.snowball import DutchStemmer

# Shared stemmer instance, reused for every call to tokenize().
stemmer = DutchStemmer()


def tokenize(text):
    """Tokenize `text` as Dutch and return the list of stemmed tokens."""
    return [stemmer.stem(token) for token in word_tokenize(text, language='dutch')]

# Number of latent topics extracted by the LDA step.
n_topics = 1000

# Preprocessing shared by all classifiers: TF-IDF weights over stemmed 1- to 3-grams,
# followed by an LDA topic model fitted on those features.
# NOTE(review): `tokenize` stems every token while the stop-word list is passed as-is,
# so stemmed forms of stop words may not be filtered -- confirm this is intended.
preproc_steps = [
    ('vectorizer', TfidfVectorizer(
        input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
        lowercase=True, preprocessor=None, tokenizer=tokenize, analyzer='word',
        stop_words=stopwords.words('dutch'),
        ngram_range=(1, 3), max_df=0.5, min_df=2, max_features=10000,
        vocabulary=None, binary=False,
        # TF-IDF weights are fractional, so the output matrix must be floating point;
        # an integer dtype such as np.int64 is not a supported output type here.
        dtype=np.float64,
        norm='l2', use_idf=True, smooth_idf=False, sublinear_tf=False)),

    ('topic_model', LatentDirichletAllocation(
        # Passed positionally on purpose: this first parameter is named `n_topics`
        # in older scikit-learn releases and `n_components` in newer ones.
        n_topics,
        doc_topic_prior=None, topic_word_prior=None,
        learning_method='online', learning_decay=0.7, learning_offset=10.,
        max_iter=10, batch_size=128, evaluate_every=-1, total_samples=1000000.0,
        mean_change_tol=0.001, max_doc_update_iter=100, n_jobs=1, verbose=0,
        random_state=42)),
]

# Classifier variants that can be appended to the shared preprocessing steps.
# The naive Bayes model receives a uniform class prior with one entry per party.
equal_priors = np.ones(n_parties) / float(n_parties)
model_steps_dict = {
    'mnb': [('mnb', MultinomialNB(class_prior=equal_priors))],
    'dummy': [('dummy', DummyClassifier(strategy='prior', random_state=11))],
}

# Features are the snippet texts, targets the 'party' column of the data frame.
X = df['text'].values
y = np.array(df['party'])

# Cross-validated comparison of the classifiers, currently disabled.
# NOTE(review): `cv` is not defined in the cells shown here -- define it before re-enabling.
# for model_type in ['dummy','mnb']:
#     model_steps = model_steps_dict[model_type]
#     estimator = Pipeline(steps=(preproc_steps + model_steps))
#     scores = cross_val_score(estimator, X, y, cv=cv)
#     print "Average {:s} classifier score: {:.4f} ".format(model_type, np.mean(scores))

# Fit the multinomial naive Bayes pipeline on the complete data set and display it.
model_steps = model_steps_dict['mnb']
estimator = Pipeline(steps=(preproc_steps + model_steps)).fit(X, y)
estimator

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

def plot_predictions(results):
    """Draw `results` as a seaborn heatmap on a new figure.

    The figure is 12 wide and 0.5 high per entry of `results`; the y-axis label
    is cleared. Nothing is returned.
    """
    height = .5 * len(results)
    _, axes = plt.subplots(figsize=(12, height))
    sns.heatmap(results, ax=axes)
    axes.set_ylabel('')
    
def classify_text(estimator, texts):
    """Plot the class probabilities that `estimator` predicts for each text.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    texts : list of str
        Texts to classify; each one becomes a row of the heatmap, labelled with
        at most its first 40 characters.

    The probability columns are labelled with the module-level ``party_labels``,
    so the column order of ``predict_proba`` is assumed to match ``party_labels``.
    Nothing is returned; the heatmap is drawn through ``plot_predictions``.
    """
    max_chars = 40
    pred_proba = estimator.predict_proba(texts)
    # Row labels: shorten long texts and add an ellipsis only when something was
    # actually cut off. A list (not a lazy map object) also works on Python 3.
    capped_texts = [text if len(text) <= max_chars else text[:max_chars] + '...'
                    for text in texts]
    # pd.Index yields a flat column index that keeps the labels' name; wrapping the
    # labels in a list would make pandas build a single-level MultiIndex instead.
    results = pd.DataFrame(pred_proba, columns=pd.Index(party_labels), index=capped_texts)
    plot_predictions(results)

# Example Dutch query texts that are scored by the fitted pipeline below.
texts = ['veiligheid immigratie islam terrorisme',
         'economie bedrijven belastingen werkgelegenheid',
         'zorg ouderen pensioenen, pensioensleeftijd',
         'milieu duurzaamheid klimaat energie',
         'gezin samenleving christelijk landbouw',
         'ik vind belasting niet leuk, daarom minder belasting',
         # 'milieubelasting' spelled consistently with 'milieu' above; a misspelled
         # term cannot match the learned vocabulary and would not affect the prediction.
         'milieubelasting kilometerheffing openbaar vervoer natuur']

# Draw the heatmap of predicted party probabilities for the example texts.
classify_text(estimator, texts)