In [2]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
import logging
import numpy as np
import string
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn import svm
from datetime import datetime
import os
from gensim.models.word2vec import Word2Vec
import gensim
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from collections import defaultdict

import embeddingvectorizer
from nltk.corpus import stopwords

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfTransformer
import json

In [3]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [37]:
path_to_data ='../data/'
df = pd.read_pickle(path_to_data + "data_geannoteerd.pkl")
data = df['text']
labels = df['topic']

In [38]:
basepath = '/home/anne/tmpanne/fullsample/'

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.02, random_state=42)

class word2vec_analyzer():
    '''This class tests the efficacy of Word2Vec models in downstream tasks.'''

    def __init__(self):
        self.nmodel = 0
        self.vectorizer = 'Tfidf'

    def get_w2v_model(self):
        '''yields a dict with one item. key is the filename, value the gensim model'''
        
        filenames = [e for e in os.listdir(basepath) if not e.startswith('.')]

        for fname in filenames:
            model = {}
            path = os.path.join(basepath, fname)
            logger.info("\nLoading gensim model")
            mod = gensim.models.Word2Vec.load(path)
            model['gensimmodel'] = dict(zip(mod.wv.index2word, mod.wv.syn0))
            model['filename'] = fname
            self.nmodel +=1
            logger.info("loaded gensim model nr {}, named: {}".format(self.nmodel, model['filename']))
            yield model
            
    def get_baseline_results(self, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
        
        results = []
            
        if self.vectorizer == "Tfidf":
            logger.info(">>>> defining pipes for baseline model with {} vectorizer".format( self.vectorizer))

            svm = Pipeline([("vect", TfidfVectorizer()),
                            ("svm", SGDClassifier(loss='hinge', penalty='elasticnet', tol=1e-4, alpha=1e-6, max_iter=5000, random_state=42))
                            ])

            
            ET = Pipeline([
                ("vect", TfidfVectorizer()),
                ("ExtraTrees", ExtraTreesClassifier(n_estimators=200))
                ])

            
        else:
            logger.info(">>>> defining pipes for baseline model with {} vectorizer".format( self.vectorizer))

            svm = Pipeline([("vect", CountVectorizer()),
                            ("svm", SGDClassifier(loss='hinge', penalty='elasticnet', tol=1e-4, alpha=1e-6, max_iter=5000, random_state=42))
                            ])
            
            ET = Pipeline([
                ("vect", CountVectorizer()),
                ("ExtraTrees", ExtraTreesClassifier(n_estimators=200))
                ])


        classifier = "SGDClassifier"
        predicted_svm = svm.fit(X_train, y_train).predict(X_test)
        
        precision, recall, fscore , support = score(y_test, predicted_svm, average='macro')
        accuracy = accuracy_score(y_test, predicted_svm)
            
        logging.info("these are the BASELINE results: precision ({}), recall ({}), fscore ({}), accuracy ({}) results from {} using {}".format(precision, recall, fscore, accuracy, classifier, self.vectorizer))

        results.append({'precision_svm': precision, 'recall_svm': recall, 'f1_svm': fscore, 'accuracy': accuracy,
                    'classifier': classifier, 'model' : "baseline"})

        
        classifier = "ExtraTrees"
        predicted_ET = ET.fit(X_train, y_train).predict(X_test)
        precision, recall, fscore , support = score(y_test, predicted_ET, average='macro')
        accuracy = accuracy_score(y_test, predicted_ET)
            
        logging.info("these are the BASELINE results: precision ({}), recall ({}), fscore ({}), accuracy ({}) results from {} using {}".format(precision, recall, fscore, accuracy, classifier, self.vectorizer))

        
        results.append({'precision_svm': precision, 'recall_svm': recall, 'f1_svm': fscore, 'accuracy': accuracy,
                    'classifier': classifier, 'model' : "baseline"})

        return results
               
    def get_scores_wv2(self, X_train = X_train, y_train = y_train, X_test = X_test, y_test = y_test):
        
        results = []
        
        for model in self.get_w2v_model():
            if self.vectorizer == "Tfidf":
                logger.info(">>>> defining pipes for model {} with {} vectorizer".format(model['filename'], self.vectorizer))

                w2v_svm = Pipeline([
                ("word2vec TfidF vectorizer", embeddingvectorizer.EmbeddingTfidfVectorizer(model['gensimmodel'])),
                ("svm", SGDClassifier(loss='hinge', tol=1e-4, alpha=1e-6, max_iter=5000, random_state=42, penalty='elasticnet'))
                ])

                w2v_ET = Pipeline([
                ("word2vec tfidf vectorizer", embeddingvectorizer.EmbeddingTfidfVectorizer(model['gensimmodel'])), 
                ("ExtraTrees", ExtraTreesClassifier(n_estimators=200))
                ])

            else:
                logger.info(">>>> defining pipes for model {} with {} vectorizer".format(model['filename'], self.vectorizer))

                w2v_svm = Pipeline([
                ("word2vec TfidF vectorizer", embeddingvectorizer.EmbeddingCountVectorizer(model['gensimmodel'])),
                ("svm", SGDClassifier(loss='hinge', tol=1e-4, alpha=1e-6, max_iter=5000, random_state=42, penalty='elasticnet'))
                ])

                w2v_ET = Pipeline([
                ("word2vec tfidf vectorizer", embeddingvectoriembeddingvectorizer.EmbeddingCountVectorizer(model['gensimmodel'])), 
                ("ExtraTrees", ExtraTreesClassifier(n_estimators=200))
                ])

            classifier = "SGDClassifier"
            predicted_svm = w2v_svm.fit(X_train, y_train).predict(X_test)
            
            precision, recall, fscore , support = score(y_test, predicted_svm, average='macro')
            accuracy = accuracy_score(y_test, predicted_svm)
            
            logging.info("these are the w2v results: precision ({}), recall ({}), fscore ({}), accuracy ({}) results from {} using {}".format(precision, recall, fscore, accuracy, classifier, self.vectorizer))

            results.append({'precision_svm': precision, 'recall_svm': recall, 'f1_svm': fscore, 'accuracy': accuracy, 
                        'classifier': classifier, 'model' : model['filename']})

            classifier = "ExtraTrees"
            predicted_ET = w2v_ET.fit(X_train, y_train).predict(X_test)
            precision, recall, fscore , support = score(y_test, predicted_ET, average='macro')
            accuracy = accuracy_score(y_test, predicted_ET)
            
            logging.info("these are the w2v results: precision ({}), recall ({}), fscore ({}), accuracy ({}) results from {} using {}".format(precision, recall, fscore, accuracy, classifier, self.vectorizer))
     
            results.append({'precision_svm': precision, 'recall_svm': recall, 'f1_svm': fscore, 'accuracy': accuracy, 
                        'classifier': classifier, 'model' : model['filename']})

        return results
        
    def get_final(self):
        results_wv2 = self.get_scores_wv2() 
        results_baseline = self.get_baseline_results()
        return results_wv2 + results_baseline
            
if __name__ == "__main__":

    logger = logging.getLogger()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    
    myanalyzer = word2vec_analyzer()
    my_results = myanalyzer.get_final()
    
    with open('output_my_results.json',mode='w') as fo:
        fo.write('[')
        
        for result in my_results:
            fo.write(json.dumps(result))
            fo.write(',\n')
        fo.write('[]]')
        print("\n\n\nSave results\n\n\n")

    df = pd.DataFrame.from_dict(my_results)
    print('Created dataframe')
    print(df)
    df.to_csv('w2v_evaluation.csv')

INFO:root:
Loading gensim model
INFO:gensim.utils:loading Word2Vec object from /home/anne/tmpanne/fullsample/w2v_model_nr_5_window_10_size_300_negsample_5
INFO:gensim.utils:loaded /home/anne/tmpanne/fullsample/w2v_model_nr_5_window_10_size_300_negsample_5
INFO:root:loaded gensim model nr 1, named: w2v_model_nr_5_window_10_size_300_negsample_5
INFO:root:>>>> defining pipes for model w2v_model_nr_5_window_10_size_300_negsample_5 with Tfidf vectorizer
INFO:root:these are the w2v results: precision (0.7309523809523809), recall (0.7171328671328671), fscore (0.7124069247952346), accuracy (0.8) results from SGDClassifier using Tfidf
INFO:root:these are the w2v results: precision (0.7835686600221484), recall (0.7235431235431234), fscore (0.740761271249076), accuracy (0.8142857142857143) results from ExtraTrees using Tfidf
INFO:root:
Loading gensim model
INFO:gensim.utils:loading Word2Vec object from /home/anne/tmpanne/fullsample/w2v_model_nr_1_window_5_size_300_negsample_5
INFO:gensim.utils:lo

INFO:gensim.utils:loading Word2Vec object from /home/anne/tmpanne/fullsample/w2v_model_nr_7_window_10_size_300_negsample_15
INFO:gensim.utils:loaded /home/anne/tmpanne/fullsample/w2v_model_nr_7_window_10_size_300_negsample_15
INFO:root:loaded gensim model nr 9, named: w2v_model_nr_7_window_10_size_300_negsample_15
INFO:root:>>>> defining pipes for model w2v_model_nr_7_window_10_size_300_negsample_15 with Tfidf vectorizer
INFO:root:these are the w2v results: precision (0.7761278195488721), recall (0.7671328671328671), fscore (0.7554329004329003), accuracy (0.8285714285714286) results from SGDClassifier using Tfidf
INFO:root:these are the w2v results: precision (0.7835686600221484), recall (0.7235431235431234), fscore (0.740761271249076), accuracy (0.8142857142857143) results from ExtraTrees using Tfidf
INFO:root:
Loading gensim model
INFO:gensim.utils:loading Word2Vec object from /home/anne/tmpanne/fullsample/w2v_model_nr_6_window_10_size_100_negsample_15
INFO:gensim.utils:loaded /home/




Save results



Created dataframe
    accuracy     classifier    f1_svm  \
0   0.800000  SGDClassifier  0.712407   
1   0.814286     ExtraTrees  0.740761   
2   0.842857  SGDClassifier  0.785216   
3   0.814286     ExtraTrees  0.740761   
4   0.800000  SGDClassifier  0.716071   
5   0.800000     ExtraTrees  0.722892   
6   0.828571  SGDClassifier  0.768939   
7   0.771429     ExtraTrees  0.691508   
8   0.785714  SGDClassifier  0.672554   
9   0.842857     ExtraTrees  0.777779   
10  0.814286  SGDClassifier  0.776163   
11  0.771429     ExtraTrees  0.671524   
12  0.814286  SGDClassifier  0.760002   
13  0.828571     ExtraTrees  0.761090   
14  0.757143  SGDClassifier  0.652815   
15  0.785714     ExtraTrees  0.714310   
16  0.828571  SGDClassifier  0.755433   
17  0.814286     ExtraTrees  0.740761   
18  0.814286  SGDClassifier  0.708900   
19  0.814286     ExtraTrees  0.740761   
20  0.771429  SGDClassifier  0.643954   
21  0.814286     ExtraTrees  0.746701   
22  0.728571  SGDCla

In [39]:
df


Unnamed: 0,accuracy,classifier,f1_svm,model,precision_svm,recall_svm
0,0.8,SGDClassifier,0.712407,w2v_model_nr_5_window_10_size_300_negsample_5,0.730952,0.717133
1,0.814286,ExtraTrees,0.740761,w2v_model_nr_5_window_10_size_300_negsample_5,0.783569,0.723543
2,0.842857,SGDClassifier,0.785216,w2v_model_nr_1_window_5_size_300_negsample_5,0.823302,0.792133
3,0.814286,ExtraTrees,0.740761,w2v_model_nr_1_window_5_size_300_negsample_5,0.783569,0.723543
4,0.8,SGDClassifier,0.716071,w2v_model_nr_8_window_47615_size_100_negsample_5,0.816734,0.719406
5,0.8,ExtraTrees,0.722892,w2v_model_nr_8_window_47615_size_100_negsample_5,0.771104,0.698543
6,0.828571,SGDClassifier,0.768939,w2v_model_nr_11_window_47615_size_300_negsampl...,0.782211,0.785723
7,0.771429,ExtraTrees,0.691508,w2v_model_nr_11_window_47615_size_300_negsampl...,0.770513,0.669406
8,0.785714,SGDClassifier,0.672554,w2v_model_nr_4_window_10_size_100_negsample_5,0.861667,0.638636
9,0.842857,ExtraTrees,0.777779,w2v_model_nr_4_window_10_size_100_negsample_5,0.807417,0.773543
