One vs All Method

Train NMF for each topic separately.

Use all Wiki articles as Background Corpus.

In [1]:
import pandas as pd
import numpy as np
from time import time

import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

from ipywidgets import interact, fixed

import pickle

import libs.text_preprocess as tp



## Plots and Prints

In [2]:
categories=['universalism', 'hedonism', 'achievement', 'power',
       'self-direction', 'benevolence', 'conformity', 'tradition', 'stimulation',
       'security']

def plot_radar_chart(doc_topic_cumul, doc):
    # ------- PART 1: Create background
 
    # number of variablecategories
    schwartz =['universalism', 'benevolence', 'conformity', 'tradition',
       'security', 'power', 'achievement', 'hedonism', 'stimulation',
       'self-direction']
    
    schwartz_dist = []
    for sch in schwartz:
        schwartz_dist.append(doc_topic_cumul[doc][categories.index(sch)])
    
    N = len(schwartz)
    
    # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    plt.figure(figsize=(8,8))
    # Initialise the spider plot
    ax = plt.subplot(111, polar=True)

    # If you want the first axis to be on top:
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Draw one axe per variable + add labels labels yet
    plt.xticks(angles[:-1], schwartz)

    # Draw ylabels
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75], ["25","50","75"], color="grey", size=7)
    plt.ylim(0,100)


    # ------- PART 2: Add plots

    # Plot each individual = each line of the data
    # I don't do a loop, because plotting more than 3 groups makes the chart unreadable

    # Ind1
    values = list(schwartz_dist) + list(schwartz_dist[:1])
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    # Add legend
    #plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.title("Schwartz Chart - Doc " + str(doc))
    plt.show()
    
    
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    feature_names = tfidf_vectorizer.get_feature_names()
    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_):
        if topic_idx / n_topics == 1:
            break
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED +  " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')    
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')    
    print()

def print_doc_topics(doc_topic, doc, n_best):
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    for i in doc_topic[doc].argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i//3] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, doc_topic[doc][i]), end='')    
    print()

def print_train_results(doc_topic, doc, corpus, data):
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    print("..." + corpus[doc][len(corpus[doc])//3:len(corpus[doc])//3+500] + "...")
    print()
    print()
    
    print(color.BOLD + "Topic Distribution: " + color.END)
    #print(pd.DataFrame(data=[W_test_norm[doc]], index = [doc], columns=categories+['general']))
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11) 
    print()
    
    plot_radar_chart(doc_topic, doc)
    
def print_test_results(doc_topic, doc, corpus):
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    print("..." + corpus[doc][len(corpus[doc])//3:len(corpus[doc])//3+500] + "...")
    print()
    print()
    
    print(color.BOLD + "Topic Distribution: " + color.END)
    
    #print(pd.DataFrame(data=[W_test_norm[doc]], index = [doc], columns=categories+['general']))
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()
    
    plot_radar_chart(doc_topic, doc)
    
    

## Helper Functions

In [3]:
def cumulate_W(W, n_topics):
    W_cumul = []
    for d in W:
        temp = []
        for i in range(W.shape[1]//n_topics):
            temp.append(d[i*n_topics:(i+1)*n_topics].sum())
        W_cumul.append(temp)

    W_cumul = np.asarray(W_cumul)
    
    return W_cumul

def normalize_W(W):
    W_cumul_norm = W/(W.sum(axis=1).reshape(W.shape[0], 1))
    W_cumul_norm *= 100
    
    return W_cumul_norm

def export_to_excel(W, docs, filepath):
    '''
    Take cumulated W as input.
    Don't forget to put xlsx as file extension '''
    
    df = pd.DataFrame(data=W,index = range(len(W)), columns=categories)
    df['Text'] = docs
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    df.to_excel(filepath)
    return df

def export_to_csv(W, docs, filepath):
    '''
    Take cumulated W as input.
    Don't forget to put csv as file extension '''
    
    df = pd.DataFrame(data=W,index = range(len(W)), columns=categories)
    df['Text'] = docs
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    df.to_csv(filepath)
    return df

## Main Functions

In [4]:
def read_data(filepath):
    data = pd.read_json(filepath)
    data = data[data['text']!=""]
    data = data.sort_values('theme.id')
    
    return data
    
def extract_corpus(data):    
    corpus = list(data['text'])
    return corpus

def preprocess_corpus(corpus):
    PPcorpus = [' '.join(list((extract_terms(doc, extra_process = ['stem'])['Stem']+' ')*extract_terms(doc, extra_process = ['stem'])['TF'])) for doc in corpus]
    return PPcorpus

def train_corpus(corpus, data, brown_corpus, n_topics=3, betaloss = 'kullback-leibler', bckg_brown = False):
    N = len(data)
    
    theme_counts = data.groupby(['theme.id','theme']).count().iloc[:,1]
    pd_theme_counts = pd.DataFrame(theme_counts)
    n_themes = len(theme_counts)
    
    n_top_words = 5
    n_components = n_topics*(n_themes)
    
    
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer() # optionally add maxfeatures = n_features to enforce number of features
    t0 = time()
    
    W_list = []
    
    if bckg_brown:
        tfidf = tfidf_vectorizer.fit_transform(corpus+brown_corpus)
        tc_sum = 0
        for tc in theme_counts:
            W = np.zeros((N+len(brown_corpus),2*n_topics))
            W[N:, n_topics:] = np.random.random((len(brown_corpus),n_topics))
            W[tc_sum:tc_sum+tc, :] = np.random.random((tc,2*n_topics))

            tc_sum += tc
            W_list.append(W)
    else:
        tfidf = tfidf_vectorizer.fit_transform(corpus)
        tc_sum = 0
        for tc in theme_counts:
            W = np.zeros((N,2*n_topics))
            W[:, n_topics:] = np.random.random((N,n_topics))
            W[tc_sum:tc_sum+tc, :n_topics] = np.random.random((tc,n_topics))

            tc_sum += tc
            W_list.append(W)
        
    n_features = tfidf.shape[1]
    print(n_features)
    print("done in %0.2fs." % (time() - t0))
    
    X = tfidf 
    nmf_list = []

    for i, W in enumerate(W_list):
        print("Fitting NMF for " + str(theme_counts.index[i][1]))
        t0 = time()
        H = np.random.rand(2*n_topics, n_features)

        nmf = NMF(n_components= 2*n_topics, solver='mu', beta_loss=betaloss,
                  alpha=.1, l1_ratio=.5, init = 'custom')

        nmf.fit_transform(X=X,W=W,H=H)
        print("done in %0.2fs." % (time() - t0))

        nmf_list.append(nmf)
    
    
    return nmf_list, W_list, tfidf, tfidf_vectorizer
    
def evaluate_docs(docs, nmf, tfidf_test, betaloss = 'kullback-leibler'):
    X_test = tfidf_test
    H_test = nmf.components_
    
    # Fit the NMF model
    t0 = time()

    W_test = nmf.transform(X_test)
    
    return W_test

## Training Model

In [5]:
#https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# schwartz.json or pruned_schwartz.json
filepath = 'pruned_schwartz.json'

data = read_data(filepath)
# corpus = extract_corpus(data)
# corpusPP = preprocess_corpus(corpus)

corpusPP = list(data.text.apply(tp.clean_text))

Fix bad wording:  0.006991863250732422 s
Tokenize:  0.012935161590576172 s
Remove stopwords and Lemmatize:  1.8916518688201904 s

Fix bad wording:  0.0039730072021484375 s
Tokenize:  0.013935327529907227 s
Remove stopwords and Lemmatize:  0.02895379066467285 s

Fix bad wording:  0.0039615631103515625 s
Tokenize:  0.0059816837310791016 s
Remove stopwords and Lemmatize:  0.015967845916748047 s

Fix bad wording:  0.006997823715209961 s
Tokenize:  0.012973308563232422 s
Remove stopwords and Lemmatize:  0.03392839431762695 s

Fix bad wording:  0.0019452571868896484 s
Tokenize:  0.0030264854431152344 s
Remove stopwords and Lemmatize:  0.08374190330505371 s

Fix bad wording:  0.012964963912963867 s
Tokenize:  0.025928974151611328 s
Remove stopwords and Lemmatize:  0.0498960018157959 s

Fix bad wording:  0.000995635986328125 s
Tokenize:  0.0029985904693603516 s
Remove stopwords and Lemmatize:  0.005976676940917969 s

Fix bad wording:  0.0 s
Tokenize:  0.0009968280792236328 s
Remove stopwords a

Remove stopwords and Lemmatize:  0.06284451484680176 s

Fix bad wording:  0.005972385406494141 s
Tokenize:  0.010000228881835938 s
Remove stopwords and Lemmatize:  0.02590203285217285 s

Fix bad wording:  0.004987478256225586 s
Tokenize:  0.009961605072021484 s
Remove stopwords and Lemmatize:  0.02490234375 s

Fix bad wording:  0.008983135223388672 s
Tokenize:  0.014946222305297852 s
Remove stopwords and Lemmatize:  0.04307365417480469 s

Fix bad wording:  0.022968769073486328 s
Tokenize:  0.044850826263427734 s
Remove stopwords and Lemmatize:  0.12521839141845703 s

Fix bad wording:  0.005954265594482422 s
Tokenize:  0.008975744247436523 s
Remove stopwords and Lemmatize:  0.025945663452148438 s

Fix bad wording:  0.000982522964477539 s
Tokenize:  0.0029990673065185547 s
Remove stopwords and Lemmatize:  0.008999347686767578 s

Fix bad wording:  0.003985404968261719 s
Tokenize:  0.009947538375854492 s
Remove stopwords and Lemmatize:  0.029919862747192383 s

Fix bad wording:  0.001998901

Fix bad wording:  0.009977340698242188 s
Tokenize:  0.01694965362548828 s
Remove stopwords and Lemmatize:  0.05318593978881836 s

Fix bad wording:  0.022019386291503906 s
Tokenize:  0.04088759422302246 s
Remove stopwords and Lemmatize:  0.11369705200195312 s

Fix bad wording:  0.011966466903686523 s
Tokenize:  0.01995372772216797 s
Remove stopwords and Lemmatize:  0.05983710289001465 s

Fix bad wording:  0.0019948482513427734 s
Tokenize:  0.00496220588684082 s
Remove stopwords and Lemmatize:  0.01495981216430664 s

Fix bad wording:  0.0030031204223632812 s
Tokenize:  0.0059947967529296875 s
Remove stopwords and Lemmatize:  0.012956619262695312 s

Fix bad wording:  0.0050051212310791016 s
Tokenize:  0.008953571319580078 s
Remove stopwords and Lemmatize:  0.029919147491455078 s

Fix bad wording:  0.0049877166748046875 s
Tokenize:  0.005982637405395508 s
Remove stopwords and Lemmatize:  0.014991998672485352 s

Fix bad wording:  0.0009658336639404297 s
Tokenize:  0.0019941329956054688 s
Re

Remove stopwords and Lemmatize:  0.0179750919342041 s

Fix bad wording:  0.002963542938232422 s
Tokenize:  0.005984067916870117 s
Remove stopwords and Lemmatize:  0.014904260635375977 s

Fix bad wording:  0.0029659271240234375 s
Tokenize:  0.0070078372955322266 s
Remove stopwords and Lemmatize:  0.02300262451171875 s

Fix bad wording:  0.0 s
Tokenize:  0.0009968280792236328 s
Remove stopwords and Lemmatize:  0.003989458084106445 s

Fix bad wording:  0.009947061538696289 s
Tokenize:  0.015985488891601562 s
Remove stopwords and Lemmatize:  0.044851064682006836 s

Fix bad wording:  0.0009980201721191406 s
Tokenize:  0.0009980201721191406 s
Remove stopwords and Lemmatize:  0.003987550735473633 s

Fix bad wording:  0.010974407196044922 s
Tokenize:  0.01695108413696289 s
Remove stopwords and Lemmatize:  0.0428164005279541 s

Fix bad wording:  0.0009975433349609375 s
Tokenize:  0.003000497817993164 s
Remove stopwords and Lemmatize:  0.006946086883544922 s

Fix bad wording:  0.0039896965026855

Tokenize:  0.01692509651184082 s
Remove stopwords and Lemmatize:  0.05285835266113281 s

Fix bad wording:  0.0009982585906982422 s
Tokenize:  0.0019941329956054688 s
Remove stopwords and Lemmatize:  0.0029931068420410156 s

Fix bad wording:  0.0019936561584472656 s
Tokenize:  0.002992391586303711 s
Remove stopwords and Lemmatize:  0.009998559951782227 s

Fix bad wording:  0.0 s
Tokenize:  0.0 s
Remove stopwords and Lemmatize:  0.001997232437133789 s

Fix bad wording:  0.005980491638183594 s
Tokenize:  0.01196908950805664 s
Remove stopwords and Lemmatize:  0.033935546875 s

Fix bad wording:  0.003971576690673828 s
Tokenize:  0.006010532379150391 s
Remove stopwords and Lemmatize:  0.014929533004760742 s

Fix bad wording:  0.0019943714141845703 s
Tokenize:  0.002992391586303711 s
Remove stopwords and Lemmatize:  0.007978439331054688 s

Fix bad wording:  0.0010142326354980469 s
Tokenize:  0.0029745101928710938 s
Remove stopwords and Lemmatize:  0.005014657974243164 s

Fix bad wording:  0.0

Remove stopwords and Lemmatize:  0.026929616928100586 s

Fix bad wording:  0.0019943714141845703 s
Tokenize:  0.003971099853515625 s
Remove stopwords and Lemmatize:  0.010976314544677734 s

Fix bad wording:  0.0010101795196533203 s
Tokenize:  0.0029897689819335938 s
Remove stopwords and Lemmatize:  0.005957841873168945 s

Fix bad wording:  0.0070078372955322266 s
Tokenize:  0.012964248657226562 s
Remove stopwords and Lemmatize:  0.037872314453125 s

Fix bad wording:  0.0 s
Tokenize:  0.0010256767272949219 s
Remove stopwords and Lemmatize:  0.0009694099426269531 s

Fix bad wording:  0.0009989738464355469 s
Tokenize:  0.0019941329956054688 s
Remove stopwords and Lemmatize:  0.007978439331054688 s

Fix bad wording:  0.008996248245239258 s
Tokenize:  0.010980844497680664 s
Remove stopwords and Lemmatize:  0.029917001724243164 s

Fix bad wording:  0.0029909610748291016 s
Tokenize:  0.003989696502685547 s
Remove stopwords and Lemmatize:  0.01294398307800293 s

Fix bad wording:  0.00102591514

Remove stopwords and Lemmatize:  0.10970878601074219 s

Fix bad wording:  0.00598907470703125 s
Tokenize:  0.012959480285644531 s
Remove stopwords and Lemmatize:  0.03587627410888672 s

Fix bad wording:  0.003977060317993164 s
Tokenize:  0.007977962493896484 s
Remove stopwords and Lemmatize:  0.023935556411743164 s

Fix bad wording:  0.010004520416259766 s
Tokenize:  0.01794886589050293 s
Remove stopwords and Lemmatize:  0.055850982666015625 s

Fix bad wording:  0.0009980201721191406 s
Tokenize:  0.0029904842376708984 s
Remove stopwords and Lemmatize:  0.0049896240234375 s

Fix bad wording:  0.0009951591491699219 s
Tokenize:  0.0009949207305908203 s
Remove stopwords and Lemmatize:  0.004960536956787109 s

Fix bad wording:  0.005984306335449219 s
Tokenize:  0.012964963912963867 s
Remove stopwords and Lemmatize:  0.030954360961914062 s

Fix bad wording:  0.002994060516357422 s
Tokenize:  0.005975246429443359 s
Remove stopwords and Lemmatize:  0.01596665382385254 s

Fix bad wording:  0.00

In [None]:
mdetok = MosesDetokenizer()

brown_files_sent = []
for fid in brown.fileids():
    brown_files_sent.append([mdetok.detokenize(' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split(), return_str=True)  for sent in brown.sents(fid)])
    
brown_natural = [' '.join(bfs) for bfs in brown_files_sent]
brown_naturalPP = preprocess_corpus(brown_natural)

In [None]:
nmf_list, W_list, tfidf, tfidf_vectorizer = train_corpus(corpusPP, data, [], n_topics=6, betaloss = 'kullback-leibler', bckg_brown = False)

In [9]:
print("\nTopics in NMF model:")
for i in range(10):
    print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=5, n_topics=3)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0mrefer state regime political gift
[1mTopic #1: [0moccur often critical give political
[1mTopic #2: [0missue one canada goal synthesize

[96m[1mhedonism[0m
[1mTopic #0: [0msuicide shame positive time state
[1mTopic #1: [0mshyness translation relation uncover immediate
[1mTopic #2: [0mtime remove tip opposite important

[96m[1machievement[0m
[1mTopic #0: [0mrelationship large valid luxury product
[1mTopic #1: [0moccur procedure modern oxford motivational
[1mTopic #2: [0msocial well great medical production

[96m[1mpower[0m
[1mTopic #0: [0msystem unchosen effectively supervise remain
[1mTopic #1: [0malso relative air pain unsourced
[1mTopic #2: [0malso locke lead interest violent

[96m[1mself-direction[0m
[1mTopic #0: [0mlike sarawak list undertake incorporate
[1mTopic #1: [0mrevolutionary issue back interest best
[1mTopic #2: [0mwell romanticism redistribute strengthbased undermine



In [25]:
print("\nTopics in NMF model:")
for i in range(10):
    print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=5, n_topics=3)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0moften face use first way
[1mTopic #1: [0mdefine concern negative law world
[1mTopic #2: [0mcritical often force help state

[96m[1mhedonism[0m
[1mTopic #0: [0msadness shock selfconfidence sometimes others
[1mTopic #1: [0msuggest still say people thus
[1mTopic #2: [0mmust point say psychological social

[96m[1machievement[0m
[1mTopic #0: [0mtheory merely mark occur sincere
[1mTopic #1: [0msocial relation life student level
[1mTopic #2: [0mlead use procedure message production

[96m[1mpower[0m
[1mTopic #0: [0malong mean finally task draw
[1mTopic #1: [0magency one liability italian service
[1mTopic #2: [0malexandre form put separately tyranny

[96m[1mself-direction[0m
[1mTopic #0: [0mattractive technology increasingly use proper
[1mTopic #1: [0mlate quite specific toward katanga
[1mTopic #2: [0mquestion public invoke situation develop

[96m[1mbenevolence[0m
[1mTopic #0: [0mpower

In [107]:
# Sum up sub topics
W_train_norm_list = []
for W in W_list:
    W_train_cumul = cumulate_W(W, n_topics=3)
    W_train_norm = normalize_W(W_train_cumul)
    W_train_norm_list.append(W_train_norm)
W_train_norm = np.asarray(W_train_norm_list).T[0]

  


In [108]:
interact(print_train_results, doc_topic=fixed(W_train_norm), doc = (0, len(W_train_norm)-1, 1), corpus=fixed(corpus), data=fixed(data))

<function __main__.print_train_results>

In [10]:
df = export_to_excel(W_train_norm, corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,Critical thinking \n Sculpture of Socrates \n ...,66.123691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Environmental justice \n This article has mult...,98.831153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Natural resource \n ""Primary resource"" redirec...",99.580788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ceasefire \n ""Truce"" redirects here For other ...",99.969178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,International community \n The \n internationa...,99.999451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df = export_to_csv(W_train_norm, corpus, filepath = 'output.csv')
df.head()

In [10]:
pickle.dump( [nmf_list, tfidf_vectorizer], open( "nmf2_pretrained_pruned_lem2.p", "wb" ) )
#pickle.dump( [nmf_list, tfidf_vectorizer], open( "nmf2_pretrained_pruned_brown.p", "wb" ) )

## Evaluating Different Documents

To evaluate your documents, simply append them to _docs list_ as a whole string.

Two example documents.

In [12]:
test_corpus = []
f = open("pope.txt", "r") #Pope ted talk, https://www.ted.com/speakers/pope_francis
pope = f.read()
test_corpus.append(pope)
f.close()

f = open("dod.txt", "r")  # US Department of Defense, https://www.defense.gov/About/
dod = f.read()
test_corpus.append(dod)
f.close()

In [13]:
test_corpusPP = preprocess_corpus(test_corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1857
Cleaning process: Initial size of tokens = 1857
Reduction due to punctuations and stopwords = 1332.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1332
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

In [14]:
print("Extracting tf-idf features for NMF...")
t0 = time()
tfidf_test = tfidf_vectorizer.transform(test_corpusPP)
n_features = tfidf_test.shape[1]
print("done in %0.2fs." % (time() - t0))

W_test_list = []
for i, nmf in enumerate(nmf_list):
    print("Fitting NMF for " + str(categories[i]))
    W_test = evaluate_docs(test_corpusPP, nmf, tfidf_test, betaloss = 'kullback-leibler')
    W_test_list.append(W_test)

Extracting tf-idf features for NMF...
done in 0.00s.
Fitting NMF for universalism
Fitting NMF for hedonism
Fitting NMF for achievement
Fitting NMF for power
Fitting NMF for self-direction
Fitting NMF for benevolence
Fitting NMF for conformity
Fitting NMF for tradition
Fitting NMF for stimulation
Fitting NMF for security


In [15]:
# Sum up sub topics
W_test_norm_list = []
for W in W_test_list:
    W_test_cumul = cumulate_W(W, n_topics=3)
    W_test_norm = normalize_W(W_test_cumul)
    W_test_norm_list.append(W_test_norm)
W_test_norm = np.asarray(W_test_norm_list).T[0]

In [16]:
interact(print_test_results, doc_topic=fixed(W_test_norm), doc = (0, len(W_test_norm)-1, 1), corpus=fixed(test_corpus))

<function __main__.print_test_results>

In [17]:
df = export_to_excel(W_test_norm, test_corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,"Good evening â€“ or, good morning, I am not su...",26.917947,53.195741,21.905193,33.749939,14.305954,65.909071,76.491778,38.390092,32.039518,36.997533
1,\nOn behalf of the Secretary of Defense and De...,75.736422,5.911529,40.480161,76.072999,61.269648,0.049261,1.888345,0.001672,58.705763,79.90054


In [21]:
df = export_to_csv(W_test_norm, test_corpus, filepath = 'output.csv')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,"Good evening â€“ or, good morning, I am not su...",3.210163,7.430994,4.855774,6.453324,0.137821,20.459419,27.277332,9.677749,3.74642,5.519147,11.231858
1,\nOn behalf of the Secretary of Defense and De...,19.748271,0.946942,8.978493,16.955744,13.825183,0.004017,2.1e-05,1.3e-05,12.855169,26.148252,0.537895


In [70]:
categories=['universalism', 'hedonism', 'achievement', 'power',
       'self-direction', 'benevolence', 'conformity', 'tradition', 'stimulation',
       'security']

schwartz =['universalism', 'benevolence', 'conformity', 'tradition',
       'security', 'power', 'achievement', 'hedonism', 'stimulation',
       'self-direction']

In [82]:
feature_names = tfidf_vectorizer.get_feature_names()

cumul = 0
word_list = []

theme_counts = data.groupby(['theme.id','theme']).count().iloc[:,1]
pd_theme_counts = pd.DataFrame(theme_counts)

for i, row in pd.DataFrame(theme_counts).iterrows():
    tmp_list = []
    tfidf_avg = np.average(tfidf[cumul:cumul+int(row['document.id'])].toarray(), axis=0)
    cumul += int(row['document.id'])
    
    for idx in list(reversed(tfidf_avg.argsort())):
        tmp_list.append((feature_names[idx], np.round(tfidf_avg[idx], 4)))
        
    word_list.append(tmp_list)

schwartz_word_score = []
for sch in schwartz:
    schwartz_word_score.append(word_list[categories.index(sch)])

df_list = []
for i, a in enumerate(schwartz_word_score):
    df_list.append(pd.DataFrame(a, columns=[schwartz[i]+" - word", schwartz[i]+" - score"]))
score_df = pd.concat(df_list, axis=1)
score_df

Unnamed: 0,universalism - word,universalism - score,benevolence - word,benevolence - score,conformity - word,conformity - score,tradition - word,tradition - score,security - word,security - score,power - word,power - score,achievement - word,achievement - score,hedonism - word,hedonism - score,stimulation - word,stimulation - score,self-direction - word,self-direction - score
0,environment,0.0701,moral,0.0810,god,0.0843,virtu,0.1417,reciproc,0.1349,power,0.1135,capit,0.1522,pleasur,0.0727,travel,0.1690,creativ,0.0880
1,peac,0.0590,good,0.0739,command,0.0582,temper,0.0985,secur,0.1242,author,0.1116,statu,0.0813,happi,0.0631,sport,0.1499,independ,0.0648
2,ecolog,0.0542,ethic,0.0622,cultur,0.0511,humil,0.0921,social,0.0613,social,0.0481,social,0.0796,emot,0.0525,adventur,0.1471,intellig,0.0532
3,social,0.0448,truth,0.0485,parent,0.0477,tradit,0.0705,norm,0.0578,domin,0.0468,need,0.0458,pain,0.0407,tourism,0.1462,ye,0.0518
4,right,0.0425,god,0.0410,disciplin,0.0466,moral,0.0562,clean,0.0543,revolut,0.0414,human,0.0426,hedon,0.0402,explor,0.1033,invent,0.0468
5,human,0.0338,evil,0.0406,group,0.0463,sophrosyn,0.0547,contamin,0.0535,control,0.0405,individu,0.0424,person,0.0385,stimul,0.1031,innov,0.0386
6,intern,0.0332,forgiv,0.0393,polit,0.0450,christian,0.0527,risk,0.0449,polit,0.0379,manag,0.0413,psycholog,0.0350,genr,0.0664,territori,0.0376
7,war,0.0314,one,0.0354,behavior,0.0433,myth,0.0469,pollut,0.0354,state,0.0365,intellectu,0.0382,one,0.0342,fiction,0.0620,govern,0.0372
8,natur,0.0307,theori,0.0351,norm,0.0374,charact,0.0457,hygien,0.0352,collaps,0.0363,person,0.0381,feel,0.0321,game,0.0467,process,0.0368
9,resourc,0.0292,valu,0.0342,use,0.0369,one,0.0419,wast,0.0346,time,0.0363,societi,0.0379,love,0.0303,stori,0.0464,idea,0.0356


In [72]:
score_df.to_excel("wiki_tfidf_average.xlsx")