One vs All Method

Train NMF for each topic separately.

Use all Wiki articles as Background Corpus.

In [1]:
import pandas as pd
import numpy as np
from time import time

import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

from ipywidgets import interact, fixed

import pickle

import libs.text_preprocess as tp

import warnings
warnings.filterwarnings("ignore")



In [2]:
# Theme labels in the order used to index the per-theme models and the
# columns of the exported document-topic tables.
categories = [
    'universalism',
    'hedonism',
    'achievement',
    'power',
    'self-direction',
    'benevolence',
    'conformity',
    'tradition',
    'stimulation',
    'security',
]

# The same ten themes in the order used when laying out the per-theme
# word/score columns in get_pretrained_words.
schwartz = [
    'universalism',
    'benevolence',
    'conformity',
    'tradition',
    'security',
    'power',
    'achievement',
    'hedonism',
    'stimulation',
    'self-direction',
]

## Plots and Prints

In [3]:
# Theme labels, re-declared in this cell so the plotting and printing helpers
# below can be run on their own. The order matches the per-theme model order.
categories = [
    'universalism',
    'hedonism',
    'achievement',
    'power',
    'self-direction',
    'benevolence',
    'conformity',
    'tradition',
    'stimulation',
    'security',
]

def plot_radar_chart(doc_topic_cumul, doc):
    """Draw a radar (spider) chart of one document's theme scores.

    Parameters
    ----------
    doc_topic_cumul : indexable by document, then by theme position
        Per-document theme scores; the theme position is the label's index
        in the module-level ``categories`` list.
        Presumably percentages (the radial axis is fixed to 0-100) -- confirm.
    doc : index of the document to plot; it is also shown in the title.
    """
    # Order in which the themes are placed around the circle.
    axis_labels = ['universalism', 'benevolence', 'conformity', 'tradition',
                   'security', 'power', 'achievement', 'hedonism', 'stimulation',
                   'self-direction']

    # Reorder the document's scores from `categories` order to axis order.
    scores = [doc_topic_cumul[doc][categories.index(label)]
              for label in axis_labels]

    n_axes = len(axis_labels)

    # One evenly spaced angle per axis; the closed variant repeats the first
    # angle so the plotted outline returns to its starting point.
    axis_angles = [n / float(n_axes) * 2 * pi for n in range(n_axes)]
    closed_angles = axis_angles + axis_angles[:1]

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111, polar=True)

    # Put the first axis at the top and proceed clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per theme.
    plt.xticks(axis_angles, axis_labels)

    # Radial grid labels and fixed radial range.
    ax.set_rlabel_position(0)
    plt.yticks([25, 50, 75], ["25", "50", "75"], color="grey", size=7)
    plt.ylim(0, 100)

    # Single outline for this document, lightly filled.
    closed_scores = scores + scores[:1]
    ax.plot(closed_angles, closed_scores, linewidth=1, linestyle='solid')
    ax.fill(closed_angles, closed_scores, 'b', alpha=0.1)

    plt.title("Schwartz Chart - Doc " + str(doc))
    plt.show()
    
    
class color:
    """ANSI escape sequences used to style printed text."""

    # Resets all styling.
    END = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Foreground colours.
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted terms of one theme's model components.

    Parameters
    ----------
    model : sequence of fitted models
        ``model[theme].components_`` is read; one row per component.
    theme : int
        Index into both ``model`` and the module-level ``categories`` list.
    tfidf_vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``.
    n_top_words : int
        Number of terms printed per component.
    n_topics : int, default 3
        Number of leading components to print; later components are skipped.
        ``0`` prints only the theme heading.
    """
    # scikit-learn 1.2 removed get_feature_names(); use its replacement when
    # available and fall back for older installations.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_):
        # Plain integer comparison: no float division, and no
        # ZeroDivisionError when n_topics is 0.
        if topic_idx == n_topics:
            break
        # argsort is ascending, so walk it backwards for the largest weights.
        top_terms = [feature_names[i]
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        message += " - ".join(top_terms)
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    """Print the ``n_best`` strongest themes of a training document.

    The line starts with the document index and the theme recorded for it in
    ``data`` (column ``'theme'``), followed by one
    ``(label, index, score)`` entry per theme in descending score order.

    Parameters
    ----------
    data : DataFrame with a ``'theme'`` column, addressed positionally by ``doc``.
    doc_topic : indexable by document; each entry supports ``argsort``.
    doc : int position of the document.
    n_best : int number of themes to list.
    """
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED + " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Indices beyond the named themes are shown as "General". Only the
        # out-of-range lookup is caught so other errors are not hidden.
        try:
            label = categories[i]
        except IndexError:
            label = "General"
        print("(" + color.CYAN + color.BOLD + label + color.END
              + ", %d, %.2f)  " % (i, dt[i]), end='')
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    """Print the ``n_best`` strongest themes of a document.

    The line starts with the document index, followed by one
    ``(label, index, score)`` entry per theme in descending score order.

    Parameters
    ----------
    doc_topic : indexable by document; each entry supports ``argsort``.
    doc : index of the document.
    n_best : int number of themes to list.
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Indices beyond the named themes are shown as "General". Only the
        # out-of-range lookup is caught so other errors are not hidden.
        try:
            label = categories[i]
        except IndexError:
            label = "General"
        print("(" + color.CYAN + color.BOLD + label + color.END
              + ", %d, %.2f)  " % (i, dt[i]), end='')
    print()

def print_doc_topics(doc_topic, doc, n_best, n_topics=3):
    """Print the ``n_best`` strongest individual topics of a document.

    Each entry is ``(theme label, topic index, score)``. The theme label is
    looked up as ``categories[index // n_topics]``.
    NOTE(review): this assumes the topic columns are laid out as consecutive
    groups of ``n_topics`` per theme -- confirm against the caller.

    Parameters
    ----------
    doc_topic : indexable by document; each entry supports ``argsort``.
    doc : index of the document.
    n_best : int number of topics to list.
    n_topics : int, default 3
        Topics per theme. The default reproduces the previously hard-coded
        value.
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Topic groups beyond the named themes are shown as "General". Only
        # the out-of-range lookup is caught so other errors are not hidden.
        try:
            label = categories[i // n_topics]
        except IndexError:
            label = "General"
        print("(" + color.CYAN + color.BOLD + label + color.END
              + ", %d, %.2f)  " % (i, dt[i]), end='')
    print()

def print_train_results(doc_topic, doc, corpus, data):
    """Report a training document: text excerpt, theme ranking and radar chart.

    Parameters
    ----------
    doc_topic : per-document theme scores, passed on to the ranking printer
        and to ``plot_radar_chart``.
    doc : index of the document in ``corpus`` / ``doc_topic`` / ``data``.
    corpus : sequence of document strings.
    data : DataFrame holding the document's recorded theme.
    """
    print("".join([color.BOLD, "Document ", str(doc), color.END]))
    print()

    print("".join([color.BOLD, "Text: ", color.END]))
    # Show up to 500 characters starting one third of the way into the text.
    text = corpus[doc]
    start = len(text) // 3
    print("..." + text[start:start + 500] + "...")
    print("\n")

    print("".join([color.BOLD, "Topic Distribution: ", color.END]))
    # 11 entries requested for the ranking.
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
def print_test_results(doc_topic, doc, corpus):
    """Report a document: text excerpt, theme ranking and radar chart.

    Parameters
    ----------
    doc_topic : per-document theme scores, passed on to the ranking printer
        and to ``plot_radar_chart``.
    doc : index of the document in ``corpus`` / ``doc_topic``.
    corpus : sequence of document strings.
    """
    print("".join([color.BOLD, "Document ", str(doc), color.END]))
    print()

    print("".join([color.BOLD, "Text: ", color.END]))
    # Show up to 500 characters starting one third of the way into the text.
    text = corpus[doc]
    start = len(text) // 3
    print("..." + text[start:start + 500] + "...")
    print("\n")

    print("".join([color.BOLD, "Topic Distribution: ", color.END]))
    # 11 entries requested for the ranking.
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
    

## Helper Functions

In [4]:
def cumulate_W(W, n_topics):
    """Sum every run of ``n_topics`` consecutive columns of ``W``.

    Parameters
    ----------
    W : 2-D numpy array, one row per document.
    n_topics : int width of each column group.

    Returns
    -------
    numpy array with one row per row of ``W`` and
    ``W.shape[1] // n_topics`` columns. Columns left over after the last
    complete group are ignored.
    """
    n_groups = W.shape[1] // n_topics
    grouped = [
        [row[g * n_topics:(g + 1) * n_topics].sum() for g in range(n_groups)]
        for row in W
    ]
    return np.asarray(grouped)

def normalize_W(W):
    """Rescale every row of ``W`` so that it sums to 100.

    Parameters
    ----------
    W : 2-D numpy array.

    Returns
    -------
    A new float array of the same shape; the input is not modified.
    A row whose sum is zero is returned as all zeros instead of the NaN
    values a plain 0/0 division would produce.
    """
    W = np.asarray(W, dtype=float)
    row_sums = W.sum(axis=1).reshape(W.shape[0], 1)
    # Divide only where the row sum is non-zero; other entries keep the
    # zero they were initialised with in `out`.
    W_cumul_norm = np.divide(W, row_sums, out=np.zeros_like(W),
                             where=row_sums != 0)
    W_cumul_norm *= 100

    return W_cumul_norm

def export_to_excel(W, docs, filepath):
    """Write theme scores plus document text to an Excel file.

    Parameters
    ----------
    W : cumulated score matrix, one row per document and one column per
        entry of the module-level ``categories`` list.
    docs : document texts, stored in a ``'Text'`` column.
    filepath : destination path; use an ``xlsx`` extension.

    Returns
    -------
    The exported DataFrame, with ``'Text'`` as its first column.
    """
    table = pd.DataFrame(data=W, index=range(len(W)), columns=categories)
    table['Text'] = docs

    # Rotate the column that was appended last to the front.
    ordered = table.columns.tolist()
    ordered.insert(0, ordered.pop())
    table = table[ordered]

    table.to_excel(filepath)
    return table

def export_to_csv(W, docs, filepath):
    """Write theme scores plus document text to a CSV file.

    Parameters
    ----------
    W : cumulated score matrix, one row per document and one column per
        entry of the module-level ``categories`` list.
    docs : document texts, stored in a ``'Text'`` column.
    filepath : destination path; use a ``csv`` extension.

    Returns
    -------
    The exported DataFrame, with ``'Text'`` as its first column.
    """
    table = pd.DataFrame(data=W, index=range(len(W)), columns=categories)
    table['Text'] = docs

    # Rotate the column that was appended last to the front.
    ordered = table.columns.tolist()
    ordered.insert(0, ordered.pop())
    table = table[ordered]

    table.to_csv(filepath)
    return table

## Main Functions

In [5]:
# Splits a sequence into k contiguous partitions of similar total "height".
# Adapted from https://stackoverflow.com/a/35518205
def partition_list2(a, k):
    """Split ``a`` into ``k`` contiguous partitions with balanced column-0 sums.

    Boundaries start evenly spaced. On each pass the partition whose
    column-0 sum ("height") is furthest from the average is found and one of
    its boundaries is shifted by a single row. The search stops when the
    worst partition matches the average exactly, after more than five
    consecutive passes without improvement, or after more than 100 passes.
    The best partitioning seen is returned.

    Parameters
    ----------
    a : 2-D numpy array; column 0 holds the heights being balanced.
    k : int requested number of partitions.

    Returns
    -------
    list of contiguous row slices of ``a``. ``k <= 1`` yields ``[a]``;
    ``k >= len(a)`` yields one single-row slice per row.
    """
    if k <= 1:
        return [a]
    if k >= len(a):
        # Slice instead of wrapping each row in a list, so every partition
        # stays a 2-D slice like those returned by the main path and callers
        # can index ``part[:, col]`` uniformly.
        return [a[i:i + 1] for i in range(len(a))]

    partition_between = [(i+1)*len(a) // k for i in range(k-1)]
    average_height = float(sum(a[:,0]))/k
    best_score = None
    best_partitions = None
    no_improvements_count = 0
    count = 0

    while True:
        starts = [0] + partition_between
        ends = partition_between + [len(a)]
        partitions = [a[starts[i]:ends[i]] for i in range(k)]
        heights = [np.sum(p[:,0]) for p in partitions]

        # Partition that deviates most from the average height.
        abs_height_diffs = [abs(average_height - h) for h in heights]
        worst_partition_index = abs_height_diffs.index(max(abs_height_diffs))
        # Negative: the worst partition is too tall; positive: too short.
        worst_height_diff = average_height - heights[worst_partition_index]

        if best_score is None or abs(worst_height_diff) < best_score:
            best_score = abs(worst_height_diff)
            best_partitions = partitions
            no_improvements_count = 0
        else:
            no_improvements_count += 1

        if worst_height_diff == 0 or no_improvements_count > 5 or count > 100:
            return best_partitions
        count += 1

        # Shrink (-1) a too-tall partition, grow (+1) a too-short one.
        move = -1 if worst_height_diff < 0 else 1
        # Which boundary to shift: the end partitions have only one; an
        # inner partition picks its left or right boundary depending on
        # whether it is too tall and which neighbour is the taller one.
        bound_to_move = 0 if worst_partition_index == 0\
                        else k-2 if worst_partition_index == k-1\
                        else worst_partition_index-1 if (worst_height_diff < 0) ^ (heights[worst_partition_index-1] > heights[worst_partition_index+1])\
                        else worst_partition_index
        # A left-hand boundary moves the opposite way to a right-hand one
        # for the same grow/shrink decision.
        direction = -1 if bound_to_move < worst_partition_index else 1
        partition_between[bound_to_move] += move * direction

def print_best_partition(a, k):
    """Print the partitioning of ``a`` into ``k`` parts and each part's height.

    Parameters
    ----------
    a : 2-D numpy array accepted by ``partition_list2``; column 0 holds the
        heights.
    k : int requested number of partitions.
    """
    print('Partitioning {0} into {1} partitions'.format(a, k))
    # `partition_list2` is the only partitioning routine defined in this
    # file; the previously referenced `partition_list` does not exist here.
    p = partition_list2(a, k)
    # Height = sum of column 0, the same measure partition_list2 balances.
    heights = [np.asarray(part)[:, 0].sum() for part in p]
    print('The best partitioning is {0}\n    With heights {1}\n'.format(p, heights))
    
def initialize_H3(X, theme_counts, n_topics, p):
    """Build one initial H (component x feature) matrix per theme.

    Parameters
    ----------
    X : 2-D dense array, one row per document and one column per feature.
    theme_counts : iterable of ints, the number of documents in each theme.
        Theme ``t`` is taken to occupy the next ``theme_counts[t]``
        consecutive rows of ``X`` -- presumably the rows are already grouped
        by theme in this order; verify against the caller.
    n_topics : int number of theme-specific components per theme.
    p : int; the ``p * n_topics`` documents with the most non-zero features
        are kept per theme, and ``int(p // 2)`` documents per theme feed the
        background row.

    Returns
    -------
    list with one array of shape ``(n_topics + 1, X.shape[1])`` per theme.
    Rows ``0 .. n_topics-1`` are averages of that theme's document
    partitions; the last row is a background average over all themes.
    """
    # Number of strictly positive features per document, paired with the
    # document's row index as (count, row_index).
    X_sum = list(np.sum(X>0, axis=1))
    X_sum = [(x, i) for i, x in enumerate(X_sum)]
    #X_sum.sort(reverse=True)
    #X_sum = np.array(X_sum)

    X_sum_list = []
    X_parts_list = []

    tc_sum = 0
    for tc in theme_counts:
        # This theme's (count, row_index) pairs, largest counts first,
        # truncated to the top p*n_topics documents.
        X_sum_list.append(X_sum[tc_sum:tc+tc_sum])
        X_sum_list[-1].sort(reverse=True)
        X_sum_list[-1] = np.array(X_sum_list[-1][:p*n_topics])
        # Split the selected documents into n_topics groups balanced on the
        # count column.
        X_parts_list.append(partition_list2(X_sum_list[-1], n_topics))


        tc_sum += tc

    H_list = []
    # NOTE(review): tc_sum is accumulated in the loop below but only read by
    # the commented-out lines.
    tc_sum = 0
    for i, tc in enumerate(theme_counts):
        H = np.zeros((n_topics+1, X.shape[1]))

        for j in range(0, n_topics):
            # Row j = mean feature vector of the documents in partition j of
            # theme i (column 1 of a partition holds the row indices). The
            # modulo wraps around when there are fewer partitions than topics.
            H[j] = np.average(X[X_parts_list[i][j%len(X_parts_list[i])][:, 1]], axis=0)
            #H[j][np.where(H[j]==0)] += np.average(X[tc_sum:tc+tc_sum], axis=0)[np.where(H[j]==0)]

        # Single iteration (j == n_topics): the background row.
        for j in range(n_topics, n_topics+1):
            bckg_parts = []
            for k in range(len(theme_counts)):
                # (j-n_topics) is 0 here, so this takes the first int(p//2)
                # row indices of partition 0 of every theme.
                bckg_parts.extend(X_parts_list[k][(j-n_topics)%len(X_parts_list[k])][:int(p//2), 1])
            # The selection does not depend on i, so every theme receives
            # the same background row.
            H[j] = np.average(X[bckg_parts], axis=0)
            #H[j][np.where(H[j]==0)] += np.average(X, axis=0)[np.where(H[j]==0)]
        tc_sum += tc
            

        H_list.append(H)
    return H_list


In [6]:
def read_data(filepath):
    """Load the corpus JSON file into a DataFrame.

    Rows whose ``'text'`` is the empty string are dropped and the remaining
    rows are ordered by the ``'theme.id'`` column.

    Parameters
    ----------
    filepath : path (or other source accepted by ``pandas.read_json``).

    Returns
    -------
    The filtered, sorted DataFrame (original index labels are kept).
    """
    frame = pd.read_json(filepath)
    has_text = frame['text'] != ""
    return frame[has_text].sort_values('theme.id')
    
def extract_corpus(data):
    """Return the values of the ``'text'`` column of ``data`` as a list."""
    return [*data['text']]

def preprocess_corpus(corpus):
    """Turn each document into a string of stems repeated by frequency.

    For every document ``extract_terms`` is called with stemming enabled.
    Each entry of its ``'Stem'`` field, followed by a space, is repeated
    ``'TF'`` times, and the resulting pieces are joined with single spaces.
    NOTE(review): presumably ``extract_terms`` returns a table of stems and
    their term frequencies -- confirm against the omterms documentation.

    Parameters
    ----------
    corpus : iterable of document strings.

    Returns
    -------
    list of preprocessed strings, one per document, in input order.
    """
    PPcorpus = []
    for doc in corpus:
        # Run the term extraction once per document and reuse the result for
        # both the stems and their frequencies.
        terms = extract_terms(doc, extra_process=['stem'])
        repeated_stems = (terms['Stem'] + ' ') * terms['TF']
        PPcorpus.append(' '.join(list(repeated_stems)))
    return PPcorpus

def train_corpus(corpus, data, brown_corpus, n_topics=3, betaloss = 'kullback-leibler', bckg_brown = False):
    """Fit one NMF model per theme with custom W/H initialisation.

    Each model has ``n_topics + 1`` components: ``n_topics`` theme-specific
    ones plus one extra (last) component shared by all documents.

    Parameters
    ----------
    corpus : list of preprocessed document strings.
    data : DataFrame with ``'theme.id'`` and ``'theme'`` columns, used to
        count the documents per theme.
        Presumably row-aligned with ``corpus`` and sorted by theme -- confirm.
    brown_corpus : list of extra background document strings; only used when
        ``bckg_brown`` is true.
    n_topics : int number of theme-specific components per model.
    betaloss : value passed to ``NMF(beta_loss=...)``.
    bckg_brown : bool; when true ``brown_corpus`` is appended to ``corpus``
        before vectorising.

    Returns
    -------
    (nmf_list, W_list, tfidf, tfidf_vectorizer): the fitted models, the
    W arrays that were passed in as initialisation, the document-term
    matrix and the fitted vectorizer.
    """
    N = len(data)
    
    # Per (theme.id, theme) group: the non-null count of the second remaining
    # column, used below as the number of documents in each theme.
    theme_counts = data.groupby(['theme.id','theme']).count().iloc[:,1]
    # NOTE(review): pd_theme_counts, n_top_words and n_components are
    # assigned but never read in this function.
    pd_theme_counts = pd.DataFrame(theme_counts)
    n_themes = len(theme_counts)
    
    n_top_words = 5
    n_components = n_topics*(n_themes)
    
    
    print("Extracting tf-idf features for NMF...")
    # Despite the variable name and the message above, raw term counts
    # (CountVectorizer) are used; the TF-IDF variant is commented out.
    #tfidf_vectorizer= TfidfVectorizer(min_df=1, ngram_range=(1,3), max_features=50000)
    tfidf_vectorizer= CountVectorizer(min_df=1, ngram_range=(1,3), max_features=50000)
    t0 = time()
    
    W_list = []
    
    if bckg_brown:
        tfidf = tfidf_vectorizer.fit_transform(corpus+brown_corpus)
        tc_sum = 0
        for tc in theme_counts:
            # Rows after the first N belong to the background documents:
            # only their last column is randomised. The current theme's rows
            # are randomised in every column; all other rows stay zero.
            W = np.zeros((N+len(brown_corpus),n_topics+1))
            W[N:, n_topics:] = np.random.random((len(brown_corpus),1))
            W[tc_sum:tc_sum+tc, :] = np.random.random((tc,n_topics+1))

            tc_sum += tc
            W_list.append(W)
    else:
        tfidf = tfidf_vectorizer.fit_transform(corpus)
        tc_sum = 0
        for tc in theme_counts:
            # Last column randomised for every document; the first n_topics
            # columns are randomised only for the current theme's rows and
            # left at zero elsewhere.
            # NOTE(review): presumably relies on the multiplicative-update
            # solver keeping zero entries at zero -- confirm.
            W = np.zeros((N,n_topics+1))
            W[:, n_topics:] = np.random.random((N,1))
            W[tc_sum:tc_sum+tc, :n_topics] = np.random.random((tc,n_topics))

            tc_sum += tc
            W_list.append(W)
        
    n_features = tfidf.shape[1]
    print(n_features)
    print("done in %0.2fs." % (time() - t0))
    
    X = tfidf 
    nmf_list = []
    # The H initialisation needs a dense matrix; p=20 is passed through to
    # initialize_H3.
    H_list = initialize_H3(X.toarray(), theme_counts, n_topics, p=20)

    for i, W in enumerate(W_list):
        # theme_counts is indexed by (theme.id, theme); element [1] is the name.
        print("Fitting NMF for " + str(theme_counts.index[i][1]))
        t0 = time()
        H = H_list[i]
        #H = np.random.rand(n_topics+1, n_features)

        # NOTE(review): the `alpha` keyword is not accepted by recent
        # scikit-learn releases -- confirm the installed version.
        nmf = NMF(n_components= n_topics+1, solver='mu', beta_loss=betaloss,
                  alpha=.1, l1_ratio=.5, init = 'custom')

        # The transformed matrix returned here is discarded; only the fitted
        # model is kept.
        nmf.fit_transform(X=X,W=W,H=H)
        print("done in %0.2fs." % (time() - t0))

        nmf_list.append(nmf)
    
    
    return nmf_list, W_list, tfidf, tfidf_vectorizer
    
def get_pretrained_words(pre_nmf_list, pre_tfidf_vectorizer, word_count, normalized=False, anti=0):
    """Tabulate the top-weighted terms of every theme topic.

    Parameters
    ----------
    pre_nmf_list : sequence of fitted models, one per entry of the
        module-level ``categories`` list and in that order. The last
        component of each model is skipped; only the first
        ``components_.shape[0] - 1`` components are reported.
    pre_tfidf_vectorizer : fitted vectorizer exposing
        ``get_feature_names_out()`` or the older ``get_feature_names()``.
    word_count : int number of terms per topic; a negative value keeps all
        terms.
    normalized : bool; when true each component row is divided by its sum
        before scores are read.
    anti : unused; kept so existing callers keep working.

    Returns
    -------
    DataFrame with a ``"<theme> (<topic>) - word"`` and a
    ``"<theme> (<topic>) - score"`` column per topic, themes ordered as in
    the module-level ``schwartz`` list. Scores are rounded to 3 decimals.
    """
    n_topics = pre_nmf_list[0].components_.shape[0]-1

    # scikit-learn 1.2 removed get_feature_names(); use its replacement when
    # available and fall back for older installations.
    if hasattr(pre_tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = pre_tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = pre_tfidf_vectorizer.get_feature_names()

    # word_list[n_topics * theme + nt] holds the (term, score) pairs of
    # topic nt of theme `theme`, themes in `categories` order.
    word_list = []
    for theme in range(len(categories)):
        components = pre_nmf_list[theme].components_
        if normalized:
            # Row-normalise only when requested.
            components = components / np.sum(components, axis=1)[:, np.newaxis]
        for nt in range(n_topics):
            word_topic = components[nt]
            # Feature indices from highest to lowest weight.
            ranked = word_topic.argsort()[::-1]
            if word_count >= 0:
                ranked = ranked[:word_count]
            word_list.append([(feature_names[idx], np.round(word_topic[idx], 3))
                              for idx in ranked])

    # Reorder the per-topic lists from `categories` order to `schwartz` order.
    schwartz_word_score = []
    for sch in schwartz:
        for nt in range(n_topics):
            schwartz_word_score.append(word_list[n_topics*categories.index(sch)+nt])

    df_list = []
    for i, pairs in enumerate(schwartz_word_score):
        label = schwartz[i//n_topics] + " (" + str(i%n_topics) + ")"
        df_list.append(pd.DataFrame(pairs, columns=[label + " - word",
                                                    label + " - score"]))
    score_df = pd.concat(df_list, axis=1)

    return score_df

def export_pretrained_excel(pre_nmf_list, pre_tfidf_vectorizer, filepath, word_count=-1, anti=0):
    """Write the per-topic top-term table to an Excel file.

    Parameters
    ----------
    pre_nmf_list, pre_tfidf_vectorizer : passed to ``get_pretrained_words``.
    filepath : destination path of the Excel file.
    word_count : int number of terms per topic; the default ``-1`` keeps all.
    anti : forwarded to ``get_pretrained_words`` under the same name.
    """
    # Pass `anti` by keyword: positionally it would land in the
    # `normalized` parameter of get_pretrained_words.
    df = get_pretrained_words(pre_nmf_list, pre_tfidf_vectorizer, word_count, anti=anti)
    df.to_excel(filepath)

## Training Model

In [7]:
# Corpus source:
# https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# Either schwartz.json or pruned_schwartz.json can be used here.
filepath = 'pruned_schwartz.json'

data = read_data(filepath)

# Alternative preprocessing route (not used):
# corpus = extract_corpus(data)
# corpusPP = preprocess_corpus(corpus)

# Clean every document text with the project's text_preprocess helper.
corpusPP = list(data['text'].apply(tp.clean_text))

Fix bad wording:  0.009974241256713867 s
Tokenize:  0.009972333908081055 s
Remove stopwords and Lemmatize:  2.500488042831421 s

Fix bad wording:  0.004986763000488281 s
Tokenize:  0.008976459503173828 s
Remove stopwords and Lemmatize:  0.029918909072875977 s

Fix bad wording:  0.004986763000488281 s
Tokenize:  0.006981611251831055 s
Remove stopwords and Lemmatize:  0.013963460922241211 s

Fix bad wording:  0.005983114242553711 s
Tokenize:  0.021940946578979492 s
Remove stopwords and Lemmatize:  0.05884361267089844 s

Fix bad wording:  0.003989458084106445 s
Tokenize:  0.0059850215911865234 s
Remove stopwords and Lemmatize:  0.011965751647949219 s

Fix bad wording:  0.00997304916381836 s
Tokenize:  0.02693009376525879 s
Remove stopwords and Lemmatize:  0.07181620597839355 s

Fix bad wording:  0.001993417739868164 s
Tokenize:  0.003989219665527344 s
Remove stopwords and Lemmatize:  0.00997304916381836 s

Fix bad wording:  0.0 s
Tokenize:  0.0009965896606445312 s
Remove stopwords and Lem

Remove stopwords and Lemmatize:  0.027925968170166016 s

Fix bad wording:  0.000997781753540039 s
Tokenize:  0.0009965896606445312 s
Remove stopwords and Lemmatize:  0.0019943714141845703 s

Fix bad wording:  0.0009970664978027344 s
Tokenize:  0.002991914749145508 s
Remove stopwords and Lemmatize:  0.00997471809387207 s

Fix bad wording:  0.002991199493408203 s
Tokenize:  0.005983829498291016 s
Remove stopwords and Lemmatize:  0.01795220375061035 s

Fix bad wording:  0.0139617919921875 s
Tokenize:  0.023941516876220703 s
Remove stopwords and Lemmatize:  0.07978677749633789 s

Fix bad wording:  0.0069811344146728516 s
Tokenize:  0.011968374252319336 s
Remove stopwords and Lemmatize:  0.03789830207824707 s

Fix bad wording:  0.008976936340332031 s
Tokenize:  0.014958858489990234 s
Remove stopwords and Lemmatize:  0.03789806365966797 s

Fix bad wording:  0.010972261428833008 s
Tokenize:  0.00997304916381836 s
Remove stopwords and Lemmatize:  0.040891408920288086 s

Fix bad wording:  0.039

Tokenize:  0.0059854984283447266 s
Remove stopwords and Lemmatize:  0.029918193817138672 s

Fix bad wording:  0.0019953250885009766 s
Tokenize:  0.0029990673065185547 s
Remove stopwords and Lemmatize:  0.009966611862182617 s

Fix bad wording:  0.011983394622802734 s
Tokenize:  0.01992964744567871 s
Remove stopwords and Lemmatize:  0.032915592193603516 s

Fix bad wording:  0.008977890014648438 s
Tokenize:  0.02592778205871582 s
Remove stopwords and Lemmatize:  0.09873843193054199 s

Fix bad wording:  0.03989267349243164 s
Tokenize:  0.06781697273254395 s
Remove stopwords and Lemmatize:  0.15658140182495117 s

Fix bad wording:  0.01695561408996582 s
Tokenize:  0.03191733360290527 s
Remove stopwords and Lemmatize:  0.06183195114135742 s

Fix bad wording:  0.002991199493408203 s
Tokenize:  0.003989219665527344 s
Remove stopwords and Lemmatize:  0.021941423416137695 s

Fix bad wording:  0.003989219665527344 s
Tokenize:  0.006982088088989258 s
Remove stopwords and Lemmatize:  0.0179517269134

Remove stopwords and Lemmatize:  0.026927709579467773 s

Fix bad wording:  0.004986763000488281 s
Tokenize:  0.005984783172607422 s
Remove stopwords and Lemmatize:  0.024932861328125 s

Fix bad wording:  0.003989696502685547 s
Tokenize:  0.005983591079711914 s
Remove stopwords and Lemmatize:  0.01496577262878418 s

Fix bad wording:  0.0029859542846679688 s
Tokenize:  0.004987001419067383 s
Remove stopwords and Lemmatize:  0.021941184997558594 s

Fix bad wording:  0.002991914749145508 s
Tokenize:  0.004986763000488281 s
Remove stopwords and Lemmatize:  0.017951250076293945 s

Fix bad wording:  0.005984783172607422 s
Tokenize:  0.011970043182373047 s
Remove stopwords and Lemmatize:  0.03689885139465332 s

Fix bad wording:  0.000997304916381836 s
Tokenize:  0.0019953250885009766 s
Remove stopwords and Lemmatize:  0.00797891616821289 s

Fix bad wording:  0.01496124267578125 s
Tokenize:  0.02393507957458496 s
Remove stopwords and Lemmatize:  0.07280731201171875 s

Fix bad wording:  0.000997

Remove stopwords and Lemmatize:  0.05485343933105469 s

Fix bad wording:  0.0029888153076171875 s
Tokenize:  0.006981849670410156 s
Remove stopwords and Lemmatize:  0.012965679168701172 s

Fix bad wording:  0.003989219665527344 s
Tokenize:  0.005984067916870117 s
Remove stopwords and Lemmatize:  0.016954660415649414 s

Fix bad wording:  0.0049860477447509766 s
Tokenize:  0.01596236228942871 s
Remove stopwords and Lemmatize:  0.027927160263061523 s

Fix bad wording:  0.001995563507080078 s
Tokenize:  0.0039882659912109375 s
Remove stopwords and Lemmatize:  0.016954421997070312 s

Fix bad wording:  0.005984783172607422 s
Tokenize:  0.010970830917358398 s
Remove stopwords and Lemmatize:  0.04288506507873535 s

Fix bad wording:  0.015956640243530273 s
Tokenize:  0.02892327308654785 s
Remove stopwords and Lemmatize:  0.08178162574768066 s

Fix bad wording:  0.0009970664978027344 s
Tokenize:  0.0019948482513427734 s
Remove stopwords and Lemmatize:  0.003989696502685547 s

Fix bad wording:  0

Remove stopwords and Lemmatize:  0.041889190673828125 s

Fix bad wording:  0.020943880081176758 s
Tokenize:  0.03191423416137695 s
Remove stopwords and Lemmatize:  0.0608367919921875 s

Fix bad wording:  0.01296544075012207 s
Tokenize:  0.023936748504638672 s
Remove stopwords and Lemmatize:  0.09674239158630371 s

Fix bad wording:  0.006979703903198242 s
Tokenize:  0.014960765838623047 s
Remove stopwords and Lemmatize:  0.041887521743774414 s

Fix bad wording:  0.0009970664978027344 s
Tokenize:  0.0029921531677246094 s
Remove stopwords and Lemmatize:  0.00598454475402832 s

Fix bad wording:  0.0039882659912109375 s
Tokenize:  0.0069811344146728516 s
Remove stopwords and Lemmatize:  0.02094292640686035 s

Fix bad wording:  0.007979393005371094 s
Tokenize:  0.012965679168701172 s
Remove stopwords and Lemmatize:  0.029919862747192383 s

Fix bad wording:  0.008976459503173828 s
Tokenize:  0.016954421997070312 s
Remove stopwords and Lemmatize:  0.04488039016723633 s

Fix bad wording:  0.004

Remove stopwords and Lemmatize:  0.05186152458190918 s

Fix bad wording:  0.0029921531677246094 s
Tokenize:  0.008976459503173828 s
Remove stopwords and Lemmatize:  0.01296377182006836 s

Fix bad wording:  0.000997304916381836 s
Tokenize:  0.001995086669921875 s
Remove stopwords and Lemmatize:  0.004986763000488281 s

Fix bad wording:  0.000997304916381836 s
Tokenize:  0.0009970664978027344 s
Remove stopwords and Lemmatize:  0.0019960403442382812 s

Fix bad wording:  0.008974790573120117 s
Tokenize:  0.026927709579467773 s
Remove stopwords and Lemmatize:  0.051860809326171875 s

Fix bad wording:  0.0059833526611328125 s
Tokenize:  0.014961481094360352 s
Remove stopwords and Lemmatize:  0.04687380790710449 s

Fix bad wording:  0.003989696502685547 s
Tokenize:  0.010970592498779297 s
Remove stopwords and Lemmatize:  0.020943641662597656 s

Fix bad wording:  0.001995086669921875 s
Tokenize:  0.004987001419067383 s
Remove stopwords and Lemmatize:  0.01197504997253418 s

Fix bad wording:  0

In [None]:
# Rebuild natural-looking text from the tokenised Brown corpus, one string
# per Brown file, then run it through the same preprocessing as the corpus.
mdetok = MosesDetokenizer()

brown_files_sent = []
for fid in brown.fileids():
    # Map the corpus' quote tokens (``, '', `) to plain quotes before
    # detokenising each sentence.
    brown_files_sent.append([
        mdetok.detokenize(
            ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split(),
            return_str=True,
        )
        for sent in brown.sents(fid)
    ])

brown_natural = [' '.join(bfs) for bfs in brown_files_sent]
brown_naturalPP = preprocess_corpus(brown_natural)

In [8]:
# Train one model per theme on the cleaned corpus; no extra background
# documents are supplied (empty list, bckg_brown disabled).
nmf_list, W_list, tfidf, tfidf_vectorizer = train_corpus(
    corpusPP,
    data,
    [],
    n_topics=3,
    betaloss='kullback-leibler',
    bckg_brown=False,
)

Extracting tf-idf features for NMF...
50000
done in 13.28s.
Fitting NMF for universalism
done in 11.24s.
Fitting NMF for hedonism
done in 9.11s.
Fitting NMF for achievement
done in 9.13s.
Fitting NMF for power
done in 5.92s.
Fitting NMF for self-direction
done in 5.56s.
Fitting NMF for benevolence
done in 8.53s.
Fitting NMF for conformity
done in 8.25s.
Fitting NMF for tradition
done in 5.27s.
Fitting NMF for stimulation
done in 5.25s.
Fitting NMF for security
done in 5.26s.


In [13]:
# Show the leading terms of every trained theme model. The models were fitted
# with 3 theme topics plus one extra component, so n_topics=4 prints all four.
print("\nTopics in NMF model:")
# Iterate over the models actually trained instead of a hard-coded count.
for i in range(len(nmf_list)):
    print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=5, n_topics=4)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0menvironmental - movement - state - marriage - social
[1mTopic #1: [0mright - environmental - law - social - human
[1mTopic #2: [0menergy - ecology - peace - use - human
[1mTopic #3: [0mone - social - may - also - use

[96m[1mhedonism[0m
[1mTopic #0: [0mpain - love - orgasm - one - empathy
[1mTopic #1: [0mone - happiness - pleasure - social - desire
[1mTopic #2: [0mmay - one - experience - also - emotion
[1mTopic #3: [0msocial - one - use - state - also

[96m[1machievement[0m
[1mTopic #0: [0msocial - class - capital - society - labour
[1mTopic #1: [0mwork - hour - individual - social - goal
[1mTopic #2: [0mcapital - status - social - human - need
[1mTopic #3: [0mone - social - state - use - also

[96m[1mpower[0m
[1mTopic #0: [0mpower - use - experiment - milgram - make
[1mTopic #1: [0mtime - state - wealth - power - collapse
[1mTopic #2: [0mauthority - power - veto - bill - social
[1m

In [15]:
# Display the ten highest-weighted terms of every theme topic.
get_pretrained_words(
    pre_nmf_list=nmf_list,
    pre_tfidf_vectorizer=tfidf_vectorizer,
    word_count=10,
)

Unnamed: 0,universalism (0) - word,universalism (0) - score,universalism (1) - word,universalism (1) - score,universalism (2) - word,universalism (2) - score,benevolence (0) - word,benevolence (0) - score,benevolence (1) - word,benevolence (1) - score,...,stimulation (1) - word,stimulation (1) - score,stimulation (2) - word,stimulation (2) - score,self-direction (0) - word,self-direction (0) - score,self-direction (1) - word,self-direction (1) - score,self-direction (2) - word,self-direction (2) - score
0,environmental,49.667,right,45.119,energy,36.749,law,33.988,good,22.219,...,tourism,34.625,sport,31.517,creativity,48.524,innovation,20.31,yes,22.895
1,movement,40.511,environmental,40.54,ecology,33.34,truth,29.938,evil,21.605,...,travel,13.601,travel,8.751,play,22.123,idea,15.863,independence,14.82
2,state,39.03,law,33.303,peace,32.374,ethic,26.576,one,19.961,...,million,7.945,adventure,7.758,creative,21.152,unite,14.226,invention,11.506
3,marriage,35.007,social,31.894,use,28.147,forgiveness,26.164,justice,18.05,...,tourist,7.608,exploration,7.689,intelligence,11.612,intelligence,12.747,bully,10.333
4,social,34.964,human,30.453,human,27.457,theory,24.849,pardon,18.045,...,international,7.487,use,6.662,new,10.511,territory,12.589,positive,9.789
5,party,33.618,peace,30.217,think,25.11,good,23.249,lie,17.507,...,country,7.279,include,6.363,process,10.126,state,11.556,task,9.681
6,samesex,33.164,war,28.521,system,24.941,one,21.965,trust,16.348,...,billion,6.202,game,6.12,theory,10.048,new,11.161,individual,9.346
7,green,30.595,state,28.432,one,23.574,natural,19.912,individual,15.752,...,world,5.996,may,6.01,work,9.819,group,11.093,emotion,9.025
8,woman,26.753,use,26.748,social,23.401,may,15.696,moral,15.039,...,destination,5.189,also,5.84,also,9.063,curiosity,10.226,yes yes,8.915
9,right,26.69,specie,26.609,theory,22.065,natural law,14.742,social,14.72,...,unite,5.111,explorer,5.78,state,8.952,music,9.654,performance,8.694


In [None]:
# Export the full term ranking of every theme topic (word_count=-1 keeps all
# terms) to an Excel workbook.
export_pretrained_excel(
    nmf_list,
    tfidf_vectorizer,
    "ssnmf_theme_words_t3_1109.xlsx",
    word_count=-1,
    anti=0,
)

In [10]:
# Persist the trained per-theme models together with the fitted vectorizer.
# The context manager guarantees the file is flushed and closed, which a bare
# open(...) passed straight to pickle.dump does not.
with open("pretrained_v3_t3_h10_1409.p", "wb") as model_file:
    pickle.dump([nmf_list, tfidf_vectorizer], model_file)
#pickle.dump( [nmf_list, tfidf_vectorizer], open( "nmf2_pretrained_pruned_brown.p", "wb" ) )