One vs All Method

Train NMF for each topic separately.

Use all Wiki articles as Background Corpus.

In [87]:
import pandas as pd
import numpy as np
from time import time

import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

from ipywidgets import interact, fixed

import pickle

## Plots and Prints

In [88]:
categories=['universalism', 'hedonism', 'achievement', 'power',
       'self-direction', 'benevolence', 'conformity', 'tradition', 'stimulation',
       'security']

def plot_radar_chart(doc_topic_cumul, doc):
    # ------- PART 1: Create background
 
    # number of variablecategories
    schwartz =['universalism', 'benevolence', 'conformity', 'tradition',
       'security', 'power', 'achievement', 'hedonism', 'stimulation',
       'self-direction']
    
    schwartz_dist = []
    for sch in schwartz:
        schwartz_dist.append(doc_topic_cumul[doc][categories.index(sch)])
    
    N = len(schwartz)
    
    # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]

    plt.figure(figsize=(8,8))
    # Initialise the spider plot
    ax = plt.subplot(111, polar=True)

    # If you want the first axis to be on top:
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # Draw one axe per variable + add labels labels yet
    plt.xticks(angles[:-1], schwartz)

    # Draw ylabels
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75], ["25","50","75"], color="grey", size=7)
    plt.ylim(0,100)


    # ------- PART 2: Add plots

    # Plot each individual = each line of the data
    # I don't do a loop, because plotting more than 3 groups makes the chart unreadable

    # Ind1
    values = list(schwartz_dist) + list(schwartz_dist[:1])
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    ax.fill(angles, values, 'b', alpha=0.1)

    # Add legend
    #plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.title("Schwartz Chart - Doc " + str(doc))
    plt.show()
    
    
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    feature_names = tfidf_vectorizer.get_feature_names()
    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_):
        if topic_idx / n_topics == 1:
            break
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED +  " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')    
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')    
    print()

def print_doc_topics(doc_topic, doc, n_best):
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    for i in doc_topic[doc].argsort()[:-n_best - 1:-1]:
        print("(", end='')
        try:
            print(color.CYAN + color.BOLD + categories[i//3] + color.END, end='')
        except:
            print(color.CYAN + color.BOLD + "General" + color.END, end='')
        print(", %d, %.2lf)  " %(i, doc_topic[doc][i]), end='')    
    print()

def print_train_results(doc_topic, doc, corpus, data):
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    print("..." + corpus[doc][len(corpus[doc])//3:len(corpus[doc])//3+500] + "...")
    print()
    print()
    
    print(color.BOLD + "Topic Distribution: " + color.END)
    #print(pd.DataFrame(data=[W_test_norm[doc]], index = [doc], columns=categories+['general']))
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11) 
    print()
    
    plot_radar_chart(doc_topic, doc)
    
def print_test_results(doc_topic, doc, corpus):
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)
    print("..." + corpus[doc][len(corpus[doc])//3:len(corpus[doc])//3+500] + "...")
    print()
    print()
    
    print(color.BOLD + "Topic Distribution: " + color.END)
    
    #print(pd.DataFrame(data=[W_test_norm[doc]], index = [doc], columns=categories+['general']))
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()
    
    plot_radar_chart(doc_topic, doc)
    
    

## Helper Functions

In [89]:
def cumulate_W(W, n_topics):
    W_cumul = []
    for d in W:
        temp = []
        for i in range(W.shape[1]//n_topics):
            temp.append(d[i*n_topics:(i+1)*n_topics].sum())
        W_cumul.append(temp)

    W_cumul = np.asarray(W_cumul)
    
    return W_cumul

def normalize_W(W):
    W_cumul_norm = W/(W.sum(axis=1).reshape(W.shape[0], 1))
    W_cumul_norm *= 100
    
    return W_cumul_norm

def export_to_excel(W, docs, filepath):
    '''
    Take cumulated W as input.
    Don't forget to put xlsx as file extension '''
    
    df = pd.DataFrame(data=W,index = range(len(W)), columns=categories)
    df['Text'] = docs
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    df.to_excel(filepath)
    return df

def export_to_csv(W, docs, filepath):
    '''
    Take cumulated W as input.
    Don't forget to put csv as file extension '''
    
    df = pd.DataFrame(data=W,index = range(len(W)), columns=categories)
    df['Text'] = docs
    cols = df.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    df = df[cols]
    df.to_csv(filepath)
    return df

## Main Functions

In [101]:
def read_data(filepath):
    data = pd.read_json(filepath)
    data = data[data['text']!=""]
    data = data.sort_values('theme.id')
    
    return data
    
def extract_corpus(data):    
    corpus = list(data['text'])
    return corpus

def preprocess_corpus(corpus):
    PPcorpus = [' '.join(list((extract_terms(doc, extra_process = ['stem'])['Stem']+' ')*extract_terms(doc, extra_process = ['stem'])['TF'])) for doc in corpus]
    return PPcorpus

def train_corpus(corpus, data, brown_corpus, n_topics=3, betaloss = 'kullback-leibler', bckg_brown = False):
    N = len(data)
    
    theme_counts = data.groupby(['theme.id','theme']).count().iloc[:,1]
    pd_theme_counts = pd.DataFrame(theme_counts)
    n_themes = len(theme_counts)
    
    n_top_words = 5
    n_components = n_topics*(n_themes)
    
    
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer() # optionally add maxfeatures = n_features to enforce number of features
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(corpus+brown_corpus)
    n_features = tfidf.shape[1]
    print("done in %0.2fs." % (time() - t0))
    
    W_list = []
    
    if bckg_brown:
        tc_sum = 0
        for tc in theme_counts:
            W = np.zeros((N+len(brown_corpus),2*n_topics))
            W[N:, n_topics:] = np.random.random((len(brown_corpus),n_topics))
            W[tc_sum:tc_sum+tc, :] = np.random.random((tc,2*n_topics))

            tc_sum += tc
            W_list.append(W)
    else:
        tc_sum = 0
        for tc in theme_counts:
            W = np.zeros((N,2*n_topics))
            W[:, n_topics:] = np.random.random((N,n_topics))
            W[tc_sum:tc_sum+tc, :n_topics] = np.random.random((tc,n_topics))

            tc_sum += tc
            W_list.append(W)
        
    X = tfidf 
    nmf_list = []

    for i, W in enumerate(W_list):
        print("Fitting NMF for " + str(theme_counts.index[i][1]))
        t0 = time()
        H = np.random.rand(2*n_topics, n_features)

        nmf = NMF(n_components= 2*n_topics, solver='mu', beta_loss=betaloss,
                  alpha=.1, l1_ratio=.5, init = 'custom')

        nmf.fit_transform(X=X,W=W,H=H)
        print("done in %0.2fs." % (time() - t0))

        nmf_list.append(nmf)
    
    
    return nmf_list, W_list, tfidf, tfidf_vectorizer
    
def evaluate_docs(docs, nmf, tfidf_test, betaloss = 'kullback-leibler'):
    X_test = tfidf_test
    H_test = nmf.components_
    
    # Fit the NMF model
    t0 = time()

    W_test = nmf.transform(X_test)
    
    return W_test

## Training Model

In [91]:
#https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
# schwartz.json or pruned_schwartz.json
filepath = 'pruned_schwartz.json'

data = read_data(filepath)
corpus = extract_corpus(data)
corpusPP = preprocess_corpus(corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3909
Cleaning process: Initial size of tokens = 3909
Reduction due to punctuations and stopwords = 2792.
Reduction due to all numeral terms = 8
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2811
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6385
Cleaning process: Initial size of tokens = 6385
Reduction due to punctuations and stopwords = 4804.
Reduction due to all numeral terms = 24
Reduction due to short terms = 10
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4849
Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text clean

Done. Number of terms: 32877
Cleaning process: Initial size of tokens = 32877
Reduction due to punctuations and stopwords = 27352.
Reduction due to all numeral terms = 164
Reduction due to short terms = 17
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 31
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 27564
Percentage = 84%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 32877
Cleaning process: Initial size of tokens = 32877
Reduction due to punctuations and sto

Done. Number of terms: 10721
Cleaning process: Initial size of tokens = 10721
Reduction due to punctuations and stopwords = 8545.
Reduction due to all numeral terms = 16
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 7
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 8571
Percentage = 80%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 10721
Cleaning process: Initial size of tokens = 10721
Reduction due to punctuations and stopword

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1562
Cleaning process: Initial size of tokens = 1562
Reduction due to punctuations and stopwords = 1057.
Reduction due to all numeral terms = 2
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1062
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 757
Cleaning process: Initial size of tokens = 757
Reduction due to punctuations and stopwords = 514.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 515
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2248
Cleaning process: Initial size of tokens = 2248
Reduction due to punctuations and stopwords = 1560.
Reduction due to all numeral terms = 3
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1567
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done. Number of terms: 4125
Cleaning process: Initial size of tokens = 4125
Reduction due to punctuations and stopwords = 2822.
Reduction due to all numeral terms = 13
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2848
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3851
Cleaning process: Initial size of tokens = 3851
Reduction due to punctuations and stopwords = 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6499
Cleaning process: Initial size of tokens = 6499
Reduction due to punctuations and stopwords = 4711.
Reduction due to all numeral terms = 2
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4723
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1475
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 11229
Cleaning process: Initial size of tokens = 11229
Reduction due to punctuations and stopwords = 8565.
Reduction due to all numeral terms = 2
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to 

Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 489
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2054
Cleaning process: Initial size of tokens = 2054
Reduction due to punctuations and stopwords = 1385.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to par

Done. Number of terms: 4033
Cleaning process: Initial size of tokens = 4033
Reduction due to punctuations and stopwords = 2843.
Reduction due to all numeral terms = 6
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2857
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 363
Cleaning process: Initial size of tokens = 363
Reduction due to punctuations and stopwords = 223

Reduction due to punctuations and stopwords = 2724.
Reduction due to all numeral terms = 0
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2737
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1986
Cleaning process: Initial size of tokens = 1986
Reduction due to punctuations and stopwords = 1404.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0

Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3505
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4859
Cleaning process: Initial size of tokens = 4859
Reduction due to punctuations and stopwords = 3477.
Reduction due to all numeral terms = 19
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count redu

Done. Number of terms: 959
Cleaning process: Initial size of tokens = 959
Reduction due to punctuations and stopwords = 604.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 607
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 959
Cleaning process: Initial size of tokens = 959
Reduction due to punctuations and stopwords = 604.
Re

Reduction due to all numeral terms = 10
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3337
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4603
Cleaning process: Initial size of tokens = 4603
Reduction due to punctuations and stopwords = 3325.
Reduction due to all numeral terms = 10
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 274
Cleaning process: Initial size of tokens = 274
Reduction due to punctuations and stopwords = 187.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 187
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1240
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3760
Cleaning process: Initial size of tokens = 3760
Reduction due to punctuations and stopwords = 2583.
Reduction due to all numeral terms = 4
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduc

Reduction due to all numeral terms = 27
Reduction due to short terms = 15
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4676
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3839
Cleaning process: Initial size of tokens = 3839
Reduction due to punctuations and stopwords = 2401.
Reduction due to all numeral terms = 0
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 8750
Cleaning process: Initial size of tokens = 8750
Reduction due to punctuations and stopwords = 7093.
Reduction due to all numeral terms = 31
Reduction due to short terms = 13
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 7147
Percentage = 82%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom 

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2249
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6013
Cleaning process: Initial size of tokens = 6013
Reduction due to punctuations and stopwords = 4483.
Reduction due to all numeral terms = 13
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 8
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4509
Per

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7196
Cleaning process: Initial size of tokens = 7196
Reduction due to punctuations and stopwords = 5406.
Reduction due to all numeral terms = 10
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5430
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 11064
Cleaning process: Initial size of tokens = 11064
Reduction due to punctuations and stopwords = 8675.
Reduction due to all numeral terms = 13
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 8697
Percentage = 79%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text clean

Reduction due to all numeral terms = 27
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 12
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4240
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5948
Cleaning process: Initial size of tokens = 5948
Reduction due to punctuations and stopwords = 4197.
Reduction due to all numeral terms = 27
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1000
Cleaning process: Initial size of tokens = 1000
Reduction due to punctuations and stopwords = 573.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 575
Percentage = 58%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1848
Cleaning process: Initial size of tokens = 1848
Reduction due to punctuations and stopwords = 1206.
Reduction due to all numeral terms = 5
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1215
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done. Number of terms: 14293
Cleaning process: Initial size of tokens = 14293
Reduction due to punctuations and stopwords = 11748.
Reduction due to all numeral terms = 69
Reduction due to short terms = 14
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 15
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 11846
Percentage = 83%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 14293
Cleaning process: Initial size of tokens = 14293
Reduction due to punctuations and stop

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1435
Cleaning process: Initial size of tokens = 1435
Reduction due to punctuations and stopwords = 722.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 731
Percentage = 51%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopw

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 606
Cleaning process: Initial size of tokens = 606
Reduction due to punctuations and stopwords = 351.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 355
Percentage = 59%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 503
Cleaning process: Initial size of tokens = 503
Reduction due to punctuations and stopwords = 250.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 254
Percentage = 50%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 672
Cleaning process: Initial size of tokens = 672
Reduction due to punctuations and stopwords = 427.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 427
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, da

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3704
Cleaning process: Initial size of tokens = 3704
Reduction due to punctuations and stopwords = 2512.
Reduction due to all numeral terms = 6
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2524
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5329
Cleaning process: Initial size of tokens = 5329
Reduction due to punctuations and stopwords = 3855.
Reduction due to all numeral terms = 16
Reduction due to short terms = 10
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 57
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3938
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom 

Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1461
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5785
Cleaning process: Initial size of tokens = 5785
Reduction due to punctuations and stopwords = 4373.
Reduction due to all numeral terms = 4
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduc

Reduction due to partially numeral terms = 9
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 7593
Percentage = 79%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 511
Cleaning process: Initial size of tokens = 511
Reduction due to punctuations and stopwords = 302.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reductio

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3816
Cleaning process: Initial size of tokens = 3816
Reduction due to punctuations and stopwords = 2610.
Reduction due to all numeral terms = 2
Reduction due to short terms = 14
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2631
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1338
Cleaning process: Initial size of tokens = 1338
Reduction due to punctuations and stopwords = 923.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 927
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopw

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 801
Cleaning process: Initial size of tokens = 801
Reduction due to punctuations and stopwords = 439.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 443
Percentage = 55%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Reduction due to all numeral terms = 1
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1297
Percentage = 65%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1760
Cleaning process: Initial size of tokens = 1760
Reduction due to punctuations and stopwords = 1088.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to pa

Done. Number of terms: 5018
Cleaning process: Initial size of tokens = 5018
Reduction due to punctuations and stopwords = 3654.
Reduction due to all numeral terms = 2
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3665
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 885
Cleaning process: Initial size of tokens = 885
Reduction due to punctuations and stopwords = 575

Reduction due to punctuations and stopwords = 8968.
Reduction due to all numeral terms = 6
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 8986
Percentage = 80%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 11279
Cleaning process: Initial size of tokens = 11279
Reduction due to punctuations and stopwords = 8968.
Reduction due to all numeral terms = 6
Reduction due to short terms =

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3274
Cleaning process: Initial size of tokens = 3274
Reduction due to punctuations and stopwords = 2352.
Reduction due to all numeral terms = 1
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2359
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2013
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2714
Cleaning process: Initial size of tokens = 2714
Reduction due to punctuations and stopwords = 1966.
Reduction due to all numeral terms = 29
Reduction due to short terms = 13
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2013
Pe

Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2166
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5688
Cleaning process: Initial size of tokens = 5688
Reduction due to punctuations and stopwords = 4303.
Reduction due to all numeral terms = 10
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbo

Reduction due to punctuations and stopwords = 2209.
Reduction due to all numeral terms = 7
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2223
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1100
Cleaning process: Initial size of tokens = 1100
Reduction due to punctuations and stopwords = 769.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6167
Cleaning process: Initial size of tokens = 6167
Reduction due to punctuations and stopwords = 4784.
Reduction due to all numeral terms = 28
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4826
Percentage = 78%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Reduction due to all numeral terms = 72
Reduction due to short terms = 10
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 14
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6428
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 8643
Cleaning process: Initial size of tokens = 8643
Reduction due to punctuations and stopwords = 6332.
Reduction due to all numeral terms = 72
Reduction due to short terms = 10
Reduction due to rare terms = 0
Reduction due 

Reduction due to all numeral terms = 0
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 955
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1489
Cleaning process: Initial size of tokens = 1489
Reduction due to punctuations and stopwords = 947.
Reduction due to all numeral terms = 0
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to part

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 8244
Cleaning process: Initial size of tokens = 8244
Reduction due to punctuations and stopwords = 6570.
Reduction due to all numeral terms = 6
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 9
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6590
Percentage = 80%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1345
Cleaning process: Initial size of tokens = 1345
Reduction due to punctuations and stopwords = 864.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 865
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5661
Cleaning process: Initial size of tokens = 5661
Reduction due to punctuations and stopwords = 4084.
Reduction due to all numeral terms = 33
Reduction due to short terms = 12
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 9
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4138
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 995
Cleaning process: Initial size of tokens = 995
Reduction due to punctuations and stopwords = 569.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 571
Percentage = 57%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4598
Cleaning process: Initial size of tokens = 4598
Reduction due to punctuations and stopwords = 3218.
Reduction due to all numeral terms = 5
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3230
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to punctuations and stopwords = 112.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 112
Percentage = 54%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5657
Cleaning process: Initial size of tokens = 5657
Reduction due to punctuations and stopwords = 4123.
Reduction due to all numeral terms = 7
Reduction due to short terms = 6
R

Reduction due to punctuations and stopwords = 4068.
Reduction due to all numeral terms = 6
Reduction due to short terms = 11
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4089
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5728
Cleaning process: Initial size of tokens = 5728
Reduction due to punctuations and stopwords = 4068.
Reduction due to all numeral terms = 6
Reduction due to short terms = 

Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2270
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3115
Cleaning process: Initial size of tokens = 3115
Reduction due to punctuations and stopwords = 2266.
Reduction due to all numeral terms = 1
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbol

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5458
Cleaning process: Initial size of tokens = 5458
Reduction due to punctuations and stopwords = 3953.
Reduction due to all numeral terms = 3
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3963
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2562
Cleaning process: Initial size of tokens = 2562
Reduction due to punctuations and stopwords = 1953.
Reduction due to all numeral terms = 0
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1960
Percentage = 77%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 11457
Cleaning process: Initial size of tokens = 11457
Reduction due to punctuations and stopwords = 8990.
Reduction due to all numeral terms = 49
Reduction due to short terms = 15
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 17
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 9071
Percentage = 79%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custo

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6490
Cleaning process: Initial size of tokens = 6490
Reduction due to punctuations and stopwords = 5285.
Reduction due to all numeral terms = 51
Reduction due to short terms = 18
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 39
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5393
Percentage = 83%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text clean

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 10105
Cleaning process: Initial size of tokens = 10105
Reduction due to punctuations and stopwords = 7773.
Reduction due to all numeral terms = 47
Reduction due to short terms = 16
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 15
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 7851
Percentage = 78%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custo

Reduction due to all numeral terms = 2
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1986
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2822
Cleaning process: Initial size of tokens = 2822
Reduction due to punctuations and stopwords = 1979.
Reduction due to all numeral terms = 2
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to pa

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5053
Cleaning process: Initial size of tokens = 5053
Reduction due to punctuations and stopwords = 3490.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3499
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5210
Percentage = 81%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6437
Cleaning process: Initial size of tokens = 6437
Reduction due to punctuations and stopwords = 5189.
Reduction due to all numeral terms = 9
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduc

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1942
Cleaning process: Initial size of tokens = 1942
Reduction due to punctuations and stopwords = 1268.
Reduction due to all numeral terms = 5
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1279
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7456
Cleaning process: Initial size of tokens = 7456
Reduction due to punctuations and stopwords = 5916.
Reduction due to all numeral terms = 2
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5924
Percentage = 79%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1496
Cleaning process: Initial size of tokens = 1496
Reduction due to punctuations and stopwords = 935.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 941
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Reduction due to punctuations and stopwords = 2918.
Reduction due to all numeral terms = 4
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2931
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4317
Cleaning process: Initial size of tokens = 4317
Reduction due to punctuations and stopwords = 3079.
Reduction due to all numeral terms = 17
Reduction due to short terms = 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5048
Cleaning process: Initial size of tokens = 5048
Reduction due to punctuations and stopwords = 3585.
Reduction due to all numeral terms = 16
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3605
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom st

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1085
Cleaning process: Initial size of tokens = 1085
Reduction due to punctuations and stopwords = 689.
Reduction due to all numeral terms = 6
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 701
Percentage = 65%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopw

Reduction due to punctuations and stopwords = 2080.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2084
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2853
Cleaning process: Initial size of tokens = 2853
Reduction due to punctuations and stopwords = 2080.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1494
Cleaning process: Initial size of tokens = 1494
Reduction due to punctuations and stopwords = 959.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 962
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1278
Cleaning process: Initial size of tokens = 1278
Reduction due to punctuations and stopwords = 796.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 796
Percentage = 62%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Reduction due to punctuations and stopwords = 1203.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 8
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1214
Percentage = 65%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 865
Cleaning process: Initial size of tokens = 865
Reduction due to punctuations and stopwords = 500.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Re

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 13743
Cleaning process: Initial size of tokens = 13743
Reduction due to punctuations and stopwords = 10595.
Reduction due to all numeral terms = 4
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 20
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 10628
Percentage = 77%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cle

Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2965
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4063
Cleaning process: Initial size of tokens = 4063
Reduction due to punctuations and stopwords = 2951.
Reduction due to all numeral terms = 7
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduc

Reduction due to all numeral terms = 15
Reduction due to short terms = 13
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2043
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 501
Cleaning process: Initial size of tokens = 501
Reduction due to punctuations and stopwords = 283.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to par

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2731
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1165
Cleaning process: Initial size of tokens = 1165
Reduction due to punctuations and stopwords = 715.
Reduction due to all numeral terms = 3
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 722
Percen

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7104
Cleaning process: Initial size of tokens = 7104
Reduction due to punctuations and stopwords = 5283.
Reduction due to all numeral terms = 17
Reduction due to short terms = 13
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5314
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom s

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3623
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7966
Cleaning process: Initial size of tokens = 7966
Reduction due to punctuations and stopwords = 6140.
Reduction due to all numeral terms = 2
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6151
Perc

Done.
COMPLETED.


In [None]:
mdetok = MosesDetokenizer()

brown_files_sent = []
for fid in brown.fileids():
    brown_files_sent.append([mdetok.detokenize(' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'").split(), return_str=True)  for sent in brown.sents(fid)])
    
brown_natural = [' '.join(bfs) for bfs in brown_files_sent]
brown_naturalPP = preprocess_corpus(brown_natural)

In [102]:
nmf_list, W_list, tfidf, tfidf_vectorizer = train_corpus(corpusPP, data, brown_naturalPP, n_topics=3, betaloss = 'kullback-leibler', bckg_brown = False)

Extracting tf-idf features for NMF...
done in 2.44s.
Fitting NMF for universalism
done in 16.82s.
Fitting NMF for hedonism
done in 17.33s.
Fitting NMF for achievement
done in 16.27s.
Fitting NMF for power
done in 17.32s.
Fitting NMF for self-direction
done in 16.62s.
Fitting NMF for benevolence
done in 18.14s.
Fitting NMF for conformity
done in 17.57s.
Fitting NMF for tradition
done in 17.15s.
Fitting NMF for stimulation
done in 24.83s.
Fitting NMF for security
done in 11.26s.


In [106]:
print("\nTopics in NMF model:")
for i in range(10):
    print_top_words(nmf_list, i, tfidf_vectorizer, n_top_words=5, n_topics=3)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0mindividu theori known topic interpret
[1mTopic #1: [0mstate principl group peopl becam
[1mTopic #2: [0mgroup gener first form disarma

[96m[1mhedonism[0m
[1mTopic #0: [0mrepres time shock sometim world
[1mTopic #1: [0mimport someth see studi unfamiliar
[1mTopic #2: [0mthink see psycholog studi research

[96m[1machievement[0m
[1mTopic #0: [0mrole theori primari work tribe
[1mTopic #1: [0mlower peopl motiv other top
[1mTopic #2: [0msocial merchant humanist use privat

[96m[1mpower[0m
[1mTopic #0: [0mlower tool bia unreli unit
[1mTopic #1: [0mcompos partner idea use son
[1mTopic #2: [0mleadership articl may specialti law

[96m[1mself-direction[0m
[1mTopic #0: [0muse domin resent well london
[1mTopic #1: [0mbenedek romantic take made ratifi
[1mTopic #2: [0mgener resourc known liberti olivero

[96m[1mbenevolence[0m
[1mTopic #0: [0mnatur thought renew shuv idea
[1mTopic #1: [0msee

In [107]:
# Sum up sub topics
W_train_norm_list = []
for W in W_list:
    W_train_cumul = cumulate_W(W, n_topics=3)
    W_train_norm = normalize_W(W_train_cumul)
    W_train_norm_list.append(W_train_norm)
W_train_norm = np.asarray(W_train_norm_list).T[0]

  


In [108]:
interact(print_train_results, doc_topic=fixed(W_train_norm), doc = (0, len(W_train_norm)-1, 1), corpus=fixed(corpus), data=fixed(data))

<function __main__.print_train_results>

In [10]:
df = export_to_excel(W_train_norm, corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,Critical thinking \n Sculpture of Socrates \n ...,66.123691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Environmental justice \n This article has mult...,98.831153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Natural resource \n ""Primary resource"" redirec...",99.580788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ceasefire \n ""Truce"" redirects here For other ...",99.969178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,International community \n The \n internationa...,99.999451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
df = export_to_csv(W_train_norm, corpus, filepath = 'output.csv')
df.head()

ValueError: Length of values does not match length of index

In [104]:
#pickle.dump( [nmf_list, tfidf_vectorizer], open( "nmf2_pretrained_pruned.p", "wb" ) )
#pickle.dump( [nmf_list, tfidf_vectorizer], open( "nmf2_pretrained_pruned_brown.p", "wb" ) )

## Evaluating Different Documents

To evaluate your documents, simply append them to _docs list_ as a whole string.

Two example documents.

In [12]:
test_corpus = []
f = open("pope.txt", "r") #Pope ted talk, https://www.ted.com/speakers/pope_francis
pope = f.read()
test_corpus.append(pope)
f.close()

f = open("dod.txt", "r")  # US Department of Defense, https://www.defense.gov/About/
dod = f.read()
test_corpus.append(dod)
f.close()

In [13]:
test_corpusPP = preprocess_corpus(test_corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1857
Cleaning process: Initial size of tokens = 1857
Reduction due to punctuations and stopwords = 1332.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1332
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

In [14]:
print("Extracting tf-idf features for NMF...")
t0 = time()
tfidf_test = tfidf_vectorizer.transform(test_corpusPP)
n_features = tfidf_test.shape[1]
print("done in %0.2fs." % (time() - t0))

W_test_list = []
for i, nmf in enumerate(nmf_list):
    print("Fitting NMF for " + str(categories[i]))
    W_test = evaluate_docs(test_corpusPP, nmf, tfidf_test, betaloss = 'kullback-leibler')
    W_test_list.append(W_test)

Extracting tf-idf features for NMF...
done in 0.00s.
Fitting NMF for universalism
Fitting NMF for hedonism
Fitting NMF for achievement
Fitting NMF for power
Fitting NMF for self-direction
Fitting NMF for benevolence
Fitting NMF for conformity
Fitting NMF for tradition
Fitting NMF for stimulation
Fitting NMF for security


In [15]:
# Sum up sub topics
W_test_norm_list = []
for W in W_test_list:
    W_test_cumul = cumulate_W(W, n_topics=3)
    W_test_norm = normalize_W(W_test_cumul)
    W_test_norm_list.append(W_test_norm)
W_test_norm = np.asarray(W_test_norm_list).T[0]

In [16]:
interact(print_test_results, doc_topic=fixed(W_test_norm), doc = (0, len(W_test_norm)-1, 1), corpus=fixed(test_corpus))

<function __main__.print_test_results>

In [17]:
df = export_to_excel(W_test_norm, test_corpus, filepath = 'output.xlsx')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,"Good evening â€“ or, good morning, I am not su...",26.917947,53.195741,21.905193,33.749939,14.305954,65.909071,76.491778,38.390092,32.039518,36.997533
1,\nOn behalf of the Secretary of Defense and De...,75.736422,5.911529,40.480161,76.072999,61.269648,0.049261,1.888345,0.001672,58.705763,79.90054


In [21]:
df = export_to_csv(W_test_norm, test_corpus, filepath = 'output.csv')
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,"Good evening â€“ or, good morning, I am not su...",3.210163,7.430994,4.855774,6.453324,0.137821,20.459419,27.277332,9.677749,3.74642,5.519147,11.231858
1,\nOn behalf of the Secretary of Defense and De...,19.748271,0.946942,8.978493,16.955744,13.825183,0.004017,2.1e-05,1.3e-05,12.855169,26.148252,0.537895


In [62]:
categories=['universalism', 'hedonism', 'achievement', 'power',
       'self-direction', 'benevolence', 'conformity', 'tradition', 'stimulation',
       'security']

schwartz =['universalism', 'benevolence', 'conformity', 'tradition',
       'security', 'power', 'achievement', 'hedonism', 'stimulation',
       'self-direction']

In [82]:
feature_names = tfidf_vectorizer.get_feature_names()

cumul = 0
word_list = []

for i, row in pd_theme_counts.iterrows():
    tmp_list = []
    tfidf_avg = np.average(tfidf[cumul:cumul+int(row['document.id'])].toarray(), axis=0)
    cumul += int(row['document.id'])
    
    for idx in list(reversed(tfidf_avg.argsort())):
        tmp_list.append((feature_names[idx], np.round(tfidf_avg[idx], 4)))
        
    word_list.append(tmp_list)

schwartz_word_score = []
for sch in schwartz:
    schwartz_word_score.append(word_list[categories.index(sch)])

df_list = []
for i, a in enumerate(schwartz_word_score):
    df_list.append(pd.DataFrame(a, columns=[schwartz[i]+" - word", schwartz[i]+" - score"]))
score_df = pd.concat(df_list, axis=1)
score_df

Unnamed: 0,universalism - word,universalism - score,benevolence - word,benevolence - score,conformity - word,conformity - score,tradition - word,tradition - score,security - word,security - score,power - word,power - score,achievement - word,achievement - score,hedonism - word,hedonism - score,stimulation - word,stimulation - score,self-direction - word,self-direction - score
0,environment,0.0701,moral,0.0810,god,0.0843,virtu,0.1417,reciproc,0.1349,power,0.1135,capit,0.1522,pleasur,0.0727,travel,0.1690,creativ,0.0880
1,peac,0.0590,good,0.0739,command,0.0582,temper,0.0985,secur,0.1242,author,0.1116,statu,0.0813,happi,0.0631,sport,0.1499,independ,0.0648
2,ecolog,0.0542,ethic,0.0622,cultur,0.0511,humil,0.0921,social,0.0613,social,0.0481,social,0.0796,emot,0.0525,adventur,0.1471,intellig,0.0532
3,social,0.0448,truth,0.0485,parent,0.0477,tradit,0.0705,norm,0.0578,domin,0.0468,need,0.0458,pain,0.0407,tourism,0.1462,ye,0.0518
4,right,0.0425,god,0.0410,disciplin,0.0466,moral,0.0562,clean,0.0543,revolut,0.0414,human,0.0426,hedon,0.0402,explor,0.1033,invent,0.0468
5,human,0.0338,evil,0.0406,group,0.0463,sophrosyn,0.0547,contamin,0.0535,control,0.0405,individu,0.0424,person,0.0385,stimul,0.1031,innov,0.0386
6,intern,0.0332,forgiv,0.0393,polit,0.0450,christian,0.0527,risk,0.0449,polit,0.0379,manag,0.0413,psycholog,0.0350,genr,0.0664,territori,0.0376
7,war,0.0314,one,0.0354,behavior,0.0433,myth,0.0469,pollut,0.0354,state,0.0365,intellectu,0.0382,one,0.0342,fiction,0.0620,govern,0.0372
8,natur,0.0307,theori,0.0351,norm,0.0374,charact,0.0457,hygien,0.0352,collaps,0.0363,person,0.0381,feel,0.0321,game,0.0467,process,0.0368
9,resourc,0.0292,valu,0.0342,use,0.0369,one,0.0419,wast,0.0346,time,0.0363,societi,0.0379,love,0.0303,stori,0.0464,idea,0.0356


In [81]:
score_df.to_excel("wiki_tfidf_average.xlsx")