One vs All Method

Train a separate NMF model for each theme (one theme vs. all the other documents).

Use all Wiki articles as Background Corpus.

In [1]:
import pandas as pd
import numpy as np
from time import time

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from math import pi

from omterms.interface import *

from ipywidgets import interact, fixed

import pickle

## Plots and Prints

In [2]:
# Theme labels. The position of a label in this list is the index used by the
# printing and plotting helpers below (e.g. categories[theme], categories.index(name)),
# so the order must not be changed independently of the score arrays.
categories = [
    'universalism',
    'hedonism',
    'achievement',
    'power',
    'self-direction',
    'benevolence',
    'conformity',
    'tradition',
    'stimulation',
    'security',
]

def plot_radar_chart(doc_topic_cumul, doc):
    """Draw a radar (spider) chart of one document's per-theme scores.

    Parameters
    ----------
    doc_topic_cumul : indexable
        ``doc_topic_cumul[doc][k]`` must be the score of theme ``categories[k]``.
        The radial axis is fixed to 0-100, so scores are expected on that scale.
    doc : int
        Index of the document to plot; also shown in the chart title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Order of the axes around the circle. It differs from the storage order in
    # `categories`, so every score is looked up by label rather than by position.
    schwartz = ['universalism', 'benevolence', 'conformity', 'tradition',
                'security', 'power', 'achievement', 'hedonism', 'stimulation',
                'self-direction']

    doc_scores = doc_topic_cumul[doc]
    schwartz_dist = [doc_scores[categories.index(label)] for label in schwartz]

    # One evenly spaced spoke per theme; the first point is repeated at the end
    # so that the outline drawn below closes on itself.
    n_axes = len(schwartz)
    spoke_angles = [n / float(n_axes) * 2 * pi for n in range(n_axes)]
    closed_angles = spoke_angles + spoke_angles[:1]
    closed_values = schwartz_dist + schwartz_dist[:1]

    plt.figure(figsize=(8,8))
    ax = plt.subplot(111, polar=True)

    # Put the first axis at the top and lay the remaining ones out clockwise.
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per theme.
    plt.xticks(spoke_angles, schwartz)

    # Radial grid labels and fixed 0-100 range.
    ax.set_rlabel_position(0)
    plt.yticks([25,50,75], ["25","50","75"], color="grey", size=7)
    plt.ylim(0,100)

    # Outline plus a light fill for the single document.
    ax.plot(closed_angles, closed_values, linewidth=1, linestyle='solid')
    ax.fill(closed_angles, closed_values, 'b', alpha=0.1)

    plt.title("Schwartz Chart - Doc " + str(doc))
    plt.show()
    
    
class color:
    """ANSI escape sequences used to colour and emphasise printed text."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every colour/attribute set above
    END = '\033[0m'
    
    
def print_top_words(model, theme, tfidf_vectorizer, n_top_words, n_topics=3):
    """Print the highest-weighted terms of the first ``n_topics`` topics of one theme model.

    Parameters
    ----------
    model : sequence
        Fitted models indexed by theme; ``model[theme].components_`` is read.
    theme : int
        Index of the theme; also used to look up its label in ``categories``.
    tfidf_vectorizer : fitted vectorizer
        Supplies the vocabulary that maps component columns to terms.
    n_top_words : int
        Number of terms printed per topic.
    n_topics : int, default 3
        Only the first ``n_topics`` rows of ``components_`` are printed; any
        later rows are skipped.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    print(color.CYAN + color.BOLD + categories[theme] + color.END)
    for topic_idx, topic in enumerate(model[theme].components_[:n_topics]):
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        message = color.BOLD + "Topic #%d: " % topic_idx + color.END
        message += " ".join(top_terms)
        print(message)
    print()
    
def print_cumulative_train_doc_topics(data, doc_topic, doc, n_best):
    """Print the ``n_best`` strongest themes of a training document next to its true theme.

    Parameters
    ----------
    data : pandas.DataFrame
        Training table; the ``'theme'`` value of row ``doc`` (by position) is
        printed as the document's label.
    doc_topic : array-like
        ``doc_topic[doc]`` is the score vector of the document; it must support
        ``argsort()``.
    doc : int
        Position of the document.
    n_best : int
        Number of top-scoring entries to print, highest first.

    Each entry is printed as ``(label, index, score)``. An index beyond the end
    of ``categories`` is labelled "General".
    """
    test_theme = data.iloc[doc]['theme']
    print(color.BOLD + "Doc " + str(doc) + color.RED +  " (" + test_theme + ")\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Explicit bounds check instead of a bare `except:`, which would also
        # swallow unrelated errors (including KeyboardInterrupt).
        label = categories[i] if i < len(categories) else "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')
    print()
    
def print_cumulative_test_doc_topics(doc_topic, doc, n_best):
    """Print the ``n_best`` strongest themes of an unlabelled document.

    Parameters
    ----------
    doc_topic : array-like
        ``doc_topic[doc]`` is the score vector of the document; it must support
        ``argsort()``.
    doc : int
        Index of the document.
    n_best : int
        Number of top-scoring entries to print, highest first.

    Each entry is printed as ``(label, index, score)``. An index beyond the end
    of ``categories`` is labelled "General".
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    dt = doc_topic[doc]
    for i in dt.argsort()[:-n_best - 1:-1]:
        # Explicit bounds check instead of a bare `except:`, which would also
        # swallow unrelated errors (including KeyboardInterrupt).
        label = categories[i] if i < len(categories) else "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, dt[i]), end='')
    print()

def print_doc_topics(doc_topic, doc, n_best, n_topics=3):
    """Print the ``n_best`` strongest individual topics of a document.

    Parameters
    ----------
    doc_topic : array-like
        ``doc_topic[doc]`` is the per-topic score vector of the document; it
        must support ``argsort()``.
    doc : int
        Index of the document.
    n_best : int
        Number of top-scoring topics to print, highest first.
    n_topics : int, default 3
        Number of consecutive topics that belong to one theme. Topic ``i`` is
        labelled with ``categories[i // n_topics]``. The default reproduces the
        previously hard-coded grouping of three.

    Each entry is printed as ``(label, topic index, score)``. A topic whose
    group lies beyond the end of ``categories`` is labelled "General".
    """
    print(color.BOLD + "Doc " + str(doc) + "\t: " + color.END, end='')
    scores = doc_topic[doc]
    for i in scores.argsort()[:-n_best - 1:-1]:
        group = i // n_topics
        # Explicit bounds check instead of a bare `except:`, which would also
        # swallow unrelated errors (including KeyboardInterrupt).
        label = categories[group] if group < len(categories) else "General"
        print("(", end='')
        print(color.CYAN + color.BOLD + label + color.END, end='')
        print(", %d, %.2lf)  " %(i, scores[i]), end='')
    print()

def print_train_results(doc_topic, doc, corpus, data):
    """Show a training document: a text excerpt, its top themes and a radar chart.

    Parameters
    ----------
    doc_topic : array-like
        Per-document theme scores, passed on to the printing and plotting helpers.
    doc : int
        Index of the document in ``corpus`` / ``data``.
    corpus : sequence of str
        Document texts.
    data : pandas.DataFrame
        Training table providing the true theme of the document.
    """
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)

    # Excerpt: up to 500 characters starting one third of the way into the text.
    text = corpus[doc]
    excerpt_start = len(text) // 3
    print("..." + text[excerpt_start:excerpt_start + 500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_train_doc_topics(data, doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
def print_test_results(doc_topic, doc, corpus):
    """Show an unlabelled document: a text excerpt, its top themes and a radar chart.

    Parameters
    ----------
    doc_topic : array-like
        Per-document theme scores, passed on to the printing and plotting helpers.
    doc : int
        Index of the document in ``corpus``.
    corpus : sequence of str
        Document texts.
    """
    print(color.BOLD + "Document " + str(doc) + color.END)
    print()
    print(color.BOLD + "Text: " + color.END)

    # Excerpt: up to 500 characters starting one third of the way into the text.
    text = corpus[doc]
    excerpt_start = len(text) // 3
    print("..." + text[excerpt_start:excerpt_start + 500] + "...")
    print()
    print()

    print(color.BOLD + "Topic Distribution: " + color.END)
    print_cumulative_test_doc_topics(doc_topic, doc, 11)
    print()

    plot_radar_chart(doc_topic, doc)
    
    

## Helper Functions

In [3]:
def cumulate_W(W, n_topics):
    """Sum consecutive groups of ``n_topics`` columns of ``W`` into one column each.

    Parameters
    ----------
    W : numpy.ndarray, 2-D
        One row per document, one column per topic.
    n_topics : int
        Number of adjacent columns that form one group.

    Returns
    -------
    numpy.ndarray
        One row per document and ``W.shape[1] // n_topics`` columns; trailing
        columns that do not fill a whole group are ignored.
    """
    n_groups = W.shape[1] // n_topics
    grouped_rows = [
        [row[g * n_topics:(g + 1) * n_topics].sum() for g in range(n_groups)]
        for row in W
    ]
    return np.asarray(grouped_rows)

def normalize_W(W):
    """Rescale every row of ``W`` so that it sums to 100.

    Parameters
    ----------
    W : numpy.ndarray, 2-D
        One row per document. The input array is not modified.

    Returns
    -------
    numpy.ndarray
        Row-wise percentages. A row whose sum is zero is not divided (an
        all-zero row stays all zero) instead of turning into NaN.
    """
    row_sums = W.sum(axis=1, keepdims=True)

    # Dividing by a zero row sum would yield NaN and a RuntimeWarning, and the
    # NaNs would then break sorting and plotting; divide such rows by 1 instead.
    safe_sums = row_sums.copy()
    safe_sums[safe_sums == 0] = 1

    W_cumul_norm = W / safe_sums
    W_cumul_norm *= 100

    return W_cumul_norm

def export_to_excel(W, docs, filepath):
    '''
    Write per-document theme scores and their texts to an Excel file.

    W must be the cumulated score matrix: one row per document and one column
    per entry of `categories`. `docs` holds the matching texts and becomes the
    first column, named 'Text'.
    Don't forget to use xlsx as the file extension of `filepath`.

    Returns the exported DataFrame.
    '''
    table = pd.DataFrame(data=W, index=range(len(W)), columns=categories)
    table['Text'] = docs

    # 'Text' was appended last; rotate it to the front.
    column_order = table.columns.tolist()
    column_order.insert(0, column_order.pop())
    table = table[column_order]

    table.to_excel(filepath)
    return table

def export_to_csv(W, docs, filepath):
    '''
    Write per-document theme scores and their texts to a CSV file.

    W must be the cumulated score matrix: one row per document and one column
    per entry of `categories`. `docs` holds the matching texts and becomes the
    first column, named 'Text'.
    Don't forget to use csv as the file extension of `filepath`.

    Returns the exported DataFrame.
    '''
    table = pd.DataFrame(data=W, index=range(len(W)), columns=categories)
    table['Text'] = docs

    # 'Text' was appended last; rotate it to the front.
    column_order = table.columns.tolist()
    column_order.insert(0, column_order.pop())
    table = table[column_order]

    table.to_csv(filepath)
    return table

## Main Functions

In [4]:
def read_data(filepath):
    """Load the corpus from a JSON file, drop empty texts and order rows by theme id.

    Parameters
    ----------
    filepath : str
        Path of a JSON file readable by ``pandas.read_json``; it must provide
        the columns ``'text'`` and ``'theme.id'``.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``'text'`` is not the empty string, sorted by ``'theme.id'``.
        The original index labels are kept.
    """
    return (
        pd.read_json(filepath)
        .loc[lambda frame: frame['text'] != ""]
        .sort_values('theme.id')
    )
    
def extract_corpus(data):
    """Return the values of the ``'text'`` column of ``data`` as a plain list, in row order."""
    texts = data['text']
    return list(texts)

def preprocess_corpus(corpus):
    """Turn every document into a space-separated string of stemmed terms.

    For each document ``extract_terms(doc, extra_process=['stem'])`` is called
    once, and the ``'Stem'`` and ``'TF'`` columns of its result are combined as
    ``(Stem + ' ') * TF`` before being joined with spaces.
    NOTE(review): presumably 'TF' is the term frequency, so each stem is
    repeated as often as it occurs in the document — confirm against omterms.

    Parameters
    ----------
    corpus : iterable of str
        Raw document texts.

    Returns
    -------
    list of str
        One preprocessed string per input document, in the same order.
    """
    PPcorpus = []
    for doc in corpus:
        # Extract once per document: the previous one-liner called
        # extract_terms twice, doubling both the run time and the log output.
        terms = extract_terms(doc, extra_process = ['stem'])
        repeated_stems = (terms['Stem'] + ' ') * terms['TF']
        PPcorpus.append(' '.join(list(repeated_stems)))
    return PPcorpus

def train_corpus(corpus, data, n_topics=3, betaloss = 'kullback-leibler', random_state=None):
    """Fit one NMF model per theme on the tf-idf matrix of ``corpus``.

    Every model has ``2 * n_topics`` components. In the initial document-topic
    matrix W of a theme, the first ``n_topics`` columns are non-zero only for
    that theme's documents, while the last ``n_topics`` columns are random for
    all documents.
    NOTE(review): this presumably relies on the multiplicative-update solver
    keeping initial zeros at zero, so that the first columns stay
    theme-specific — confirm against the scikit-learn version in use.

    Parameters
    ----------
    corpus : iterable of str
        Preprocessed documents, one per row of ``data`` and in the same order.
    data : pandas.DataFrame
        Must contain ``'theme.id'`` and ``'theme'``. The documents of a theme
        are addressed as one contiguous block of rows, taken in the order in
        which ``groupby(['theme.id', 'theme'])`` lists the themes, so ``data``
        must already be sorted accordingly.
    n_topics : int, default 3
        Number of theme-specific components (and of shared components) per model.
    betaloss : str, default 'kullback-leibler'
        Passed to ``NMF`` as ``beta_loss``.
    random_state : int or None, default None
        Seed for the random initialisation of W and H. With ``None`` the global
        NumPy random state is used, exactly as before this parameter existed.

    Returns
    -------
    nmf_list : list of NMF
        One fitted model per theme.
    W_list : list of numpy.ndarray
        The W arrays that were passed as initial values.
        NOTE(review): whether they hold the fitted weights afterwards depends
        on scikit-learn updating them in place — confirm before relying on it.
    tfidf : sparse matrix
        tf-idf representation of ``corpus``.
    tfidf_vectorizer : TfidfVectorizer
        The fitted vectorizer.

    Raises
    ------
    ValueError
        If ``corpus`` and ``data`` do not have the same number of documents.
    """
    N = len(data)

    # Number of rows per theme. size() counts rows directly, whereas the former
    # count().iloc[:, 1] counted non-null values of whichever column happened to
    # be second, which under-counts when that column has missing values.
    theme_counts = data.groupby(['theme.id','theme']).size()

    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer() # optionally pass max_features=... to cap the vocabulary size
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    n_features = tfidf.shape[1]
    print("done in %0.2fs." % (time() - t0))

    # The block-wise initialisation below only makes sense when row k of `data`
    # describes document k of `corpus`.
    if tfidf.shape[0] != N:
        raise ValueError("corpus has %d documents but data has %d rows"
                         % (tfidf.shape[0], N))

    # Global NumPy state by default; a dedicated generator when a seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    W_list = []

    tc_sum = 0
    for tc in theme_counts:
        W = np.zeros((N,2*n_topics))
        # Shared columns: every document may use them.
        W[:, n_topics:] = rng.random_sample((N,n_topics))
        # Theme-specific columns: only this theme's block of rows is non-zero.
        W[tc_sum:tc_sum+tc, :n_topics] = rng.random_sample((tc,n_topics))

        tc_sum += tc
        W_list.append(W)

    X = tfidf
    nmf_list = []

    for i, W in enumerate(W_list):
        print("Fitting NMF for " + str(theme_counts.index[i][1]))
        t0 = time()
        H = rng.random_sample((2*n_topics, n_features))

        # NOTE(review): `alpha` was removed from NMF in scikit-learn 1.2 in favour
        # of alpha_W / alpha_H, which are scaled differently; left unchanged here
        # so that results on the original environment are not altered.
        nmf = NMF(n_components= 2*n_topics, solver='mu', beta_loss=betaloss,
                  alpha=.1, l1_ratio=.5, init = 'custom')

        nmf.fit_transform(X=X,W=W,H=H)
        print("done in %0.2fs." % (time() - t0))

        nmf_list.append(nmf)

    return nmf_list, W_list, tfidf, tfidf_vectorizer
    
def evaluate_docs(docs, nmf, tfidf_test, betaloss = 'kullback-leibler'):
    """Project already-vectorised documents onto the topics of a fitted model.

    Parameters
    ----------
    docs : any
        Not used; kept so existing calls keep working.
    nmf : fitted model
        Object providing ``transform``; its components are not changed here.
    tfidf_test : matrix
        Feature matrix of the documents to evaluate, passed to ``nmf.transform``.
    betaloss : str, default 'kullback-leibler'
        Not used; kept so existing calls keep working.

    Returns
    -------
    The document-topic matrix returned by ``nmf.transform(tfidf_test)``.
    """
    # No fitting happens here: the model is only applied to the new documents.
    return nmf.transform(tfidf_test)

## Training Model

In [5]:
# Source of the corpus file used below:
# https://github.com/bulentozel/OpenMaker/blob/master/Semantics/data/corpuses/schwartz.json
filepath = 'schwartz.json'

# Load the records; read_data drops rows with an empty text and sorts by 'theme.id'.
data = read_data(filepath)
# Texts of the 'text' column, in the same row order as `data`.
corpus = extract_corpus(data)
# One preprocessed (stemmed) string per document; this step prints a long omterms log.
corpusPP = preprocess_corpus(corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3909
Cleaning process: Initial size of tokens = 3909
Reduction due to punctuations and stopwords = 2792.
Reduction due to all numeral terms = 8
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2811
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6955
Cleaning process: Initial size of tokens = 6955
Reduction due to punctuations and stopwords = 5057.
Reduction due to all numeral terms = 10
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5084
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4859
Cleaning process: Initial size of tokens = 4859
Reduction due to punctuations and stopwords = 3477.
Reduction due to all numeral terms = 19
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3505
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 8801
Cleaning process: Initial size of tokens = 8801
Reduction due to punctuations and stopwords = 6502.
Reduction due to all numeral terms = 1
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 13
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6524
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 965
Cleaning process: Initial size of tokens = 965
Reduction due to punctuations and stopwords = 609.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 612
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5944
Cleaning process: Initial size of tokens = 5944
Reduction due to punctuations and stopwords = 4194.
Reduction due to all numeral terms = 27
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 12
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4237
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7196
Cleaning process: Initial size of tokens = 7196
Reduction due to punctuations and stopwords = 5406.
Reduction due to all numeral terms = 10
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5430
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom s

Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 14670
Percentage = 85%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 17289
Cleaning process: Initial size of tokens = 17289
Reduction due to punctuations and stopwords = 14568.
Reduction due to all numeral terms = 85
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term coun

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2106
Cleaning process: Initial size of tokens = 2106
Reduction due to punctuations and stopwords = 1474.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1475
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1842
Cleaning process: Initial size of tokens = 1842
Reduction due to punctuations and stopwords = 1240.
Reduction due to all numeral terms = 5
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1249
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3630
Cleaning process: Initial size of tokens = 3630
Reduction due to punctuations and stopwords = 2682.
Reduction due to all numeral terms = 10
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2700
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom st

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4354
Cleaning process: Initial size of tokens = 4354
Reduction due to punctuations and stopwords = 3328.
Reduction due to all numeral terms = 11
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3348
Percentage = 77%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3351
Cleaning process: Initial size of tokens = 3351
Reduction due to punctuations and stopwords = 2355.
Reduction due to all numeral terms = 0
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2366
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 274
Cleaning process: Initial size of tokens = 274
Reduction due to punctuations and stopwords = 187.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 187
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 315
Cleaning process: Initial size of tokens = 315
Reduction due to punctuations and stopwords = 185.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 187
Percentage = 59%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 825
Cleaning process: Initial size of tokens = 825
Reduction due to punctuations and stopwords = 499.
Reduction due to all numeral terms = 3
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 18
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 521
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwo

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2567
Cleaning process: Initial size of tokens = 2567
Reduction due to punctuations and stopwords = 1678.
Reduction due to all numeral terms = 9
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1692
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 11095
Cleaning process: Initial size of tokens = 11095
Reduction due to punctuations and stopwords = 8765.
Reduction due to all numeral terms = 0
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 9
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 8782
Percentage = 79%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom s

Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 466
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 706
Cleaning process: Initial size of tokens = 706
Reduction due to punctuations and stopwords = 463.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partia

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1961
Cleaning process: Initial size of tokens = 1961
Reduction due to punctuations and stopwords = 1292.
Reduction due to all numeral terms = 0
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1300
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1000
Cleaning process: Initial size of tokens = 1000
Reduction due to punctuations and stopwords = 573.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 575
Percentage = 58%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 899
Cleaning process: Initial size of tokens = 899
Reduction due to punctuations and stopwords = 514.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 519
Percentage = 58%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Reduction due to partially numeral terms = 12
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6527
Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3464
Cleaning process: Initial size of tokens = 3464
Reduction due to punctuations and stopwords = 2596.
Reduction due to all numeral terms = 6
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count redu

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3955
Cleaning process: Initial size of tokens = 3955
Reduction due to punctuations and stopwords = 2758.
Reduction due to all numeral terms = 1
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2768
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2451
Cleaning process: Initial size of tokens = 2451
Reduction due to punctuations and stopwords = 1618.
Reduction due to all numeral terms = 2
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1627
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3751
Cleaning process: Initial size of tokens = 3751
Reduction due to punctuations and stopwords = 2724.
Reduction due to all numeral terms = 0
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2737
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1148
Percentage = 62%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1849
Cleaning process: Initial size of tokens = 1849
Reduction due to punctuations and stopwords = 1137.
Reduction due to all numeral terms = 0
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduc

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3787
Cleaning process: Initial size of tokens = 3787
Reduction due to punctuations and stopwords = 2601.
Reduction due to all numeral terms = 4
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2608
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5300
Cleaning process: Initial size of tokens = 5300
Reduction due to punctuations and stopwords = 3756.
Reduction due to all numeral terms = 0
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 8
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3772
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2079
Cleaning process: Initial size of tokens = 2079
Reduction due to punctuations and stopwords = 1424.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1430
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1669
Cleaning process: Initial size of tokens = 1669
Reduction due to punctuations and stopwords = 1024.
Reduction due to all numeral terms = 0
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1033
Percentage = 62%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to all numeral terms = 0
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 512
Percentage = 54%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2519
Cleaning process: Initial size of tokens = 2519
Reduction due to punctuations and stopwords = 1685.
Reduction due to all numeral terms = 1
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to par

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3083
Cleaning process: Initial size of tokens = 3083
Reduction due to punctuations and stopwords = 2034.
Reduction due to all numeral terms = 2
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2047
Percentage = 66%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 311
Cleaning process: Initial size of tokens = 311
Reduction due to punctuations and stopwords = 159.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 160
Percentage = 51%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5785
Cleaning process: Initial size of tokens = 5785
Reduction due to punctuations and stopwords = 4373.
Reduction due to all numeral terms = 4
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4381
Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insig

Reduction due to partially numeral terms = 57
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3938
Percentage = 74%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5329
Cleaning process: Initial size of tokens = 5329
Reduction due to punctuations and stopwords = 3855.
Reduction due to all numeral terms = 16
Reduction due to short terms = 10
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 57
Reduction due to terms with not allowed symbols = 0
The total term count r

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 11846
Percentage = 83%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 28
Cleaning process: Initial size of tokens = 28
Reduction due to punctuations and stopwords = 14.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 14
Percentage 

Done. Number of terms: 1435
Cleaning process: Initial size of tokens = 1435
Reduction due to punctuations and stopwords = 722.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 731
Percentage = 51%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 606
Cleaning process: Initial size of tokens = 606
Reduction due to punctuations and stopwords = 351.


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2959
Cleaning process: Initial size of tokens = 2959
Reduction due to punctuations and stopwords = 2115.
Reduction due to all numeral terms = 6
Reduction due to short terms = 11
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2133
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1256
Cleaning process: Initial size of tokens = 1256
Reduction due to punctuations and stopwords = 792.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 796
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 7628
Cleaning process: Initial size of tokens = 7628
Reduction due to punctuations and stopwords = 5863.
Reduction due to all numeral terms = 13
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5884
Percentage = 77%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom st

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3482
Cleaning process: Initial size of tokens = 3482
Reduction due to punctuations and stopwords = 2444.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2450
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2518
Cleaning process: Initial size of tokens = 2518
Reduction due to punctuations and stopwords = 1723.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1725
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 885
Cleaning process: Initial size of tokens = 885
Reduction due to punctuations and stopwords = 575.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 576
Percentage = 65%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 672
Cleaning process: Initial size of tokens = 672
Reduction due to punctuations and stopwords = 427.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 427
Percentage = 64%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5688
Cleaning process: Initial size of tokens = 5688
Reduction due to punctuations and stopwords = 4303.
Reduction due to all numeral terms = 10
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4322
Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1487
Cleaning process: Initial size of tokens = 1487
Reduction due to punctuations and stopwords = 923.
Reduction due to all numeral terms = 7
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 941
Percentage = 63%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ..

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1361
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2687
Cleaning process: Initial size of tokens = 2687
Reduction due to punctuations and stopwords = 1951.
Reduction due to all numeral terms = 1
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1957
Perc

Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2160
Cleaning process: Initial size of tokens = 2160
Reduction due to punctuations and stopwords = 1440.
Reduction due to all numeral terms = 0
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1444
Percentage = 67%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervise

Done. Number of terms: 6167
Cleaning process: Initial size of tokens = 6167
Reduction due to punctuations and stopwords = 4784.
Reduction due to all numeral terms = 28
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4826
Percentage = 78%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6167
Cleaning process: Initial size of tokens = 6167
Reduction due to punctuations and stopwords =

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 264
Cleaning process: Initial size of tokens = 264
Reduction due to punctuations and stopwords = 184.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 184
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4386
Cleaning process: Initial size of tokens = 4386
Reduction due to punctuations and stopwords = 3294.
Reduction due to all numeral terms = 13
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3310
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 189
Cleaning process: Initial size of tokens = 189
Reduction due to punctuations and stopwords = 111.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 111
Percentage = 59%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...


Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 922
Percentage = 67%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1384
Cleaning process: Initial size of tokens = 1384
Reduction due to punctuations and stopwords = 917.
Reduction due to all numeral terms = 3
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reducti

Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 365
Percentage = 53%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 691
Cleaning process: Initial size of tokens = 691
Reduction due to punctuations and stopwords = 360.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3260
Cleaning process: Initial size of tokens = 3260
Reduction due to punctuations and stopwords = 2359.
Reduction due to all numeral terms = 2
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2364
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3736
Cleaning process: Initial size of tokens = 3736
Reduction due to punctuations and stopwords = 2462.
Reduction due to all numeral terms = 12
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 10
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2491
Percentage = 67%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleane

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 996
Cleaning process: Initial size of tokens = 996
Reduction due to punctuations and stopwords = 676.
Reduction due to all numeral terms = 1
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 679
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done. Number of terms: 1054
Cleaning process: Initial size of tokens = 1054
Reduction due to punctuations and stopwords = 717.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 717
Percentage = 68%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1054
Cleaning process: Initial size of tokens = 1054
Reduction due to punctuations and stopwords = 717

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3034
Cleaning process: Initial size of tokens = 3034
Reduction due to punctuations and stopwords = 2029.
Reduction due to all numeral terms = 1
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 11
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2045
Percentage = 67%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5458
Cleaning process: Initial size of tokens = 5458
Reduction due to punctuations and stopwords = 3953.
Reduction due to all numeral terms = 3
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3963
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2422
Percentage = 76%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2562
Cleaning process: Initial size of tokens = 2562
Reduction due to punctuations and stopwords = 1953.
Reduction due to all numeral terms = 0
Reduction due to short terms = 5
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1960
Perc

Done. Number of terms: 5883
Cleaning process: Initial size of tokens = 5883
Reduction due to punctuations and stopwords = 4431.
Reduction due to all numeral terms = 0
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4439
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 9017
Cleaning process: Initial size of tokens = 9017
Reduction due to punctuations and stopwords = 6

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4778
Cleaning process: Initial size of tokens = 4778
Reduction due to punctuations and stopwords = 3359.
Reduction due to all numeral terms = 0
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3366
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4796
Cleaning process: Initial size of tokens = 4796
Reduction due to punctuations and stopwords = 3608.
Reduction due to all numeral terms = 2
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3618
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to punctuations and stopwords = 6723.
Reduction due to all numeral terms = 130
Reduction due to short terms = 16
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 6
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 6875
Percentage = 77%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 487
Cleaning process: Initial size of tokens = 487
Reduction due to punctuations and stopwords = 322.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 820
Cleaning process: Initial size of tokens = 820
Reduction due to punctuations and stopwords = 473.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 476
Percentage = 58%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 6437
Cleaning process: Initial size of tokens = 6437
Reduction due to punctuations and stopwords = 5189.
Reduction due to all numeral terms = 9
Reduction due to short terms = 9
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 5210
Percentage = 81%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3776
Cleaning process: Initial size of tokens = 3776
Reduction due to punctuations and stopwords = 2598.
Reduction due to all numeral terms = 4
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2606
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3247
Cleaning process: Initial size of tokens = 3247
Reduction due to punctuations and stopwords = 2697.
Reduction due to all numeral terms = 3
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2708
Percentage = 83%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Done. Number of terms: 1866
Cleaning process: Initial size of tokens = 1866
Reduction due to punctuations and stopwords = 1274.
Reduction due to all numeral terms = 0
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 3
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1279
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1335
Cleaning process: Initial size of tokens = 1335
Reduction due to punctuations and stopwords = 9

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4052
Cleaning process: Initial size of tokens = 4052
Reduction due to punctuations and stopwords = 2688.
Reduction due to all numeral terms = 3
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 5
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2699
Percentage = 67%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 727
Percentage = 62%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1176
Cleaning process: Initial size of tokens = 1176
Reduction due to punctuations and stopwords = 722.
Reduction due to all numeral terms = 0
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to part

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 662
Cleaning process: Initial size of tokens = 662
Reduction due to punctuations and stopwords = 390.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 390
Percentage = 59%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4576
Cleaning process: Initial size of tokens = 4576
Reduction due to punctuations and stopwords = 3415.
Reduction due to all numeral terms = 8
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3429
Percentage = 75%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5048
Cleaning process: Initial size of tokens = 5048
Reduction due to punctuations and stopwords = 3585.
Reduction due to all numeral terms = 16
Reduction due to short terms = 3
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3605
Percentage = 71%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom st

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2026
Cleaning process: Initial size of tokens = 2026
Reduction due to punctuations and stopwords = 1139.
Reduction due to all numeral terms = 2
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1145
Percentage = 57%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to punctuations and stopwords = 4495.
Reduction due to all numeral terms = 2
Reduction due to short terms = 7
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 4508
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 206
Cleaning process: Initial size of tokens = 206
Reduction due to punctuations and stopwords = 114.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Re

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 3997
Cleaning process: Initial size of tokens = 3997
Reduction due to punctuations and stopwords = 2750.
Reduction due to all numeral terms = 6
Reduction due to short terms = 2
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 4
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2762
Percentage = 69%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1526
Cleaning process: Initial size of tokens = 1526
Reduction due to punctuations and stopwords = 1063.
Reduction due to all numeral terms = 0
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1064
Percentage = 70%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2043
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 2853
Cleaning process: Initial size of tokens = 2853
Reduction due to punctuations and stopwords = 2013.
Reduction due to all numeral terms = 15
Reduction due to short terms = 13
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2043
Pe

COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 230
Cleaning process: Initial size of tokens = 230
Reduction due to punctuations and stopwords = 128.
Reduction due to all numeral terms = 1
Reduction due to short terms = 4
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 133
Percentage = 58%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No cus

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 528
Cleaning process: Initial size of tokens = 528
Reduction due to punctuations and stopwords = 298.
Reduction due to all numeral terms = 2
Reduction due to short terms = 1
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 302
Percentage = 57%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopwor

Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 5001
Cleaning process: Initial size of tokens = 5001
Reduction due to punctuations and stopwords = 3611.
Reduction due to all numeral terms = 2
Reduction due to short terms = 8
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 2
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 3623
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner 

Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 2965
Percentage = 73%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 4063
Cleaning process: Initial size of tokens = 4063
Reduction due to punctuations and stopwords = 2951.
Reduction due to all numeral terms = 7
Reduction due to short terms = 6
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 1
Reduction due to terms with not allowed symbols = 0
The total term count reduc

In [6]:
# Train the per-category NMF models (3 sub-topics each, Kullback-Leibler
# beta-loss) on the preprocessed corpus; also returns the tf-idf matrix and
# the fitted vectorizer.
nmf_list, W_list, tfidf, tfidf_vectorizer = train_corpus(
    corpusPP,
    data,
    n_topics=3,
    betaloss='kullback-leibler'
)

Extracting tf-idf features for NMF...
done in 0.81s.
Fitting NMF for universalism
done in 7.98s.
Fitting NMF for hedonism
done in 7.62s.
Fitting NMF for achievement
done in 6.52s.
Fitting NMF for power
done in 6.69s.
Fitting NMF for self-direction
done in 7.13s.
Fitting NMF for benevolence
done in 7.23s.
Fitting NMF for conformity
done in 7.06s.
Fitting NMF for tradition
done in 6.56s.
Fitting NMF for stimulation
done in 6.85s.
Fitting NMF for security
done in 6.77s.


In [7]:
# Print the 5 highest-weighted terms of each of the 3 sub-topics for every
# category model.
# The loop bound comes from `categories` (defined with the plotting helpers)
# instead of a literal 10, so it stays in step with the category list if a
# category is ever added or removed.
print("\nTopics in NMF model:")
for category_idx in range(len(categories)):
    print_top_words(nmf_list, category_idx, tfidf_vectorizer, n_top_words=5, n_topics=3)


Topics in NMF model:
[96m[1muniversalism[0m
[1mTopic #0: [0morgan relat number state matter
[1mTopic #1: [0mgroup impact effect found disarma
[1mTopic #2: [0mcreation govern law world ethic

[96m[1mhedonism[0m
[1mTopic #0: [0moutrag shown sever use remov
[1mTopic #1: [0mself see philosophi research piti
[1mTopic #2: [0muse pleasur peopl jealousi surpris

[96m[1machievement[0m
[1mTopic #0: [0minterest set recent place stalin
[1mTopic #1: [0mmean use offer properti intergener
[1mTopic #2: [0mgreater three suggest term return

[96m[1mpower[0m
[1mTopic #0: [0muse highli articl technic toxic
[1mTopic #1: [0mwangchuck method sometim secular trivial
[1mTopic #2: [0moption compos part troubl moham

[96m[1mself-direction[0m
[1mTopic #0: [0mphotographi benedek train domin serbia
[1mTopic #1: [0mliberti right two help character
[1mTopic #2: [0mgener project secess burkina non

[96m[1mbenevolence[0m
[1mTopic #0: [0midea automat user told novel
[1

In [8]:
# Sum up the sub-topics of every per-category model, then normalise the result.
W_train_norm_list = [
    normalize_W(cumulate_W(W_category, n_topics=3))
    for W_category in W_list
]
# Stack the per-category results and keep the first slice of the transpose;
# presumably this yields one row per document and one column per category -- TODO confirm.
W_train_norm = np.asarray(W_train_norm_list).T[0]

In [9]:
# Browse the training results with a slider over the document index.
interact(
    print_train_results,
    doc_topic=fixed(W_train_norm),
    doc=(0, len(W_train_norm) - 1, 1),
    corpus=fixed(corpus),
    data=fixed(data),
)

<function __main__.print_train_results>

In [10]:
# Export the normalised training scores to an Excel file and preview the first rows.
df = export_to_excel(
    W_train_norm,
    corpus,
    filepath='output.xlsx',
)
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,Critical thinking \n Sculpture of Socrates \n ...,66.123691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Environmental justice \n This article has mult...,98.831153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Natural resource \n ""Primary resource"" redirec...",99.580788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ceasefire \n ""Truce"" redirects here For other ...",99.969178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,International community \n The \n internationa...,99.999451,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Export the normalised training scores to a CSV file and preview the first rows.
df = export_to_csv(
    W_train_norm,
    corpus,
    filepath='output.csv',
)
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,Critical thinking \n Sculpture of Socrates \n ...,2.269606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Environmental justice \n This article has mult...,97.444476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Natural resource \n ""Primary resource"" redirec...",98.315229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ceasefire \n ""Truce"" redirects here For other ...",99.943882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,International community \n The \n internationa...,10.333685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Persist the trained NMF models together with the fitted tf-idf vectorizer.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the bare open() used before leaked the handle.
with open("nmf2_pretrained.p", "wb") as model_file:
    pickle.dump([nmf_list, tfidf_vectorizer], model_file)

## Evaluating Different Documents

To evaluate your own documents, append each one to the `test_corpus` list as a single string.

Two example documents.

In [12]:
# Build the list of raw documents to evaluate; each entry is one whole text.
test_corpus = []

# Read with an explicit UTF-8 encoding: relying on the platform default
# garbles non-ASCII punctuation (e.g. dashes) on systems whose default is not UTF-8.
# The context managers also close the files if reading fails.

# Pope TED talk, https://www.ted.com/speakers/pope_francis
with open("pope.txt", "r", encoding="utf-8") as f:
    pope = f.read()
test_corpus.append(pope)

# US Department of Defense, https://www.defense.gov/About/
with open("dod.txt", "r", encoding="utf-8") as f:
    dod = f.read()
test_corpus.append(dod)

In [13]:
# Preprocess the raw test documents before vectorising them.
# Judging by the log it prints, preprocess_corpus tokenizes, cleans and stems each text.
test_corpusPP = preprocess_corpus(test_corpus)

Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom stopword list is given, nltk.corpus.stopwords will be used.
File access error at ./data/stopwords_openmaker.txt loading is skipped.
File access error at ./data/specifics_openmaker.txt, data loading is skipped.
A single text is provided.
Extracting the terms ...
Tokenizing the input text ..
Done. Number of terms: 1857
Cleaning process: Initial size of tokens = 1857
Reduction due to punctuations and stopwords = 1332.
Reduction due to all numeral terms = 0
Reduction due to short terms = 0
Reduction due to rare terms = 0
Reduction due to partially numeral terms = 0
Reduction due to terms with not allowed symbols = 0
The total term count reduction during this cleaning process = 1332
Percentage = 72%
Stemming the terms in the corpus ..
Done.
COMPLETED.
Outputs will be written under D:\Boun\OpenMaker\Insight\semi-supervised-nmf/
Configuring the text cleaner ...
No custom sto

In [14]:
# Vectorise the test documents with the vectorizer fitted during training.
print("Extracting tf-idf features for NMF...")
start_time = time()
tfidf_test = tfidf_vectorizer.transform(test_corpusPP)
n_features = tfidf_test.shape[1]
print("done in %0.2fs." % (time() - start_time))

# Evaluate the test documents against every per-category model, in category order.
W_test_list = []
for idx, category_nmf in enumerate(nmf_list):
    print("Fitting NMF for " + str(categories[idx]))
    W_test_list.append(
        evaluate_docs(test_corpusPP, category_nmf, tfidf_test, betaloss='kullback-leibler')
    )

Extracting tf-idf features for NMF...
done in 0.00s.
Fitting NMF for universalism
Fitting NMF for hedonism
Fitting NMF for achievement
Fitting NMF for power
Fitting NMF for self-direction
Fitting NMF for benevolence
Fitting NMF for conformity
Fitting NMF for tradition
Fitting NMF for stimulation
Fitting NMF for security


In [15]:
# Sum up sub topics
W_test_norm_list = []
for W in W_test_list:
    W_test_cumul = cumulate_W(W, n_topics=3)
    W_test_norm = normalize_W(W_test_cumul)
    W_test_norm_list.append(W_test_norm)
W_test_norm = np.asarray(W_test_norm_list).T[0]

In [16]:
# Browse the test results with a slider over the document index.
interact(
    print_test_results,
    doc_topic=fixed(W_test_norm),
    doc=(0, len(W_test_norm) - 1, 1),
    corpus=fixed(test_corpus),
)

<function __main__.print_test_results>

In [17]:
# Export the normalised test scores to an Excel file and preview the first rows.
# NOTE(review): 'output.xlsx' is the same path the training export uses, so
# running this cell replaces that file -- confirm this is intended.
df = export_to_excel(
    W_test_norm,
    test_corpus,
    filepath='output.xlsx',
)
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security
0,"Good evening â€“ or, good morning, I am not su...",26.917947,53.195741,21.905193,33.749939,14.305954,65.909071,76.491778,38.390092,32.039518,36.997533
1,\nOn behalf of the Secretary of Defense and De...,75.736422,5.911529,40.480161,76.072999,61.269648,0.049261,1.888345,0.001672,58.705763,79.90054


In [21]:
# Export the normalised test scores to a CSV file and preview the first rows.
# NOTE(review): 'output.csv' is the same path the training export uses, so
# running this cell replaces that file -- confirm this is intended.
df = export_to_csv(
    W_test_norm,
    test_corpus,
    filepath='output.csv',
)
df.head()

Unnamed: 0,Text,universalism,hedonism,achievement,power,self-direction,benevolence,conformity,tradition,stimulation,security,general
0,"Good evening â€“ or, good morning, I am not su...",3.210163,7.430994,4.855774,6.453324,0.137821,20.459419,27.277332,9.677749,3.74642,5.519147,11.231858
1,\nOn behalf of the Secretary of Defense and De...,19.748271,0.946942,8.978493,16.955744,13.825183,0.004017,2.1e-05,1.3e-05,12.855169,26.148252,0.537895
