# Phase 2

In [None]:
import os
import numpy as np
from contextlib import redirect_stdout
from tqdm import tqdm
from time import time
from collections import defaultdict
from tmu.models.autoencoder.autoencoder import TMAutoEncoder
from sklearn.metrics.pairwise import cosine_similarity
from Evaluation import Evaluation
from Tools import Tools
from DirectoriesUtil import Dicrectories

target_similarity=defaultdict(list)
clause_weight_threshold = 0
clause_drop_p = 0.0
factor = 20
clauses = 80
T = factor*40
s = 5.0
epochs = 100
number_of_examples = 1000
accumulation = 10
sub_accumulation = 10
top_max_clauses1 = 0
top_max_clauses2 = 0
with_clause_update = False
max_spearman = 0.9
true_weight = 0.7
false_weight = 1 - true_weight
neg_length = 50

eval = Evaluation()
def preprocess_text(text):
    return text
vectorizer_X = Tools.read_pickle_data("big_vectorizer_X.pickle")
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = vectorizer_X.get_feature_names_out().shape[0]

for dataset_name in os.listdir(Dicrectories.datasets):
    if dataset_name == 'rg-65':
        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)
        if os.path.isdir(current_folder_path):
            files_start_name = os.path.join(current_folder_path, dataset_name)

            pair_list = Tools.get_dataset_pairs(files_start_name)
            output_active, target_words = Tools.get_dataset_targets(files_start_name)
            
            result_filepath = Dicrectories.test(dataset_name,"all_phase2")
            with open(result_filepath, 'w') as file, redirect_stdout(file):
                tm = TMAutoEncoder(clauses, T, s, output_active, max_included_literals=3, accumulation=accumulation, feature_negation=False, platform='CPU', output_balancing=0.5)
                total_training = 0
                print("Epochs: %d" % epochs)
                print("Target words: %d" % len(target_words))
                print("No of features: %d" % number_of_features)
                print("Clauses: %d" % clauses)
                print("with_clause_update: %s" % with_clause_update)
                print("Examples: %d" % number_of_examples)
                print("Accumulation: %d" % accumulation)
                print("Sub Accumulation: %d" % sub_accumulation)
                print("true_weight: %f" % true_weight)
                print("false_weight: %f" % false_weight)
                print("top_max_clauses1: %d" % top_max_clauses1)
                print("top_max_clauses2: %d\n" % top_max_clauses2)
                
                epochs_progress_bar = tqdm(total=epochs, desc="Running Epochs")
                for e in range(epochs):
                    print("\nEpoch #: %d" % e)
                    start_training = time()
                    tm.knowledge_fit(
                        number_of_examples = number_of_examples,
                        number_of_features = number_of_features,
                        sub_accumulation = sub_accumulation,
                        top_max_clauses1 = top_max_clauses1,
                        top_max_clauses2 = top_max_clauses2,
                        neg_length = neg_length,
                        with_clause_update = with_clause_update,
                        true_weight = true_weight,
                        false_weight = false_weight,
                        print_c = False
                        )
                    stop_training = time()
                    epoch_time = stop_training - start_training
                    Tools.print_training_time(epoch_time)
                    total_training = total_training + epoch_time

                    profile = np.empty((len(target_words), clauses))
                    for i in range(len(target_words)):
                        weights = tm.get_weights(i)
                        profile[i,:] = np.where(weights >= clause_weight_threshold, weights, 0)
                    similarity = cosine_similarity(profile)
                    for i in range(len(target_words)):
                        sorted_index = np.argsort(-1*similarity[i,:])
                        for j in range(1, len(target_words)):
                            target_similarity[(target_words[i], target_words[sorted_index[j]])]  = similarity[i,sorted_index[j]]
                    spearman = eval.calculate(target_similarity,pair_list)
                    if spearman > max_spearman:
                        break
                    epochs_progress_bar.update(1)
                epochs_progress_bar.close()

                print("\n=====================================\nClauses\n=====================================")
                for j in range(clauses):
                    print("Clause #%-2d " % (j), end=' ')
                    for tw in range(len(target_words)):
                        print("%s:W%-5d " % (target_words[tw], tm.get_weight(tw, j)), end='| ')
                    l = [] 
                    number_of_literals = 0 
                    for k in range(tm.clause_bank.number_of_literals):
                        if tm.get_ta_action(j, k) == 1:
                            number_of_literals = number_of_literals + 1
                            if k < tm.clause_bank.number_of_features:
                                l.append("%s(%d)" % (feature_names[k], tm.clause_bank.get_ta_state(j, k)))
                            else:
                                l.append("¬%s(%d)" % (feature_names[k-tm.clause_bank.number_of_features], tm.clause_bank.get_ta_state(j, k)))
                    print(": No of features:%-6d" % (number_of_literals), end=" ==> ")
                    try:
                        print(" - ".join(l))
                    except UnicodeEncodeError:
                        print(" exception ")
                
                print("\n=====================================\nWord Similarity\n=====================================")
                max_word_length = len(max(target_words, key=len))
                list_of_words = []
                target_words_with_min_max = []
                for i in range(len(target_words)):
                    row_of_similarity = []
                    sorted_index = np.argsort(-1*similarity[i,:])
                    min_similarity = 1.0
                    max_similarity = 0.0
                    word_similarity = []
                    for j in range(1, len(target_words)):
                        target_similarity[(target_words[i], target_words[sorted_index[j]])]  = similarity[i,sorted_index[j]]
                        row_of_similarity.append(target_words[sorted_index[j]])
                        word_similarity.append("{:<{}}({:.2f})  ".format(target_words[sorted_index[j]], max_word_length, similarity[i, sorted_index[j]]))
                        if(min_similarity > similarity[i,sorted_index[j]]):
                            min_similarity = similarity[i,sorted_index[j]]
                        if(max_similarity < similarity[i,sorted_index[j]]):
                            max_similarity = similarity[i,sorted_index[j]]
                
                    output_line = f"{target_words[i]:<{max_word_length}}: Min:{min_similarity:.2f}, Max:{max_similarity:.2f}"
                    print(output_line, end='     ==> ')
                    print(word_similarity)
                    list_of_words.append(row_of_similarity)
                    target_words_with_min_max.append(output_line)

                Tools.print_training_time(total_training)
                
            eval = ResultHelper.get_file_max_spearman(result_filepath)
            dir_name, old_file_name = os.path.split(result_filepath)
            new_file_path = os.path.join(dir_name, "{:.2f}".format(eval) + "_"  + old_file_name)
            os.rename(result_filepath, new_file_path)

2024-06-12 07:31:42,507 - tmu.clause_bank.clause_bank_cuda - ERROR - cuInit failed: no CUDA-capable device is detected
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/tmu/clause_bank/clause_bank_cuda.py", line 43, in <module>
    import pycuda.autoinit
  File "/opt/conda/lib/python3.11/site-packages/pycuda/autoinit.py", line 5, in <module>
    cuda.init()
pycuda._driver.RuntimeError: cuInit failed: no CUDA-capable device is detected
2024-06-12 07:31:42,913 - numexpr.utils - INFO - Note: detected 96 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2024-06-12 07:31:42,914 - numexpr.utils - INFO - Note: NumExpr detected 96 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-06-12 07:31:42,914 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


Running Epochs:   7%|▋         | 7/100 [2:07:04<27:59:20, 1083.45s/it]

# Pairs Phase 2

In [6]:
import os
import numpy as np
from contextlib import redirect_stdout
from tqdm import tqdm
from time import time
from collections import defaultdict
from tmu.models.autoencoder.autoencoder import TMAutoEncoder
from sklearn.metrics.pairwise import cosine_similarity
from Evaluation import Evaluation
from Tools import Tools
from DirectoriesUtil import Dicrectories

import cProfile

clause_weight_threshold = 0
clause_drop_p = 0.0
factor = 4
T = factor*40
clauses = int(factor*30/(1.0 - clause_drop_p))
s = 5.0
epochs = 100
number_of_examples = 100
accumulation = 10
sub_accumulation = 14
top_max_clauses1 = 0
top_max_clauses2 = 0
with_clause_update = False
true_weight = 0.6
false_weight = 1 - true_weight

eval = Evaluation()
def preprocess_text(text):
    return text
vectorizer_X = Tools.read_pickle_data("vectorizer_X.pickle")
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = vectorizer_X.get_feature_names_out().shape[0]

for dataset_name in os.listdir(Dicrectories.datasets):
    if dataset_name == 'rg-65':
        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)
        if os.path.isdir(current_folder_path):
            files_start_name = os.path.join(current_folder_path, dataset_name)
            pair_list = Tools.get_dataset_pairs(files_start_name)
            output_active, target_words = Tools.get_dataset_targets(files_start_name)
            available_pair_list = []
            pairs_output_active = []
            for pair, score in pair_list:
                word1, word2 = pair[0], pair[1]
                if all(word in target_words for word in [word1, word2]):
                    available_pair_list.append([pair,score])
                    pairs_output_active.append([vectorizer_X.vocabulary_[word1], vectorizer_X.vocabulary_[word2]])
            
            result_filepath = Dicrectories.test(dataset_name,"pair_pair_phase2")
            with open(result_filepath, 'w') as file, redirect_stdout(file):
                total_training = 0
                print("Epochs: %d" % epochs)
                print("Target words: %d" % len(target_words))
                print("No of features: %d" % number_of_features)
                print("Clauses: %d" % clauses)
                print("with_clause_update: %s" % with_clause_update)
                print("Examples: %d" % number_of_examples)
                print("Accumulation: %d" % accumulation)
                print("Sub Accumulation: %d" % sub_accumulation)
                print("true_weight: %f" % true_weight)
                print("false_weight: %f" % false_weight)
                print("top_max_clauses1: %d" % top_max_clauses1)
                print("top_max_clauses2: %d\n" % top_max_clauses2)
                
                epochs_progress_bar = tqdm(total=epochs, desc="Running Epochs")
                for e in range(epochs):
                    print("\nEpoch #: %d" % e)
                    epoch_time = 0
                    target_similarity=defaultdict(list)
                    for pair_index, pair in enumerate(pairs_output_active):
                        pair_output_active = np.empty(2, dtype=np.uint32)
                        pair_output_active[0] = pair[0]
                        pair_output_active[1] = pair[1]
                        start_training = time()
                        tm = TMAutoEncoder(clauses, T, s, pair_output_active, max_included_literals=3, accumulation=accumulation, feature_negation=False, platform='CPU', output_balancing=0.5)
                        
                        # profile = cProfile.Profile() 
                        # profile.enable()
                        tm.knowledge_pair_fit(
                            number_of_examples = number_of_examples,
                            number_of_features = number_of_features,
                            sub_accumulation = sub_accumulation,
                            top_max_clauses1 = top_max_clauses1,
                            top_max_clauses2 = top_max_clauses2,
                            with_clause_update = with_clause_update,
                            true_weight = true_weight,
                            false_weight = false_weight,
                            print_c = False
                            )
                        # profile.disable()
                        # profile.print_stats(sort='time')

                        stop_training = time()
                        pair_time = stop_training - start_training
                        epoch_time = epoch_time + pair_time

                        profile = np.empty((2, clauses))
                        weights = tm.get_weights(0)
                        profile[0,:] = np.where(weights >= clause_weight_threshold, weights, 0)
                        weights = tm.get_weights(1)
                        profile[1,:] = np.where(weights >= clause_weight_threshold, weights, 0)
                        similarity = cosine_similarity(profile)

                        sorted_index = np.argsort(-1*similarity[0,:])
                        target_similarity[available_pair_list[pair_index][0]]  = similarity[0,sorted_index[1]]

                    Tools.print_training_time(epoch_time)
                    total_training = total_training + epoch_time
                    eval.calculate(target_similarity,available_pair_list)
                    epochs_progress_bar.update(1)
                epochs_progress_bar.close()
                Tools.print_training_time(total_training)

Running Epochs:   1%|          | 1/100 [10:51<17:55:15, 651.67s/it]

KeyboardInterrupt: 

# Check cached files

In [25]:
from Tools import Tools
num_cached = Tools.read_pickle_data.cache_info().currsize
print("Number of cached results:", num_cached)

Number of cached results: 2600


# Print Phase2 features

In [28]:
import random
from Tools import Tools
from DirectoriesUtil import Dicrectories

number_of_examples = 1
accumulation = 10
sub_accumulation = 14
top_max_clauses1 = 0
top_max_clauses2 = 0
first= True

for dataset_name in os.listdir(Dicrectories.datasets):
    if dataset_name == 'rg-65':
        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)
        if os.path.isdir(current_folder_path):
            files_start_name = os.path.join(current_folder_path, dataset_name)

            pair_list = Tools.get_dataset_pairs(files_start_name)
            output_active, target_words = Tools.get_dataset_targets(files_start_name)
            # for op in output_active:
                # print(vectorizer_X.get_feature_names_out()[op])
            
            number_of_clases = len(output_active)
            class_index = np.arange(number_of_clases, dtype=np.uint32)
            knowledge_directory = Dicrectories.knowledge

            avg_per_ex_features = 0
            documents_of_features = []
            for ex in range(number_of_examples):
                rng = np.random.RandomState(None)
                rng.shuffle(class_index)
                num_features = 0
                for index in class_index:
                    tw = output_active[index]
                    target_value = random.randint(0, 1)
    
                    tw_knowledge_path = Dicrectories.pickle_by_id(knowledge_directory , tw)
                    tw_all_clauses = Tools.read_pickle_data(tw_knowledge_path)
                    if target_value == 1:
                        tw_filtered_clauses = [clause for clause in tw_all_clauses if clause[0] > 0]
                    else:
                        tw_filtered_clauses = [clause for clause in tw_all_clauses if clause[0] < 0]
    
                    tw_clauses_subset = random.sample(tw_filtered_clauses, accumulation)
                    if(top_max_clauses1 > 0):
                        tw_clauses_subset = sorted(tw_clauses_subset, key=lambda x: x[0], reverse=True)[:top_max_clauses1]
    
                    for tw_clause in tw_clauses_subset:
                        related_literals = tw_clause[1]
                        for literal in related_literals:
                            documents_of_features.append(literal)
                            literal_knowledge_path = Dicrectories.pickle_by_id(knowledge_directory , literal)
                            literal_all_clauses = Tools.read_pickle_data(literal_knowledge_path)
                            if target_value == 1:
                                literal_filtered_clauses = [clause for clause in literal_all_clauses if clause[0] > 0]
                            else:
                                literal_filtered_clauses = [clause for clause in literal_all_clauses if clause[0] < 0]
                            
                            literal_clauses_subset = random.sample(literal_filtered_clauses, sub_accumulation)
                            if(top_max_clauses2 > 0):
                                literal_clauses_subset = sorted(literal_clauses_subset, key=lambda x: x[0], reverse=True)[:top_max_clauses2]
    
                            for literal_clause in literal_clauses_subset:
                                literals = literal_clause[1]
                                for sub_literal in literals:
                                    documents_of_features.append(sub_literal)
                    if first == True:
                        first = False
                        words = []
                        print(vectorizer_X.get_feature_names_out()[tw])
                        for feature in documents_of_features:
                            words.append(vectorizer_X.get_feature_names_out()[feature])
                        print(words)
                        break
                            
                    # print(len(documents_of_features))
                    # documents_of_features is all features will include for one class
                    num_features = num_features + len(documents_of_features)
                # avg_per_ex_features = avg_per_ex_features + (num_features / number_of_clases)
                # print(num_features / number_of_clases)
            # print(avg_per_ex_features/number_of_examples)
            print(len(documents_of_features))

forest
['land', 'farm', 'new', 'water', 'site', 'water', 'strip', 'rover', 'air', 'area', 'time', 'value', 'rover', 'space', 'construction', 'government', 'area', 'property', 'property', 'world', 'property', 'water', 'rover', 'burned', 'carbon', 'forest', 'coal', 'fire', 'black', 'wednesday', 'body', 'boy', 'ground', 'police', 'fire', 'water', 'family', 'fire', 'house', 'ground', 'death', 'death', 'people', 'blaze', 'death', 'put', 'understand', 'river', 'basin', 'new', 'pearl', 'bridge', 'people', 'valley', 'bridge', 'new', 'bridge', 'feet', 'flood', 'bridge', 'park', 'fish', 'north', 'red', 'area', 'water', 'year', 'north', 'water', 'new', 'plane', 'lawn', 'park', 'white', 'front', 'grass', 'park', 'tennis', 'house', 'park', 'south', 'association', 'tennis', 'tennis', 'white', 'garden', 'large', 'garden', 'week', 'white', 'front', 'week', 'association', 'club', 'house', 'association', 'tennis', 'garden', 'tennis', 'garden', 'south', 'rain', 'storm', 'weather', 'game', 'heavy', 'day',

# Check target_values balance

In [13]:
import random
import os
from directories import Dicrectories
from tools import Tools
import numpy as np

number_of_examples = 100
accumulation = 10
sub_accumulation = 14
top_max_clauses1 = 0
top_max_clauses2 = 0
first= True
target_values =[]
target_valuess = random.choices([True, False], weights=[0.5, 0.5], k=number_of_examples)

for dataset_name in os.listdir(Dicrectories.datasets):
    if dataset_name == 'rg-65':
        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)
        if os.path.isdir(current_folder_path):
            files_start_name = os.path.join(current_folder_path, dataset_name)

            pair_list = Tools.get_dataset_pairs(files_start_name + '.csv')
            output_active, target_words = Tools.get_targets(files_start_name)
            
            number_of_clases = len(output_active)
            class_index = np.arange(number_of_clases, dtype=np.uint32)

            avg_per_ex_features = 0
            documents_of_features = []
            for ex in range(number_of_examples):
                rng = np.random.RandomState(None)
                rng.shuffle(class_index)
                num_features = 0
                for index in class_index:
                    tw = output_active[index]
                    target_value = target_valuess[ex]
                    target_values.append(target_value)

num_true = sum(value == True for value in target_values)
num_false = sum(value == False for value in target_values)

print(f"Number of True values: {num_true}")
print(f"Number of False values: {num_false}")
print(target_valuess)

Number of True values: 1700
Number of False values: 1700
[True, False, False, True, True, False, True, True, True, False, False, True, False, True, False, True, True, False, False, True, False, True, False, False, True, False, True, False, False, True, False, True, True, False, True, False, True, True, False, True, True, False, False, False, False, False, False, False, False, True, False, True, True, True, False, True, True, True, True, True, False, True, True, False, True, False, True, False, False, True, True, True, True, False, True, True, True, True, False, False, True, False, False, True, False, True, False, False, True, True, False, False, False, False, False, True, False, False, False, True]


In [37]:
import numpy as np
from scipy.sparse import csr_matrix

output_balancing = 0.5
number_of_features = 2000
rng = np.random.RandomState(None)
num_samples = 200

target_values =[]
output_active = random.sample(range(num_samples + 1), num_samples)
# print(output_active)
X = csr_matrix((1, number_of_features), dtype=np.int64)

feature_true_probability = np.ones(X.shape[1], dtype=np.float32) * output_balancing
for op in output_active:
    target_true_p = feature_true_probability[op]
    target_value = rng.random() <= target_true_p
    target_values.append(target_value)

num_true = sum(value == True for value in target_values)
num_false = sum(value == False for value in target_values)

print(f"Number of True values: {num_true}")
print(f"Number of False values: {num_false}")

Number of True values: 110
Number of False values: 90


# Find positive and negative for tw

In [20]:
import random
import os
from Tools import Tools
from DirectoriesUtil import Dicrectories
import numpy as np

def preprocess_text(text):
    return text
number_of_examples = 1
accumulation = 10
sub_accumulation = 14
top_max_clauses1 = 5
top_max_clauses2 = 5
first= True
vectorizer_X = Tools.read_pickle_data("big_vectorizer_X.pickle")
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = vectorizer_X.get_feature_names_out().shape[0]

for dataset_name in os.listdir(Dicrectories.datasets):
    if dataset_name == 'rg-65':
        current_folder_path = os.path.join(Dicrectories.datasets, dataset_name)
        if os.path.isdir(current_folder_path):
            files_start_name = os.path.join(current_folder_path, dataset_name)

            pair_list = Tools.get_dataset_pairs(files_start_name)
            output_active, target_words = Tools.get_dataset_targets(files_start_name)
            # for op in output_active:
                # print(vectorizer_X.get_feature_names_out()[op])
            
            number_of_clases = len(output_active)
            class_index = np.arange(number_of_clases, dtype=np.uint32)
            knowledge_directory = Dicrectories.knowledge

            avg_per_ex_features = 0
            documents_of_features = []
            for ex in range(number_of_examples):
                rng = np.random.RandomState(None)
                rng.shuffle(class_index)
                num_features = 0

                pos_words = []
                neg_words = []
                for index in class_index:
                    tw = output_active[index]
                    target_value = random.randint(0, 1)
    
                    tw_knowledge_path = Dicrectories.pickle_by_id(knowledge_directory , tw)
                    tw_all_clauses = Tools.read_pickle_data(tw_knowledge_path)
                    pos_tw_filtered_clauses = [clause for clause in tw_all_clauses if clause[0] > 0]
                    pos_tw_clauses_subset = sorted(pos_tw_filtered_clauses, key=lambda x: x[0], reverse=True)[:top_max_clauses1]
                    neg_tw_filtered_clauses = [clause for clause in tw_all_clauses if clause[0] < 0]
                    neg_tw_clauses_subset = sorted(neg_tw_filtered_clauses, key=lambda x: x[0], reverse=True)[:top_max_clauses1]
    
                    for tw_clause in pos_tw_clauses_subset:
                        related_literals = tw_clause[1]
                        for literal in related_literals:
                            literal_knowledge_path = Dicrectories.pickle_by_id(knowledge_directory , literal)
                            literal_all_clauses = Tools.read_pickle_data(literal_knowledge_path)
                            literal_filtered_clauses = [clause for clause in literal_all_clauses if clause[0] > 0]
                            pos_literal_clauses_subset = sorted(literal_filtered_clauses, key=lambda x: x[0], reverse=True)[:top_max_clauses2]
                            
                            for literal_clause in pos_literal_clauses_subset:
                                literals = literal_clause[1]
                                for sub_literal in literals:
                                    pos_words.append(sub_literal)
                                    
                    for tw_clause in neg_tw_filtered_clauses:
                        related_literals = tw_clause[1]
                        for literal in related_literals:
                            literal_knowledge_path = Dicrectories.pickle_by_id(knowledge_directory , literal)
                            literal_all_clauses = Tools.read_pickle_data(literal_knowledge_path)
                            literal_filtered_clauses = [clause for clause in literal_all_clauses if clause[0] < 0]
                            neg_literal_clauses_subset = sorted(literal_filtered_clauses, key=lambda x: x[0], reverse=True)[:top_max_clauses2]
                            
                            for literal_clause in neg_literal_clauses_subset:
                                literals = literal_clause[1]
                                for sub_literal in literals:
                                    neg_words.append(sub_literal)
                                        
                    if first == True:
                        first = False
                        
                        words = []
                        print("Target word = ",vectorizer_X.get_feature_names_out()[tw])
                        for feature in pos_words:
                            words.append(vectorizer_X.get_feature_names_out()[feature])
                        print("Target word positive features = ",words)
                        
                        words = []
                        print("Target word = ",vectorizer_X.get_feature_names_out()[tw])
                        for feature in neg_words:
                            words.append(vectorizer_X.get_feature_names_out()[feature])
                        print("Target word negative features = ",words)
                        
                        break
                            
                    # print(len(documents_of_features))
                    # documents_of_features is all features will include for one class
                    num_features = num_features + len(documents_of_features)
                # avg_per_ex_features = avg_per_ex_features + (num_features / number_of_clases)
                # print(num_features / number_of_clases)
            # print(avg_per_ex_features/number_of_examples)
            print(len(documents_of_features))

Target word =  appeal
Target word positive features =  ['crown', 'last', 'said', 'case', 'judge', 'said', 'told', 'said', 'supreme', 'told', 'high', 'said', 'supreme', 'heard', 'ruled', 'said', 'ballots', 'election', 'campaign', 'clinton', 'ballots', 'election', 'said', 'candidates', 'party', 'said', 'one', 'percent', 'registered', 'said', 'require', 'hoped', 'said', 'hoped', 'hoped', 'predicted', 'crown', 'last', 'said', 'case', 'judge', 'said', 'told', 'said', 'supreme', 'told', 'high', 'said', 'supreme', 'heard', 'ruled', 'said', 'said', 'saves', 'saves', 'debut', 'debut', 'mistakes', 'spokeswoman', 'spokeswoman', 'spokeswoman', 'spokeswoman', 'analyst', 'ballots', 'election', 'campaign', 'clinton', 'ballots', 'election', 'said', 'candidates', 'party', 'said', 'one', 'percent', 'registered', 'said', 'spokeswoman', 'spokeswoman', 'spokeswoman', 'spokeswoman', 'analyst', 'crown', 'last', 'said', 'case', 'judge', 'said', 'told', 'said', 'supreme', 'told', 'high', 'said', 'supreme', 'he

KeyboardInterrupt: 

In [21]:
from collections import Counter
word_counts = Counter(words)
sorted_words = word_counts.most_common()
for word, count in sorted_words:
    print(f"{word}: {count}")

certificates: 37
jittery: 37
monks: 37
wounds: 37
rests: 37
illustrator: 37
ulrich: 37
said: 23
plate: 10
pantomime: 10
hobbit: 10
unanimous: 10
germain: 10
two: 7
blacktie: 7
criminality: 7
pathan: 7
commandos: 7
transformed: 7
also: 7
gastronomic: 5
new: 5
would: 5
didnt: 5
stephen: 5
minsk: 5
sindh: 5
rochus: 5
microfinance: 4
complexes: 4
starter: 4
wood: 4
cleveland: 4
bens: 4
nervy: 4
akmal: 4
comic: 4
modernised: 4
gasfired: 4
cluster: 4
christians: 4
canals: 4
rematch: 4
one: 3
percent: 3
government: 3
dioceses: 3
newry: 3
qualify: 3
emea: 3
grameen: 3
mistrial: 3
hastert: 3
multistate: 3
atwood: 3
plunges: 3
sunday: 3
blue: 3
mccullum: 3
meaningfully: 3
pga: 3
inviting: 3
course: 3
holdings: 3
nfc: 3
cirque: 3
titfortat: 3
saddened: 3
concludes: 3
reel: 3
repression: 3
around: 3
dear: 3
refurbish: 3
first: 3
year: 3
holtzeakin: 3
peacemaking: 3
measurement: 3
police: 2
exhibition: 2
stanford: 2
seminar: 2
captured: 2
study: 2
shoes: 2
vulcan: 2
winstonsalem: 2
tongues: 2
indus