In [1]:
import os
import sys
import nltk
import re
import numpy as np
import json
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

from textblob import TextBlob

# NEED TO NORMALIZE EVERYTHING TO WEIGHT IT PROPERLY


def load_corpus(path):
    """Read a text file and split it into sentences and word tokens.

    Newlines are replaced by spaces before sentence splitting, so a sentence
    may span several lines of the file.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read().replace('\n', ' ')
    data = sent_tokenize(data)

    # Every sentence is kept, even if stripping punctuation leaves no tokens,
    # so both lists stay index-aligned. (The former ``word != '.'`` guard
    # compared a list with a string, was always true and never skipped
    # anything.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def load_poem(path):
    """Read a text file and split it into sentences, keeping line breaks.

    Unlike ``load_corpus`` the newlines of the file are left in place before
    sentence splitting (intended for parsing poetry for enjambment).

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read()
    data = sent_tokenize(data)

    # Every sentence is kept so both lists stay index-aligned. (The former
    # ``word != '.'`` guard compared a list with a string and was always
    # true.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def remove_stopwords(corpus):
    """Drop English stop words from every tokenized sentence.

    Args:
        corpus: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A ``(filtered, removed_count)`` tuple: the sentences without their
        stop words, and a list giving, per sentence and in the same order,
        how many tokens were removed.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))

    filtered = []
    removed_count = []
    for sentence in corpus:
        words = list(sentence)
        kept = [word for word in words if word not in stop_words]
        filtered.append(kept)
        removed_count.append(len(words) - len(kept))
    # Return the per-sentence list, not just the last sentence's counter.
    return filtered, removed_count

def get_sentence_len(corpus):
    """Return a NumPy array holding ``len()`` of each sentence in ``corpus``."""
    return np.array([len(sentence) for sentence in corpus])

def process_sentence_length(sentences, reference_len=30):
    """Express sentence lengths as a percentage of a reference length.

    Args:
        sentences: Array-like of sentence lengths.
        reference_len: Length that maps to 100. Defaults to 30, the value
            previously hard-coded as "our max".

    Returns:
        NumPy array ``sentences / reference_len * 100``. Empty input yields
        an empty array.
    """
    # The old unused ``np.max`` call is gone: it raised on empty input.
    sentences = np.asarray(sentences)
    return (sentences / reference_len) * 100
    
def get_sentiment(corpus):
    """Return the TextBlob polarity of each sentence, multiplied by 100.

    Args:
        corpus: Iterable of sentences, each passed as-is to ``TextBlob``.

    Returns:
        NumPy array with one ``sentiment.polarity * 100`` value per sentence.
    """
    polarities = [TextBlob(sentence).sentiment.polarity for sentence in corpus]
    return np.array(polarities) * 100

def word_length(sentence):
    """Return the average length of the words in a tokenized sentence.

    Args:
        sentence: Sequence of word tokens.

    Returns:
        Mean ``len(word)`` as a float, or ``0.0`` for an empty sentence
        (e.g. one that consisted only of punctuation before tokenizing).
    """
    if not sentence:
        # Avoid ZeroDivisionError on sentences without any tokens.
        return 0.0
    total = sum(len(word) for word in sentence)
    return float(total) / len(sentence)

def adj_count(sentence):
    """Return the share of ``sentence`` tagged ``JJ`` or ``RB`` by TextBlob.

    The sentence is wrapped in a ``TextBlob``; every entry of its ``tags``
    whose tag is ``"JJ"`` or ``"RB"`` is counted and the count is divided by
    ``len(sentence)``.

    NOTE(review): when ``sentence`` is a plain string (as the callers in
    this file pass it) ``len(sentence)`` is the number of characters, not
    the number of words -- confirm this is the intended denominator.

    Args:
        sentence: The sentence to tag.

    Returns:
        The count divided by ``len(sentence)`` as a float.
    """
    wanted = ("JJ", "RB")
    hits = sum(1 for pair in TextBlob(sentence).tags if pair[1] in wanted)
    return float(hits) / len(sentence)

def normalize_adj(adj_array):
    """Weight the adjective scores by multiplying them by 10.

    Args:
        adj_array: NumPy array (or number) of adjective scores.

    Returns:
        ``adj_array * 10``.
    """
    # The factor is a hand-tuned weight chosen by the author.
    weight = 10
    return adj_array * weight
    

def rgb_to_hex(a):
    """Return the grey ``#rrggbb`` string that uses ``a`` for every channel.

    Args:
        a: Integer channel value, formatted as two-digit lowercase hex.
    """
    channel = '%02x' % (a,)
    return '#' + channel * 3
    
def main(data_dir, out_path='colors/cant_and_wont.json'):
    """Score every sentence of a text and write one grey colour per sentence.

    Each sentence gets a score built from its scaled length, its average
    word length and its weighted adjective score. The scores are reshaped,
    min-max scaled to 0..255 and written as ``#rrggbb`` strings to a JSON
    list. Intermediate values are printed for inspection.

    Args:
        data_dir: Path of the text file to analyse (passed to
            ``load_corpus``).
        out_path: Where the JSON list of colours is written. Missing parent
            directories are created.

    Raises:
        ValueError: If the file contains no sentences.
    """
    # gets the data in the formatting we want
    sentence_lists, corpus = load_corpus(data_dir)
    if not sentence_lists:
        raise ValueError("no sentences found in %r" % (data_dir,))

    sentence_lengths = process_sentence_length(get_sentence_len(corpus))
    sentiment_of_sentence = get_sentiment(sentence_lists)

    adjective_count = normalize_adj(
        np.array([adj_count(sentence) for sentence in sentence_lists]))
    word_lengths = np.array([word_length(sentence) for sentence in corpus])

    # to combine: add sentence and word length, divide by the mean of that
    # sum, then add on the weighted adjective score
    final_arr = np.add(sentence_lengths, word_lengths)
    final_arr = (final_arr / np.average(final_arr)) + adjective_count

    max_num = np.max(final_arr)
    final_list = final_arr.tolist()

    print(final_arr)
    avg = np.average(final_arr)
    sd = np.std(final_arr)
    print(sd)

    # experimental reshaping that tries to inflate the higher scores
    for i, score in enumerate(final_list):
        if (avg - sd) < score < (max_num - sd):
            more_than = score / avg  # score relative to the mean
            final_list[i] = 1 + (sd * more_than)
        elif score > 3 * sd:
            final_list[i] = score - 3 * sd

    # scales the list into colors from 0 to 255 for rgb
    # formula is (x_i - min(x)) / (max(x) - min(x))
    nump = np.array(final_list)
    new_min = np.min(nump)
    span = np.max(nump) - new_min

    color_list = []
    for num in final_list:
        if span:
            x_i = (num - new_min) / span
        else:
            # All scores equal (e.g. a single sentence): avoid 0/0.
            x_i = 0.0
        color_list.append(int(x_i * 255))

    print(color_list)

    for x, sentence in enumerate(sentence_lists):
        print(x + 1)
        print(sentence)
        print("sentence + word", sentence_lengths[x], word_lengths[x])
        print("adj", adjective_count[x])
        print("final array", final_arr[x])
        print("sentence_length", sentence_lengths[x])
        print("sentiment of sentence", sentiment_of_sentence[x])
        print(color_list[x])
        print("         ")

    # convert everything to hex
    color_list = [rgb_to_hex(value) for value in color_list]

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w') as outfile:
        json.dump(color_list, outfile, indent=4)


    
# Script entry point: the path to the input text is specified manually here
# (despite its name, data_dir is the path of a single text file).
# This may take a little bit of time (~30-60 seconds) to run.
if __name__ == '__main__':
    data_dir = 'text/cant_and_wont.txt'
    main(data_dir)
    
    

[0.84372605 0.86577179 1.301581   0.57883034 1.05201817 3.31532217]
0.9161263956256981
[194, 197, 255, 159, 222, 0]
1
I had had a feeling of freedom because of the sudden change in my life.
sentence + word 50.0 3.7333333333333334
adj 0.14084507042253522
final array 0.843726051161871
sentence_length 50.0
sentiment of sentence 0.0
194
         
2
By comparison to what had come before, I felt immensely free.
sentence + word 36.666666666666664 4.454545454545454
adj 0.3278688524590164
final array 0.8657717911367057
sentence_length 36.666666666666664
sentiment of sentence 40.0
197
         
3
But then, once I became used to that freedom, even small tasks became more difficult.
sentence + word 50.0 4.533333333333333
adj 0.5882352941176471
final array 1.3015810040987597
sentence_length 50.0
sentiment of sentence -8.333333333333332
255
         
4
I placed constraints on myself, and filled the hours of the day.
sentence + word 40.0 4.25
adj 0.0
final array 0.5788303361857744
sentence_length 40.

In [11]:
import os
import sys
import nltk
import re
import numpy as np
import json
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

from textblob import TextBlob

# NEED TO NORMALIZE EVERYTHING TO WEIGHT IT PROPERLY


def load_corpus(path):
    """Read a text file and split it into sentences and word tokens.

    Newlines are replaced by spaces before sentence splitting, so a sentence
    may span several lines of the file.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read().replace('\n', ' ')
    data = sent_tokenize(data)

    # Every sentence is kept, even if stripping punctuation leaves no tokens,
    # so both lists stay index-aligned. (The former ``word != '.'`` guard
    # compared a list with a string, was always true and never skipped
    # anything.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def load_poem(path):
    """Read a text file and split it into sentences, keeping line breaks.

    Unlike ``load_corpus`` the newlines of the file are left in place before
    sentence splitting (intended for parsing poetry for enjambment).

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read()
    data = sent_tokenize(data)

    # Every sentence is kept so both lists stay index-aligned. (The former
    # ``word != '.'`` guard compared a list with a string and was always
    # true.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def remove_stopwords(corpus):
    """Drop English stop words from every tokenized sentence.

    Args:
        corpus: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A ``(filtered, removed_count)`` tuple: the sentences without their
        stop words, and a list giving, per sentence and in the same order,
        how many tokens were removed.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))

    filtered = []
    removed_count = []
    for sentence in corpus:
        words = list(sentence)
        kept = [word for word in words if word not in stop_words]
        filtered.append(kept)
        removed_count.append(len(words) - len(kept))
    return filtered, removed_count

def get_sentence_len(corpus):
    """Return a NumPy array holding ``len()`` of each sentence in ``corpus``."""
    return np.array([len(sentence) for sentence in corpus])

def process_sentence_length(sentences, reference_len=30):
    """Express sentence lengths as a percentage of a reference length.

    Args:
        sentences: Array-like of sentence lengths.
        reference_len: Length that maps to 100. Defaults to 30, the value
            previously hard-coded as "our max".

    Returns:
        NumPy array ``sentences / reference_len * 100``. Empty input yields
        an empty array.
    """
    # The old unused ``np.max`` call is gone: it raised on empty input.
    sentences = np.asarray(sentences)
    return (sentences / reference_len) * 100
    
def get_sentiment(corpus):
    """Return the TextBlob polarity of each sentence, multiplied by 100.

    Args:
        corpus: Iterable of sentences, each passed as-is to ``TextBlob``.

    Returns:
        NumPy array with one ``sentiment.polarity * 100`` value per sentence.
    """
    polarities = [TextBlob(sentence).sentiment.polarity for sentence in corpus]
    return np.array(polarities) * 100

def word_length(sentence):
    """Return the average length of the words in a tokenized sentence.

    Args:
        sentence: Sequence of word tokens.

    Returns:
        Mean ``len(word)`` as a float, or ``0.0`` for an empty sentence
        (e.g. one that consisted only of punctuation before tokenizing).
    """
    if not sentence:
        # Avoid ZeroDivisionError on sentences without any tokens.
        return 0.0
    total = sum(len(word) for word in sentence)
    return float(total) / len(sentence)

def adj_count(sentence):
    """Return the share of ``sentence`` tagged ``JJ`` or ``RB`` by TextBlob.

    The sentence is wrapped in a ``TextBlob``; every entry of its ``tags``
    whose tag is ``"JJ"`` or ``"RB"`` is counted and the count is divided by
    ``len(sentence)``.

    NOTE(review): when ``sentence`` is a plain string (as the callers in
    this file pass it) ``len(sentence)`` is the number of characters, not
    the number of words -- confirm this is the intended denominator.

    Args:
        sentence: The sentence to tag.

    Returns:
        The count divided by ``len(sentence)`` as a float.
    """
    wanted = ("JJ", "RB")
    hits = sum(1 for pair in TextBlob(sentence).tags if pair[1] in wanted)
    return float(hits) / len(sentence)

def normalize_adj(adj_array):
    """Weight the adjective scores by multiplying them by 10.

    Args:
        adj_array: NumPy array (or number) of adjective scores.

    Returns:
        ``adj_array * 10``.
    """
    # The factor is a hand-tuned weight chosen by the author.
    weight = 10
    return adj_array * weight
    

def rgb_to_hex(a):
    """Return the grey ``#rrggbb`` string that uses ``a`` for every channel.

    Args:
        a: Integer channel value, formatted as two-digit lowercase hex.
    """
    channel = '%02x' % (a,)
    return '#' + channel * 3
    
def main(data_dir, out_path='colors/vonnegut.json'):
    """Score every sentence of a text and write colour plus stop-word count.

    Each sentence gets a score built from its scaled length, its average
    word length and its weighted adjective score. The scores are reshaped,
    min-max scaled to 0..255 and turned into ``#rrggbb`` strings. The JSON
    output maps each sentence index to ``{"removed": ..., "color": ...}``,
    where ``removed`` is the number of stop words in that sentence.
    Intermediate values are printed for inspection.

    Args:
        data_dir: Path of the text file to analyse (passed to
            ``load_corpus``).
        out_path: Where the JSON object is written. Missing parent
            directories are created.

    Raises:
        ValueError: If the file contains no sentences.
    """
    # gets the data in the formatting we want
    sentence_lists, corpus = load_corpus(data_dir)
    if not sentence_lists:
        raise ValueError("no sentences found in %r" % (data_dir,))

    for c, sentence in enumerate(sentence_lists):
        print(c, sentence)

    sentence_lengths = process_sentence_length(get_sentence_len(corpus))

    adjective_count = normalize_adj(
        np.array([adj_count(sentence) for sentence in sentence_lists]))
    word_lengths = np.array([word_length(sentence) for sentence in corpus])

    # to combine: add sentence and word length, divide by the mean of that
    # sum, then add on the weighted adjective score
    final_arr = np.add(sentence_lengths, word_lengths)
    final_arr = (final_arr / np.average(final_arr)) + adjective_count

    max_num = np.max(final_arr)
    final_list = final_arr.tolist()

    print(final_arr)
    avg = np.average(final_arr)
    sd = np.std(final_arr)
    print(sd)

    # experimental reshaping that tries to inflate the higher scores
    for i, score in enumerate(final_list):
        if (avg - sd) < score < (max_num - sd):
            more_than = score / avg  # score relative to the mean
            final_list[i] = 1 + (sd * more_than)
        elif score > 3 * sd:
            final_list[i] = score - 3 * sd

    # scales the list into colors from 0 to 255 for rgb
    # formula is (x_i - min(x)) / (max(x) - min(x))
    nump = np.array(final_list)
    new_min = np.min(nump)
    span = np.max(nump) - new_min

    color_list = []
    for num in final_list:
        if span:
            x_i = (num - new_min) / span
        else:
            # All scores equal (e.g. a single sentence): avoid 0/0.
            x_i = 0.0
        color_list.append(int(x_i * 255))

    print(color_list)

    # convert everything to hex
    color_list = [rgb_to_hex(value) for value in color_list]
    print(color_list)

    # only the per-sentence stop-word counts are needed from here on
    _filtered, removed_count = remove_stopwords(corpus)
    print(removed_count)

    dict_count = {}
    for c, color in enumerate(color_list):
        dict_count[c] = {"removed": removed_count[c], "color": color}

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w') as outfile:
        json.dump(dict_count, outfile, indent=4)


    
# Script entry point: the path to the input text is specified manually here
# (despite its name, data_dir is the path of a single text file).
# This may take a little bit of time (~30-60 seconds) to run.
if __name__ == '__main__':
    data_dir = 'text/vonnegut_sirens_of_titan.txt'
    main(data_dir)
    
    

0 Mankind, ignorant of the truths that lie within every human being, looked outward—pushed ever outward.
1 What mankind hoped to learn in its outward push was who was actually in charge of all creation, and what all creation was all about.
2 Mankind flung its advance agents ever outward, ever outward.
3 Eventually it flung them out into space, into the colorless, tasteless, weightless sea of outwardness without end.
4 It flung them like stones.
5 These unhappy agents found that what had already been found in abundance on Earth—a nightmare of meaninglessness without end.
6 The bounties of space, of infinite outwardness, were three: empty heroics, low comedy, and pointless death.
7 Outwardness lost, at last, its imagined attractions.
8 Only inwardness remained to be explored.
9 Only the human soul remain terra incognita.
10 This was the beginning of goodness and wisdom.
[1.50665861 1.98558229 1.44207256 1.53182876 0.45506633 1.66171125
 1.65416018 1.02744072 0.80974613 1.31869146 0.68514

In [13]:
import os
import sys
import nltk
import re
import numpy as np
import json
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

from textblob import TextBlob

# NEED TO NORMALIZE EVERYTHING TO WEIGHT IT PROPERLY


def load_corpus(path):
    """Read a text file and split it into sentences and word tokens.

    Newlines are replaced by spaces before sentence splitting, so a sentence
    may span several lines of the file.

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read().replace('\n', ' ')
    data = sent_tokenize(data)

    # Every sentence is kept, even if stripping punctuation leaves no tokens,
    # so both lists stay index-aligned. (The former ``word != '.'`` guard
    # compared a list with a string, was always true and never skipped
    # anything.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def load_poem(path):
    """Read a text file and split it into sentences, keeping line breaks.

    Unlike ``load_corpus`` the newlines of the file are left in place before
    sentence splitting (intended for parsing poetry for enjambment).

    Args:
        path: Path of a UTF-8 encoded text file.

    Returns:
        A ``(data, tokenized_data)`` tuple. ``data`` is the list of sentence
        strings returned by ``sent_tokenize``. ``tokenized_data`` holds, for
        each sentence in the same order, the list of lower-cased tokens
        returned by ``word_tokenize`` after every character that is neither
        a word character nor whitespace has been removed.
    """
    with open(path, "r", encoding='utf-8') as f:
        data = f.read()
    data = sent_tokenize(data)

    # Every sentence is kept so both lists stay index-aligned. (The former
    # ``word != '.'`` guard compared a list with a string and was always
    # true.)
    tokenized_data = [
        word_tokenize(re.sub(r'[^\w\s]', '', sent).lower())
        for sent in data
    ]
    return data, tokenized_data

def remove_stopwords(corpus):
    """Drop English stop words from every tokenized sentence.

    Args:
        corpus: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A ``(filtered, removed_count)`` tuple: the sentences without their
        stop words, and a list giving, per sentence and in the same order,
        how many tokens were removed.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))

    filtered = []
    removed_count = []
    for sentence in corpus:
        words = list(sentence)
        kept = [word for word in words if word not in stop_words]
        filtered.append(kept)
        removed_count.append(len(words) - len(kept))
    return filtered, removed_count

def get_sentence_len(corpus):
    """Return a NumPy array holding ``len()`` of each sentence in ``corpus``."""
    return np.array([len(sentence) for sentence in corpus])

def process_sentence_length(sentences, reference_len=30):
    """Express sentence lengths as a percentage of a reference length.

    Args:
        sentences: Array-like of sentence lengths.
        reference_len: Length that maps to 100. Defaults to 30, the value
            previously hard-coded as "our max".

    Returns:
        NumPy array ``sentences / reference_len * 100``. Empty input yields
        an empty array.
    """
    # The old unused ``np.max`` call is gone: it raised on empty input.
    sentences = np.asarray(sentences)
    return (sentences / reference_len) * 100
    
def get_sentiment(corpus):
    """Return the TextBlob polarity of each sentence, multiplied by 100.

    Args:
        corpus: Iterable of sentences, each passed as-is to ``TextBlob``.

    Returns:
        NumPy array with one ``sentiment.polarity * 100`` value per sentence.
    """
    polarities = [TextBlob(sentence).sentiment.polarity for sentence in corpus]
    return np.array(polarities) * 100

def word_length(sentence):
    """Return the average length of the words in a tokenized sentence.

    Args:
        sentence: Sequence of word tokens.

    Returns:
        Mean ``len(word)`` as a float, or ``0.0`` for an empty sentence
        (e.g. one that consisted only of punctuation before tokenizing).
    """
    if not sentence:
        # Avoid ZeroDivisionError on sentences without any tokens.
        return 0.0
    total = sum(len(word) for word in sentence)
    return float(total) / len(sentence)

def adj_count(sentence):
    """Return the share of ``sentence`` tagged ``JJ`` or ``RB`` by TextBlob.

    The sentence is wrapped in a ``TextBlob``; every entry of its ``tags``
    whose tag is ``"JJ"`` or ``"RB"`` is counted and the count is divided by
    ``len(sentence)``.

    NOTE(review): when ``sentence`` is a plain string, ``len(sentence)`` is
    the number of characters, not the number of words -- confirm this is the
    intended denominator.

    Args:
        sentence: The sentence to tag.

    Returns:
        The count divided by ``len(sentence)`` as a float.
    """
    wanted = ("JJ", "RB")
    hits = sum(1 for pair in TextBlob(sentence).tags if pair[1] in wanted)
    return float(hits) / len(sentence)

def normalize_adj(adj_array):
    """Weight the adjective scores by multiplying them by 10.

    Args:
        adj_array: NumPy array (or number) of adjective scores.

    Returns:
        ``adj_array * 10``.
    """
    # The factor is a hand-tuned weight chosen by the author.
    weight = 10
    return adj_array * weight
    

def rgb_to_hex(a):
    """Return the grey ``#rrggbb`` string that uses ``a`` for every channel.

    Args:
        a: Integer channel value, formatted as two-digit lowercase hex.
    """
    channel = '%02x' % (a,)
    return '#' + channel * 3
    
def main(data_dir, out_path='colors/cant_and_wont.json'):
    """Score sentences by length and write the grey colours sorted dark to light.

    This variant scores each sentence on its scaled length plus its average
    word length only, divided by the mean of that sum. The scores are
    min-max scaled to 0..255, sorted ascending, converted to ``#rrggbb``
    strings and written as a JSON list. The sentences are printed in the
    same (ascending colour) order, along with intermediate values.

    Args:
        data_dir: Path of the text file to analyse (passed to
            ``load_corpus``).
        out_path: Where the JSON list of colours is written. Missing parent
            directories are created.

    Raises:
        ValueError: If the file contains no sentences.
    """
    # gets the data in the formatting we want
    sentence_lists, corpus = load_corpus(data_dir)
    if not sentence_lists:
        raise ValueError("no sentences found in %r" % (data_dir,))

    for c, sentence in enumerate(sentence_lists):
        print(c, sentence)

    sentence_lengths = process_sentence_length(get_sentence_len(corpus))
    word_lengths = np.array([word_length(sentence) for sentence in corpus])

    final_arr = np.add(sentence_lengths, word_lengths)
    final_arr = final_arr / np.average(final_arr)
    final_list = final_arr.tolist()

    print(final_arr)
    sd = np.std(final_arr)
    print(sd)

    # scales the list into colors from 0 to 255 for rgb
    # formula is (x_i - min(x)) / (max(x) - min(x))
    nump = np.array(final_list)
    new_min = np.min(nump)
    span = np.max(nump) - new_min

    color_list = []
    for num in final_list:
        if span:
            x_i = (num - new_min) / span
        else:
            # All scores equal (e.g. a single sentence): avoid 0/0.
            x_i = 0.0
        color_list.append(int(x_i * 255))

    print(color_list)

    # sentence indices ordered from darkest to lightest colour
    args = np.argsort(np.array(color_list))
    print(args)

    # sort the colours themselves, then convert everything to hex
    color_list = [rgb_to_hex(value) for value in sorted(color_list)]
    print(color_list)

    for number in args:
        print(sentence_lists[number])

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_path, 'w') as outfile:
        json.dump(color_list, outfile, indent=4)


    
# Script entry point: the path to the input text is specified manually here
# (despite its name, data_dir is the path of a single text file).
# This may take a little bit of time (~30-60 seconds) to run.
if __name__ == '__main__':
    data_dir = 'text/cant_and_wont.txt'
    main(data_dir)
    
    

0 I had had a feeling of freedom because of the sudden change in my life.
1 By comparison to what had come before, I felt immensely free.
2 But then, once I became used to that freedom, even small tasks became more difficult.
3 I placed constraints on myself, and filled the hours of the day.
4 Or perhaps it was even more complicated than that.
5 Sometimes I did exactly what I wanted to do all day—I lay on the sofa and read a book, or I typed up an old diary—and then the most terrifying sort of despair would descend on me: the very freedom I was enjoying seemed to say that what I did in my day was arbitrary, and that therefore my whole life and how I spent it was arbitrary.
[0.70288098 0.53790294 0.71334571 0.57883034 0.45201817 3.01502187]
0.9057193452995492
[24, 8, 26, 12, 0, 255]
[4 1 3 0 2 5]
['#000000', '#080808', '#0c0c0c', '#181818', '#1a1a1a', '#ffffff']
Or perhaps it was even more complicated than that.
By comparison to what had come before, I felt immensely free.
I placed cons