In [2]:
#
# ... file : lexical_diversity_metrics.py
#
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...
# ... msds 7337 NLP
# ... homework 02
# ... gutenberg - documment vocabulary normalization
# ... pmcdevitt@smu.edu
# ... 15-sep-2018
# ...
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... load packages
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import os
import re
import math
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.probability import ConditionalFreqDist

get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
plt.rc('xtick', labelsize=20)     
plt.rc('ytick', labelsize=20)

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... some directory and file name definitions
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

files = ".*\.txt"
home_dir = "/home/mcdevitt/_ds/_smu/_src/nlp/homework_02/"
corpus_root = "./text/"
corpus_clean = "./text_no_license/"
plot_dir = "./plots/"

os.chdir(home_dir)

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... read in saved .txt files (.csv) from normalizations
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

file_wordl = 'max_word_length_normalized.csv'
file_vocab = 'vocab_normalized.csv'

selected_texts = [
    "mcguffey's_first_eclectic_reader,_revised_edition.txt", 
    "mcguffey's_second_eclectic_reader.txt", 
    "mcguffey's_third_eclectic_reader.txt", 
    "mcguffey's_fourth_eclectic_reader.txt", 
    "mcguffey's_fifth_eclectic_reader.txt", 
    "mcguffey's_sixth_eclectic_reader.txt",
    "new_national_first_reader.txt",
    "the_ontario_high_school_reader.txt",
    "the_literary_world_seventh_reader.txt"
]

df_wordl = pd.read_csv(file_wordl)
df_vocab = pd.read_csv(file_vocab)

df_vocab[:10]

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... condense to normalized metrics and selected texts
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

df_vocab_subset = df_vocab.loc[df_vocab['text_name'].isin(selected_texts)]
df_wordl_subset = df_wordl.loc[df_wordl['text_name'].isin(selected_texts)]

df_wordl_subset = df_wordl_subset[['text_name','long_word_length']]
df_vocab_subset = df_vocab_subset[['text_name', 'title_short', 'num_vocab', 'lex_div']]

df_wordl_subset = df_wordl_subset.sort_values(df_wordl_subset.columns[0])
df_vocab_subset = df_vocab_subset.sort_values(df_vocab_subset.columns[0])

df_wordl_subset = df_wordl_subset.reset_index(drop=True)
df_vocab_subset = df_vocab_subset.reset_index(drop=True)

df_wordl_subset = df_wordl_subset[['long_word_length']]

df_metrics = pd.concat([df_vocab_subset, df_wordl_subset], axis = 1)
df_metrics.columns = ['text_name', 'title_short', 'vocab_nrml', 'lex_div_nrml', 'word_length_nrml']

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... combine all 3 normalized metrics to total score
# ... - use addition (sum_scores) and multiplication (mlt_scores)
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

df_metrics['sum_scores'] = df_metrics['vocab_nrml'] + df_metrics['lex_div_nrml'] + df_metrics['word_length_nrml']
df_metrics['mlt_scores'] = df_metrics['vocab_nrml'] * df_metrics['lex_div_nrml'] * df_metrics['word_length_nrml']
df_metrics['sss_scores'] = df_metrics['vocab_nrml'] * df_metrics['vocab_nrml'] + \
                            df_metrics['lex_div_nrml'] * df_metrics['lex_div_nrml'] + \
                            df_metrics['word_length_nrml'] * df_metrics['word_length_nrml']

df_metrics['sss_scores'] = math.sqrt(df_metrics['sss_scores'])

df_metrics = df_metrics.sort_values(df_metrics.columns[5], ascending = False)
df_metrics
df_metrics = df_metrics.sort_values(df_metrics.columns[6], ascending = False)
df_metrics
df_metrics = df_metrics.sort_values(df_metrics.columns[7], ascending = False)
df_metrics

Unnamed: 0,text_name,num_chars,num_words,num_sents,num_vocab,tokens,types,lex_div,vocab_ldiv,title_short
0,mcguffey's_sixth_eclectic_reader.txt,0.620284,0.595362,0.524434,1.0,0.595362,1.0,0.389382,1.0,mcguffey's_sixth_eclectic
1,"the_ontario_readers:_the_high_school_reader,_1...",0.578756,0.544632,0.367759,0.98508,0.544632,0.972309,0.413865,0.972309,the_ontario_readers:_the_
2,sanders'_union_fourth_reader.txt,0.504808,0.503093,0.629738,0.981658,0.503093,0.987752,0.455153,0.987752,sanders'_union_fourth_rea
3,"a_modern_history,_from_the_time_of_luther_to_t...",0.985318,0.870958,0.642908,0.919513,0.870958,0.881486,0.234626,0.881486,"a_modern_history,_from_th"
4,first_italian_readings.txt,0.26823,0.259063,0.375685,0.88276,0.259063,0.804627,0.720023,0.804627,first_italian_readings.tx
5,mcguffey's_fifth_eclectic_reader.txt,0.446147,0.448834,0.513766,0.821641,0.448834,0.832318,0.429893,0.832318,mcguffey's_fifth_eclectic
6,modern_prose_and_poetry_for_secondary_schools.txt,0.421361,0.397253,0.339631,0.789131,0.397253,0.784806,0.457986,0.784806,modern_prose_and_poetry_f
7,the_literary_world_seventh_reader.txt,0.458234,0.450569,0.37795,0.770926,0.450569,0.734986,0.378159,0.734986,the_literary_world_sevent
8,"poems_teachers_ask_for,_book_two.txt",0.41418,0.417088,0.284148,0.723017,0.417088,0.72404,0.402432,0.72404,"poems_teachers_ask_for,_b"
9,a_school_history_of_the_united_states.txt,0.6678,0.613324,0.568057,0.714804,0.613324,0.718833,0.271704,0.718833,a_school_history_of_the_u


Unnamed: 0,text_name,title_short,vocab_nrml,lex_div_nrml,word_length_nrml,sum_scores,mlt_scores,sss_scores
4,mcguffey's_sixth_eclectic_reader.txt,mcguffey's_sixth_eclectic,1.0,0.389382,0.73913,2.128513,0.287804,1.697932
0,mcguffey's_fifth_eclectic_reader.txt,mcguffey's_fifth_eclectic,0.821641,0.429893,0.695652,1.947186,0.245717,1.343834
7,the_literary_world_seventh_reader.txt,the_literary_world_sevent,0.770926,0.378159,0.695652,1.844737,0.202805,1.221263
8,the_ontario_high_school_reader.txt,the_ontario_high_school_r,0.671207,0.448132,0.695652,1.814991,0.209245,1.135273
2,mcguffey's_fourth_eclectic_reader.txt,mcguffey's_fourth_eclecti,0.604065,0.461787,0.652174,1.718026,0.181923,1.003473
1,"mcguffey's_first_eclectic_reader,_revised_edit...",mcguffey's_first_eclectic,0.125453,0.633082,0.652174,1.410709,0.051797,0.841862
3,mcguffey's_second_eclectic_reader.txt,mcguffey's_second_eclecti,0.232085,0.599927,0.565217,1.39723,0.078698,0.733247
5,mcguffey's_third_eclectic_reader.txt,mcguffey's_third_eclectic,0.275614,0.485236,0.608696,1.369546,0.081406,0.681928
6,new_national_first_reader.txt,new_national_first_reader,0.11197,0.492067,0.652174,1.256212,0.035933,0.679998


Unnamed: 0,text_name,title_short,vocab_nrml,lex_div_nrml,word_length_nrml,sum_scores,mlt_scores,sss_scores
4,mcguffey's_sixth_eclectic_reader.txt,mcguffey's_sixth_eclectic,1.0,0.389382,0.73913,2.128513,0.287804,1.697932
0,mcguffey's_fifth_eclectic_reader.txt,mcguffey's_fifth_eclectic,0.821641,0.429893,0.695652,1.947186,0.245717,1.343834
8,the_ontario_high_school_reader.txt,the_ontario_high_school_r,0.671207,0.448132,0.695652,1.814991,0.209245,1.135273
7,the_literary_world_seventh_reader.txt,the_literary_world_sevent,0.770926,0.378159,0.695652,1.844737,0.202805,1.221263
2,mcguffey's_fourth_eclectic_reader.txt,mcguffey's_fourth_eclecti,0.604065,0.461787,0.652174,1.718026,0.181923,1.003473
5,mcguffey's_third_eclectic_reader.txt,mcguffey's_third_eclectic,0.275614,0.485236,0.608696,1.369546,0.081406,0.681928
3,mcguffey's_second_eclectic_reader.txt,mcguffey's_second_eclecti,0.232085,0.599927,0.565217,1.39723,0.078698,0.733247
1,"mcguffey's_first_eclectic_reader,_revised_edit...",mcguffey's_first_eclectic,0.125453,0.633082,0.652174,1.410709,0.051797,0.841862
6,new_national_first_reader.txt,new_national_first_reader,0.11197,0.492067,0.652174,1.256212,0.035933,0.679998


Unnamed: 0,text_name,title_short,vocab_nrml,lex_div_nrml,word_length_nrml,sum_scores,mlt_scores,sss_scores
4,mcguffey's_sixth_eclectic_reader.txt,mcguffey's_sixth_eclectic,1.0,0.389382,0.73913,2.128513,0.287804,1.697932
0,mcguffey's_fifth_eclectic_reader.txt,mcguffey's_fifth_eclectic,0.821641,0.429893,0.695652,1.947186,0.245717,1.343834
7,the_literary_world_seventh_reader.txt,the_literary_world_sevent,0.770926,0.378159,0.695652,1.844737,0.202805,1.221263
8,the_ontario_high_school_reader.txt,the_ontario_high_school_r,0.671207,0.448132,0.695652,1.814991,0.209245,1.135273
2,mcguffey's_fourth_eclectic_reader.txt,mcguffey's_fourth_eclecti,0.604065,0.461787,0.652174,1.718026,0.181923,1.003473
1,"mcguffey's_first_eclectic_reader,_revised_edit...",mcguffey's_first_eclectic,0.125453,0.633082,0.652174,1.410709,0.051797,0.841862
3,mcguffey's_second_eclectic_reader.txt,mcguffey's_second_eclecti,0.232085,0.599927,0.565217,1.39723,0.078698,0.733247
5,mcguffey's_third_eclectic_reader.txt,mcguffey's_third_eclectic,0.275614,0.485236,0.608696,1.369546,0.081406,0.681928
6,new_national_first_reader.txt,new_national_first_reader,0.11197,0.492067,0.652174,1.256212,0.035933,0.679998


In [1]:
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... plot comparison of all (normalized) metrics - 
# ... sorted in descending mlt_scores order
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

N = len(df_metrics)

ind = np.arange(N) 
width = 0.18

_ = plt.figure(figsize = (18, 12))

offset = width / 2
offset = 0
_ = plt.bar(ind + offset, df_metrics['lex_div_nrml'],
            width, label='Lex_Div', color = 'tomato')
_ = plt.bar(ind + offset + width, df_metrics['vocab_nrml'],
            width, label='Vocab', color = 'dodgerblue', alpha = 0.9)
_ = plt.bar(ind + offset + width*2, df_metrics['word_length_nrml'],
            width, label='Word Length', color = 'slateblue', alpha = 0.9)

#_ = plt.bar(ind + offset + width*3, df_metrics['sum_scores'], width, label='Sum Scores', color = 'orchid', alpha = 0.9)

_ = plt.bar(ind + offset + width*4, df_metrics['mlt_scores']*4,
            width,
            label='Mult Scores',
            color = 'darkolivegreen',
            alpha = 0.9)
_ = plt.bar(ind + offset + width*5, df_metrics['sss_scores']*4,
            width,
            label='Mult Scores',
            color = 'orchid',
            alpha = 0.9)

_ = plt.xticks(ind + width / 2, df_metrics['title_short'])
_ = plt.xticks(rotation=90)
_ = plt.legend(loc='upper right')
_ = plt.title('Normalized Characteristics Comparison', fontsize = '30')

axes = plt.gca()
axes.set_ylim([0, 1.2])

_ = plt.savefig(plot_dir + 'lex_div_normalized_scores.png')
_ = plt.show()

# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ... end_of_file
# ... -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

TypeError: cannot convert the series to <class 'float'>