## Brennan Danek (bd4bk@virginia.edu) DS 5001 Spring 2023

# Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly_express as px
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer


from numpy.linalg import norm
from scipy.linalg import eigh

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

from gensim.models import word2vec

In [2]:
# Use to download nltk resources the first time
nltk_resources = [
    'tokenizers/punkt', 
    'taggers/averaged_perceptron_tagger', 
    'corpora/stopwords', 
    'help/tagsets'
]

for rsc in nltk_resources:
    try:
        nltk.data.find(rsc)
    except IndexError:
        nltk.download(rsc)

In [3]:
# Set OHCO, as well as roman numeral and paragraph regexes

OHCO = ['book_id','chap_num', 'para_num', 'sent_num', 'token_num']

data_in = 'data'
data_out = 'output'

roman = '[IVXLCM]+'
para_pat = r'\n+'


In [4]:
# Set the bag level for Vector Space Analysis
# bag at the chapter level
BAG = OHCO[:2] 
print(BAG)

['book_id', 'chap_num']


In [5]:
# Load in full North American Slave Narrative Library
LIB = pd.read_csv('toc.csv')

In [6]:
LIB

Unnamed: 0,Filename,Author,Title,Date,URL,URL(text-only)
0,neh-johnstone-johnstone.xml,Abraham Johnstone,"The Address of Abraham Johnstone, a Black Man,...",1797,http://docsouth.unc.edu/neh/johnstone/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
1,neh-meachum-meachum.xml,John B. Meachum,An Address to All the Colored Citizens of the ...,1846,http://docsouth.unc.edu/neh/meachum/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
2,neh-johnsontl-johnsontl.xml,Thomas L. Johnson,Africa for Christ. Twenty-Eight Years a Slave,1892,http://docsouth.unc.edu/neh/johnsontl/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
3,neh-white-white.xml,William S. White,The African Preacher. An Authentic Narrative,[c1849],http://docsouth.unc.edu/neh/white/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
4,neh-brown55-brown55.xml,William Wells Brown,The American Fugitive in Europe. Sketches of P...,1855,http://docsouth.unc.edu/neh/brown55/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
...,...,...,...,...,...,...
289,neh-henson-henson.xml,Josiah Henson,Uncle Tom's Story of His Life. An Autobiograph...,1876,http://docsouth.unc.edu/neh/henson/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
290,fpn-washington-washing.xml,Booker T. Washington,Up from Slavery: An Autobiography,c1901,http://docsouth.unc.edu/fpn/washington/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
291,fpn-burtont-burton.xml,Thomas William Burton,What Experience Has Taught Me: An Autobiograph...,c1910,http://docsouth.unc.edu/fpn/burtont/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
292,neh-hildreth-hildreth.xml,Richard Hildreth,"The White Slave; or, Memoirs of a Fugitive",1852,http://docsouth.unc.edu/neh/hildreth/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...


In [7]:
def clean_year_string(year_str):
    """
    This function cleans up the year strings from the LIB and returns them as an int

    Parameters:
    year_str: a string of the year from the NA Slave Narrative Corpus Table of Contents

    Returns:
    and integer of the cleaned year 
    """
    # Replace '?' with '5', average digit
    if pd.isna(year_str):
        return year_str
    year_str = str(year_str)
    year_str = year_str.replace('?', '5')
    # Remove square brackets and 'c'
    year_str = year_str.replace('[', '').replace(']', '').replace('c', '')
    # Strip leading and trailing whitespace
    year_str = year_str.strip()
    # Check if the resulting string is only digits
    if year_str.isdigit():
        # Convert to integer and return
        return int(year_str)
    else:
        # Return None for non-numeric strings
        return None

In [8]:
# We are looking at autobiographies specifically, filter out non-autobiographies

LIB = LIB.loc[LIB.Title.str.contains('Autobiography')]

# replace the Filename with the .txt version of the text
LIB.loc[:,'Filename'] = LIB.loc[:,'Filename'].str.replace('xml','txt').copy()
LIB.index.name = 'book_id'

#apply year cleaning
LIB.loc[:, 'Date'] = LIB.loc[:,'Date'].apply(clean_year_string).copy()
LIB = LIB.sort_values('Filename')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LIB.loc[:,'Filename'] = LIB.loc[:,'Filename'].str.replace('xml','txt').copy()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LIB.loc[:, 'Date'] = LIB.loc[:,'Date'].apply(clean_year_string).copy()
  LIB.loc[:, 'Date'] = LIB.loc[:,'Date'].apply(clean_year_string).copy()


In [9]:
LIB

Unnamed: 0_level_0,Filename,Author,Title,Date,URL,URL(text-only)
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
291,fpn-burtont-burton.txt,Thomas William Burton,What Experience Has Taught Me: An Autobiograph...,1910,http://docsouth.unc.edu/fpn/burtont/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
15,fpn-lane-lane.txt,Isaac Lane,"Autobiography of Bishop Isaac Lane, LL.D. with...",1916,http://docsouth.unc.edu/fpn/lane/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
290,fpn-washington-washing.txt,Booker T. Washington,Up from Slavery: An Autobiography,1901,http://docsouth.unc.edu/fpn/washington/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
19,nc-omarsaid-omarsaid.txt,Omar ibn Said,"Autobiography of Omar ibn Said, Slave in North...",1925,http://docsouth.unc.edu/nc/omarsaid/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
28,neh-aleckson-aleckson.txt,Sam Aleckson,"Before the War, and After the Union. An Autob...",1929,http://docsouth.unc.edu/neh/aleckson/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
277,neh-beard63-beard63.txt,J. R. Beard,Toussaint L'Ouverture: A Biography and Autobio...,1863,http://docsouth.unc.edu/neh/beard63/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
13,neh-browne-browne.txt,Martha Griffith Browne,Autobiography of a Female Slave,1857,http://docsouth.unc.edu/neh/browne/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
26,neh-campbell-campbell.txt,Israel Campbell,"An Autobiography. Bond and Free: Or, Yearning...",1861,http://docsouth.unc.edu/neh/campbell/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
256,neh-drumgoold-drumgoold.txt,Kate Drumgoold,A Slave Girl's Story. Being an Autobiography o...,1898,http://docsouth.unc.edu/neh/drumgoold/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...
54,neh-fjones-jones.txt,Friday Jones,Days of Bondage. Autobiography of Friday Jones...,1883,http://docsouth.unc.edu/neh/fjones/menu.html,http://docsouth.unc.edu/full-text/na-slave-nar...


In [10]:
# pattern list generated by inspecting autobiographical texts 
# Those that do not match the OHCO model (no chapters) are removed at parsing

pat_list = [
    ('data/texts/neh-jamison-jamison.txt',rf"^\s*(CHAPTER)\s+{roman}\.*\s*$"),
    ('data/texts/fpn-burtont-burton.txt',rf"^\s*(CHAPTER)\s+{roman}\s"),
    ('data/texts/fpn-washington-washing.txt', rf"^\s*(CHAPTER)\s+{roman}\s*$"),
    ('data/texts/nc-omarsaid-omarsaid.txt', "REMOVE"),
    ('data/texts/neh-aleckson-aleckson.txt', rf"^\s*(CHAPTER)\s+{roman}\s*$"),
    ('data/texts/neh-beard63-beard63.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-browne-browne.txt',rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-campbell-campbell.txt', rf"^\s*(CHAPTER)\s+{roman}\."),
    ('data/texts/neh-drumgoold-drumgoold.txt',rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-fjones-jones.txt', "REMOVE"),
    ('data/texts/neh-flipper-flipper.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-frederick-frederick.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-heard-heard.txt', rf"^\s*(CHAPTER)\s[A-Z]+"),
    ('data/texts/neh-henry-henry.txt', "REMOVE"),
    ('data/texts/neh-henson-henson.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-henson81-henson81.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-holsey-holsey.txt', "REMOVE"),
    ('data/texts/neh-jamison-jamison.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-latta-latta.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-parkerh-parkerh.txt', "REMOVE"),
    ('data/texts/neh-randolph-randolph.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),           
    ('data/texts/neh-rayemma-rayemma.txt', rf"^\s*(CHAPTER)\s+{roman}\s"),
    ('data/texts/neh-said-said.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-smitham-smith.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),          
    ('data/texts/neh-smithj-smithj.txt', rf"^\s*(CHAPTER)\s+{roman}\."),
    ('data/texts/neh-wards-ward.txt', rf"^\s*(CHAPTER)\s+{roman}\.$"),
    ('data/texts/neh-washstory-washin.txt',  rf"^\s*(CHAPTER)\s+{roman}\.$")
    
    
]

In [11]:
# add file structure to names
LIB.Filename = 'data/texts/' + LIB.Filename.values

In [12]:
def parse_text(src_file, chap_pat):
    '''
    This function takes in a source file and a regex pattern for chapter headings,
    and returns a DataFrame of tokenized and POS-tagged text.

    Args:
        src_file (str): Filepath for source text file
        chap_pat (str): Regular expression pattern for identifying chapter headings

    Returns:
        TOKENS (DataFrame): DataFrame of tokenized and POS-tagged text
    '''
    # Read in source file and convert to DataFrame
    LINES = pd.DataFrame(open(src_file, 'r', encoding='utf-8-sig').readlines(), columns=['line_str'])
    LINES.index.name = 'line_id'

    # Clean up line strings
    LINES.line_str = LINES.line_str.str.replace(r'\n+', ' ', regex=True).str.strip()

    # Find starting line for parsing
    try:
        pat_start = LINES.line_str.str.match(r'^CHAPTER\s')
        line_start = LINES.loc[pat_start].index[0]
    except:
        print("pat_start not found")
        
    # Subset DataFrame to lines after chapter headings
    LINES = LINES.loc[line_start:]
    
    # Identify chapter numbers
    chap_lines = LINES.line_str.str.match(chap_pat, case=False)
    LINES.loc[chap_lines, 'chap_num'] = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]
    LINES.chap_num = LINES.chap_num.ffill()
    LINES = LINES.loc[~chap_lines] # Remove chapter heading lines; their work is done
    LINES.chap_num = LINES.chap_num.astype('int') # Convert chap_num from float to int
    
    # Group lines into chapters
    CHAPS = LINES.groupby('chap_num').line_str\
        .apply(lambda x: '\n'.join(x))\
        .str.strip()\
        .to_frame('chap_str')

    # Split chapters into paragraphs
    PARAS = CHAPS['chap_str'].str.split(para_pat, expand=True).stack()\
        .to_frame('para_str').sort_index()
    PARAS.index.names = OHCO[1:3]
    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True)
    PARAS['para_str'] = PARAS['para_str'].str.strip()
    # Remove empty paragraphs
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')]

    # Split paragraphs into sentences and tokenize
    SENTS = PARAS.para_str.apply(lambda x: pd.Series(nltk.sent_tokenize(x)))\
            .stack()\
            .to_frame('sent_str')
    SENTS.index.names = OHCO[1:4]
    
    # Strip hyphens
    SENTS.sent_str = SENTS.sent_str.str.replace(r"—", ' ')

    # Add pos-tag tokens
    # NOTE: pos-tags added here,even though it is not F3, so that the tagger can use
    # sentence context to predict
    TOKENS = SENTS.sent_str\
        .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))\
        .stack()\
        .to_frame('pos_tuple')
    TOKENS['pos'] = TOKENS.pos_tuple.apply(lambda x: x[1])
    TOKENS['token_str'] = TOKENS.pos_tuple.apply(lambda x: x[0])
    
    TOKENS.index.names = OHCO[1:5]
    
    
    # Remove non-alphanumeric characters from tokens
    TOKENS['token_str'] = TOKENS.token_str.str.replace(r'[\W_]+', '', regex=True)
    TOKENS['token_str'] = TOKENS['token_str'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    TOKENS = TOKENS[TOKENS['token_str'] != '']
    
    TOKENS['term_str'] = TOKENS['token_str'].str.lower()
    TOKENS = TOKENS.reindex(columns = ['token_str','term_str', 'pos_tuple', 'pos'])


    return TOKENS

In [13]:
# Create empty CORPUS
CORPUS=pd.DataFrame(index = pd.MultiIndex.from_arrays([[] for x in OHCO], names=OHCO), columns = ['token_str','term_str'])

for i in range(len(pat_list)):
    
    # remove docs with incongruent OHCO 
    if pat_list[i][1] != 'REMOVE':
        TEMP = parse_text(*pat_list[i])
        
        # set the book_id instead of 
        TEMP['book_id'] = [LIB[LIB['Filename'] == str(pat_list[i][0])].index.values[0]]*len(TEMP)
        TEMP = TEMP.reset_index().set_index(OHCO)
        CORPUS = pd.concat([TEMP,CORPUS])


In [14]:
# Remove the non-OHCO books from the Library
LIB = LIB.loc[CORPUS.reset_index()['book_id'].unique()]

LIB.to_csv('{}/LIB.csv'.format(data_out))

In [15]:
# F1 Corpus Format. Only token_str is included
CORPUS_F1 = CORPUS.copy()[['token_str']]

CORPUS_F1.to_csv('{}/CORPUS_F1.csv'.format(data_out))

In [16]:
CORPUS_F2 = CORPUS.copy()[['token_str','term_str']]

VOCAB_F2 = CORPUS_F2.term_str.value_counts().to_frame('n')
VOCAB_F2.index.name = 'term_str'
VOCAB_F2['rank'] = VOCAB_F2.n.rank(method='min', ascending=False).astype(int)
VOCAB_F2['p'] = VOCAB_F2.n / VOCAB_F2.n.sum()
VOCAB_F2['i'] = -np.log2(VOCAB_F2.p)

In [17]:
VOCAB_F2.to_csv('{}/VOCAB_F2.csv'.format(data_out))

In [18]:
CORPUS_F2.to_csv('{}/CORPUS_F2.csv'.format(data_out))

In [19]:
tokens_per_book = CORPUS_F2.reset_index().groupby('book_id')['token_num'].count().rename('word_len')
chaps_per_book = CORPUS_F2.reset_index().groupby('book_id')['chap_num'].nunique().rename('chap_len')
LIB_F2 = LIB.join(tokens_per_book, on = 'book_id')
LIB_F2 = LIB_F2.join(chaps_per_book, on = 'book_id')

In [20]:
LIB_F2.to_csv('{}/LIB_F2.csv'.format(data_out))

In [21]:
CORPUS_F3 = CORPUS.copy()

VOCAB_F3 = VOCAB_F2.copy()

In [22]:
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB_F3['stop'] = VOCAB_F3.index.map(sw.dummy)
VOCAB_F3['stop'] = VOCAB_F3['stop'].fillna(0).astype('int')

In [23]:
VOCAB_F3

Unnamed: 0_level_0,n,rank,p,i,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,88579,1,5.963422e-02,4.067716,1
and,52486,2,3.533526e-02,4.822748,1
to,47744,3,3.214279e-02,4.959361,1
of,46178,4,3.108851e-02,5.007475,1
i,39384,5,2.651457e-02,5.237071,1
...,...,...,...,...,...
rejecting,1,20274,6.732320e-07,20.502393,0
cosgrove,1,20274,6.732320e-07,20.502393,0
appendage,1,20274,6.732320e-07,20.502393,0
gravitating,1,20274,6.732320e-07,20.502393,0


In [24]:
VOCAB_F3['stem'] = VOCAB_F3.apply(lambda x: stemmer.stem(x.name), 1)
VOCAB_F3 = VOCAB_F3.reindex(columns = ['stem','n','rank','p','i','stop'])

In [25]:
VOCAB_F3

Unnamed: 0_level_0,stem,n,rank,p,i,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the,the,88579,1,5.963422e-02,4.067716,1
and,and,52486,2,3.533526e-02,4.822748,1
to,to,47744,3,3.214279e-02,4.959361,1
of,of,46178,4,3.108851e-02,5.007475,1
i,i,39384,5,2.651457e-02,5.237071,1
...,...,...,...,...,...,...
rejecting,reject,1,20274,6.732320e-07,20.502393,0
cosgrove,cosgrov,1,20274,6.732320e-07,20.502393,0
appendage,appendag,1,20274,6.732320e-07,20.502393,0
gravitating,gravit,1,20274,6.732320e-07,20.502393,0


In [26]:
VOCAB_F3['max_pos'] = CORPUS_F3[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB_F3['n_pos'] = CORPUS_F3[['term_str','pos']].value_counts().unstack().count(1)
VOCAB_F3['cat_pos'] = CORPUS_F3[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))

In [27]:
VOCAB_F3

Unnamed: 0_level_0,stem,n,rank,p,i,stop,max_pos,n_pos,cat_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
the,the,88579,1,5.963422e-02,4.067716,1,DT,3,"{DT, NNP, JJ}"
and,and,52486,2,3.533526e-02,4.822748,1,CC,4,"{CC, VBP, NNP, JJ}"
to,to,47744,3,3.214279e-02,4.959361,1,TO,5,"{VBP, TO, NNP, JJ, NN}"
of,of,46178,4,3.108851e-02,5.007475,1,IN,4,"{RP, NNP, JJ, IN}"
i,i,39384,5,2.651457e-02,5.237071,1,PRP,3,"{PRP, NN, NNP}"
...,...,...,...,...,...,...,...,...,...
rejecting,reject,1,20274,6.732320e-07,20.502393,0,VBG,1,{VBG}
cosgrove,cosgrov,1,20274,6.732320e-07,20.502393,0,NNP,1,{NNP}
appendage,appendag,1,20274,6.732320e-07,20.502393,0,NN,1,{NN}
gravitating,gravit,1,20274,6.732320e-07,20.502393,0,VBG,1,{VBG}


In [28]:
VOCAB_F3.to_csv('{}/VOCAB_F3.csv'.format(data_out))

In [29]:
CORPUS_F3.to_csv('{}/CORPUS_F3.csv'.format(data_out))

In [30]:
CORPUS_F4 = CORPUS_F3.copy()
VOCAB_F4 = VOCAB_F3.copy()

In [31]:
BOW = CORPUS_F4.groupby(BAG + ['term_str']).term_str.count().to_frame('n') 

In [32]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
book_id,chap_num,term_str,Unnamed: 3_level_1
12,1,1848,2
12,1,27,2
12,1,a,130
12,1,able,2
12,1,about,28
...,...,...,...
291,10,years,1
291,10,yet,1
291,10,you,3
291,10,young,4


In [33]:
DTCM = BOW.n.unstack().fillna(0).astype('int')

In [34]:
DTCM

Unnamed: 0_level_0,term_str,003,016,0161,02384,026,02667,05134,05641,08714,1,...,zuilille,zurich,zury,à,á,âme,ægis,æwilberforce,écrits,élite
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
12,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
DOC = DTCM.sum(1).to_frame('n')

In [36]:
DOC

Unnamed: 0_level_0,Unnamed: 1_level_0,n
book_id,chap_num,Unnamed: 2_level_1
12,1,6658
12,2,1876
12,3,4116
12,4,6954
12,5,3918
...,...,...
291,6,2811
291,7,1529
291,8,2788
291,9,1516


In [37]:
TF = DTCM.T / DTCM.T.sum()
TF = TF.T

In [38]:
TF

Unnamed: 0_level_0,term_str,003,016,0161,02384,026,02667,05134,05641,08714,1,...,zuilille,zurich,zury,à,á,âme,ægis,æwilberforce,écrits,élite
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
12,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
DF = DTCM.astype('bool').sum()
N = DTCM.shape[0]

In [40]:
IDF = np.log2(N / DF)

In [41]:
IDF

term_str
003             8.882643
016             8.882643
0161            8.882643
02384           8.882643
026             8.882643
                  ...   
âme             8.882643
ægis            8.882643
æwilberforce    8.882643
écrits          8.882643
élite           7.882643
Length: 30914, dtype: float64

In [42]:
TFIDF = TF * IDF

In [43]:
TFIDF

Unnamed: 0_level_0,term_str,003,016,0161,02384,026,02667,05134,05641,08714,1,...,zuilille,zurich,zury,à,á,âme,ægis,æwilberforce,écrits,élite
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
12,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
VOCAB_F4['df'] = DF
VOCAB_F4['idf'] = IDF

In [45]:
VOCAB_F4

Unnamed: 0_level_0,stem,n,rank,p,i,stop,max_pos,n_pos,cat_pos,df,idf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
the,the,88579,1,5.963422e-02,4.067716,1,DT,3,"{DT, NNP, JJ}",472,0.000000
and,and,52486,2,3.533526e-02,4.822748,1,CC,4,"{CC, VBP, NNP, JJ}",472,0.000000
to,to,47744,3,3.214279e-02,4.959361,1,TO,5,"{VBP, TO, NNP, JJ, NN}",472,0.000000
of,of,46178,4,3.108851e-02,5.007475,1,IN,4,"{RP, NNP, JJ, IN}",472,0.000000
i,i,39384,5,2.651457e-02,5.237071,1,PRP,3,"{PRP, NN, NNP}",461,0.034020
...,...,...,...,...,...,...,...,...,...,...,...
rejecting,reject,1,20274,6.732320e-07,20.502393,0,VBG,1,{VBG},1,8.882643
cosgrove,cosgrov,1,20274,6.732320e-07,20.502393,0,NNP,1,{NNP},1,8.882643
appendage,appendag,1,20274,6.732320e-07,20.502393,0,NN,1,{NN},1,8.882643
gravitating,gravit,1,20274,6.732320e-07,20.502393,0,VBG,1,{VBG},1,8.882643


In [46]:
CORPUS_F4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tuple,pos
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
24,1,0,0,0,BIRTH,birth,"(BIRTH, NNP)",NNP
24,1,0,0,1,AND,and,"(AND, CC)",CC
24,1,0,0,2,EARLY,early,"(EARLY, NNP)",NNP
24,1,0,0,3,CHILDHOOD,childhood,"(CHILDHOOD, NNP)",NNP
24,1,1,0,0,Many,many,"(Many, JJ)",JJ
...,...,...,...,...,...,...,...,...
12,24,12,0,56,sink,sink,"(sink, NN)",NN
12,24,12,0,57,to,to,"(to, TO)",TO
12,24,12,0,58,rise,rise,"(rise, VB)",VB
12,24,12,0,59,no,no,"(no, DT)",DT


In [47]:
TFIDF_stacked = TFIDF.stack()
TFIDF_summed = TFIDF_stacked.groupby(level=list(range(len(BAG)+1))).sum().to_frame('tfidf')
TFIDF_summed = TFIDF_summed.reset_index().set_index([*BAG,'term_str'])

CORPUS_F4 = pd.merge(CORPUS_F4.reset_index(), TFIDF_summed.reset_index(), on=[*BAG,'term_str'], how = 'left')

CORPUS_F4 = CORPUS_F4.set_index(OHCO)

In [48]:
CORPUS_F4

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tuple,pos,tfidf
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
24,1,0,0,0,BIRTH,birth,"(BIRTH, NNP)",NNP,0.002469
24,1,0,0,1,AND,and,"(AND, CC)",CC,0.000000
24,1,0,0,2,EARLY,early,"(EARLY, NNP)",NNP,0.003053
24,1,0,0,3,CHILDHOOD,childhood,"(CHILDHOOD, NNP)",NNP,0.001417
24,1,1,0,0,Many,many,"(Many, JJ)",JJ,0.000279
...,...,...,...,...,...,...,...,...,...
12,24,12,0,56,sink,sink,"(sink, NN)",NN,0.002991
12,24,12,0,57,to,to,"(to, TO)",TO,0.000000
12,24,12,0,58,rise,rise,"(rise, VB)",VB,0.002041
12,24,12,0,59,no,no,"(no, DT)",DT,0.000249


In [49]:
CORPUS_F4[CORPUS_F4.pos != 'NNP'].sort_values('tfidf',ascending = False).drop_duplicates(['term_str']).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tuple,pos,tfidf
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
28,10,6,3,0,Silla,silla,"(Silla, NN)",NN,0.099433
74,26,3,0,5,cries,cries,"(cries, NNS)",NNS,0.093455
286,37,0,0,29,constraineth,constraineth,"(constraineth, NN)",NN,0.086239
28,8,3,1,30,turkey,turkey,"(turkey, NN)",NN,0.085287
20,5,21,0,18,Fanny,fanny,"(Fanny, JJ)",JJ,0.084311
291,9,13,0,7,Sundayschool,sundayschool,"(Sunday-school, JJ)",JJ,0.073707
86,28,4,0,6,president,president,"(president, NN)",NN,0.068946
28,16,6,0,19,pie,pie,"(pie, NN)",NN,0.068853
74,26,11,0,9,unto,unto,"(unto, VBP)",VBP,0.067795
18,7,4,1,90,mecca,mecca,"(mecca, JJ)",JJ,0.067666


In [50]:
BOW['tf'] = TF.stack()
BOW['tfidf'] = TFIDF.stack()

In [51]:
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,tf,tfidf
book_id,chap_num,term_str,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12,1,1848,2,0.000300,0.002668
12,1,27,2,0.000300,0.001825
12,1,a,130,0.019525,0.000000
12,1,able,2,0.000300,0.000312
12,1,about,28,0.004205,0.000665
...,...,...,...,...,...
291,10,years,1,0.000932,0.000346
291,10,yet,1,0.000932,0.000592
291,10,you,3,0.002796,0.000896
291,10,young,4,0.003728,0.002905


In [52]:
VOCAB_F4['tfidf_mean'] = TFIDF.mean() 
VOCAB_F4['tfidf_sum'] = TFIDF.sum()
VOCAB_F4['dfidf'] = VOCAB_F4.df * VOCAB_F4.idf

In [53]:
VOCAB_F4.sort_values('tfidf_sum', ascending = False)

Unnamed: 0_level_0,stem,n,rank,p,i,stop,max_pos,n_pos,cat_pos,df,idf,tfidf_mean,tfidf_sum,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
she,she,5380,39,0.003622,8.109002,1,PRP,2,"{PRP, NNP}",314,0.588022,0.002251,1.062328,184.639002
her,her,5434,38,0.003658,8.094594,1,PRP$,7,"{PRP, VB, PRP$, NNP, JJ, RB, NN}",341,0.469015,0.001860,0.877986,159.934156
lord,lord,2212,77,0.001489,9.391257,0,NNP,2,"{NN, NNP}",217,1.121092,0.001413,0.666866,243.276924
church,church,1782,105,0.001200,9.703111,0,NNP,4,"{NN, VB, NNP, JJ}",203,1.217307,0.001396,0.659074,247.113348
you,you,6750,30,0.004544,7.781721,1,PRP,7,"{PRP, VB, JJR, NNP, JJ, RB, NN}",378,0.320401,0.001326,0.625672,121.111436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
a,a,29172,6,0.019640,5.670096,1,DT,4,"{NN, DT, NNP, JJ}",472,0.000000,0.000000,0.000000,0.000000
of,of,46178,4,0.031089,5.007475,1,IN,4,"{RP, NNP, JJ, IN}",472,0.000000,0.000000,0.000000,0.000000
to,to,47744,3,0.032143,4.959361,1,TO,5,"{VBP, TO, NNP, JJ, NN}",472,0.000000,0.000000,0.000000,0.000000
and,and,52486,2,0.035335,4.822748,1,CC,4,"{CC, VBP, NNP, JJ}",472,0.000000,0.000000,0.000000,0.000000


In [54]:
VOCAB_F4.to_csv('{}/VOCAB_F4.csv'.format(data_out))

In [55]:
CORPUS_F4.to_csv('{}/CORPUS_F4.csv'.format(data_out))

In [56]:
DOC['tfidf_mean'] = TFIDF.T.mean()

In [57]:
DOC

Unnamed: 0_level_0,Unnamed: 1_level_0,n,tfidf_mean
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1
12,1,6658,0.000041
12,2,1876,0.000039
12,3,4116,0.000038
12,4,6954,0.000040
12,5,3918,0.000043
...,...,...,...
291,6,2811,0.000050
291,7,1529,0.000042
291,8,2788,0.000048
291,9,1516,0.000040


In [58]:
# Assuming 'df' is your pandas DataFrame with a multi-index

# Group by the primary index and find the row with the largest value in the desired column
max_rows = DOC.groupby(level=0)['tfidf_mean'].idxmax()

# Extract the second index value from each row
second_index_values = max_rows.str[1]

# Get the corresponding column values for each row
column_values = DOC.loc[max_rows]['tfidf_mean'].values

# Create a new DataFrame with the primary index, second index, and column value
result = pd.DataFrame({'book_id': max_rows.str[0], 'chap_num': second_index_values, 'tfidf_mean': column_values})


In [59]:
print(result)

         book_id  chap_num  tfidf_mean
book_id                               
12            12        22    0.000055
13            13         5    0.000061
14            14        22    0.000051
17            17         7    0.000051
18            18         7    0.000071
20            20         7    0.000058
22            22        31    0.000061
24            24        22    0.000058
25            25        34    0.000044
26            26         3    0.000042
28            28         4    0.000060
49            49         6    0.000072
74            74        21    0.000046
76            76         7    0.000047
86            86        32    0.000046
256          256         5    0.000026
277          277         1    0.000068
286          286        37    0.000049
289          289        30    0.000056
290          290        17    0.000042
291          291         5    0.000055


In [60]:
VOCAB_PCA = VOCAB_F4[VOCAB_F4.max_pos.isin(['NN', 'VB', 'JJ'])].sort_values('dfidf', ascending=False).head(1000)

In [61]:
#VOCAB_PCA = VOCAB[VOCAB.max_pos.isin(['NN','NNS','VB','VBD','VBG','VBN','VBP','VBZ','JJ','JJR','JJS'])].sort_values('dfidf', ascending=False).head(1000)

In [62]:
VOCAB_PCA

Unnamed: 0_level_0,stem,n,rank,p,i,stop,max_pos,n_pos,cat_pos,df,idf,tfidf_mean,tfidf_sum,dfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
visit,visit,429,365,0.000289,11.757559,0,NN,9,"{VBP, VBZ, IN, VB, NNP, JJ, NNS, FW, NN}",174,1.439700,0.000408,0.192586,250.507722
hour,hour,313,483,0.000211,12.212374,0,NN,1,{NN},174,1.439700,0.000305,0.143775,250.507722
open,open,298,505,0.000201,12.283224,0,JJ,8,"{VBP, IN, VB, RP, NNP, JJ, VBD, NN}",174,1.439700,0.000296,0.139592,250.507722
ground,ground,287,525,0.000193,12.337486,0,NN,5,"{VBP, NNP, VBD, VBN, NN}",173,1.448015,0.000319,0.150404,250.506564
close,close,315,480,0.000212,12.203185,0,NN,7,"{VBP, VB, NNP, JJ, FW, RB, NN}",173,1.448015,0.000279,0.131791,250.506564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bottom,bottom,51,2388,0.000034,14.829968,0,NN,4,"{NN, VB, NNP, FW}",42,3.490326,0.000102,0.048175,146.593676
rejoicing,rejoic,56,2220,0.000038,14.695038,0,NN,2,"{VBG, NN}",42,3.490326,0.000159,0.074865,146.593676
blind,blind,54,2277,0.000036,14.747505,0,NN,9,"{VBP, IN, VBN, VB, NNP, JJ, NNS, RB, NN}",42,3.490326,0.000142,0.066991,146.593676
soldier,soldier,96,1434,0.000065,13.917430,0,NN,5,"{NNP, NN, JJ, RB, JJR}",42,3.490326,0.000165,0.078081,146.593676


In [63]:
TFIDF_PCA = TFIDF[VOCAB_PCA.index]

In [64]:
def create_pca(X, k = 10, norm_docs = True, center_by_mean = True, center_by_variance = False):
    X = X.copy()
    if norm_docs:
        X = (X.T / norm(X, 2, axis=1)).T
    if center_by_mean:
        X = X - X.mean(axis = 0)
    if center_by_variance:
        X = X - X.var()

    COV = X.cov()

    eig_vals, eig_vecs = eigh(COV)
    EIG_VEC = pd.DataFrame(eig_vecs, index=COV.index, columns=COV.index)
    EIG_VAL = pd.DataFrame(eig_vals, index=COV.index, columns=['eig_val'])
    EIG_VAL.index.name = 'term_str'
    EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)
    EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)
    
    COMPS = EIG_PAIRS.sort_values('exp_var', ascending=False).head(k).reset_index(drop=True)
    COMPS.index.name = 'comp_id'
    COMPS.index = ["PC{}".format(i) for i in COMPS.index.tolist()]
    COMPS.index.name = 'pc_id'
    
    LOADINGS = COMPS[COV.index].T
    LOADINGS.index.name = 'term_str'
    LOADINGS = LOADINGS.sort_values('PC0', ascending = False)
    
    DCM = X.dot(COMPS[COV.index].T) 
    
    top_terms = []
    for i in range(k):
        for j in [0, 1]:
            comp_str = LOADINGS.sort_values(f'PC{i}', ascending=bool(j)).head(10).index.to_list()
            top_terms.append((f"PC{i}", j, comp_str))
    COMP_GLOSS = pd.DataFrame(top_terms).set_index([0,1]).unstack()
    COMP_GLOSS.index.name = 'comp_id'
    COMP_GLOSS.columns = COMP_GLOSS.columns.droplevel(0) 
    COMP_GLOSS = COMP_GLOSS.rename(columns={0:'pos', 1:'neg'})
    
    return LOADINGS, DCM, COMPS, COMP_GLOSS

In [65]:
LOADINGS, DCM, COMPINFO, COMP_GLOSS = create_pca(TFIDF_PCA, k = 10)

In [66]:
LOADINGS

pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
race,0.228932,-0.112845,-0.106622,-0.442776,0.055775,-0.047806,-0.108501,-0.191326,0.157970,-0.041738
school,0.178944,-0.211876,-0.291494,0.158030,-0.117263,-0.369486,0.159927,0.052828,-0.159523,-0.087358
coloured,0.143902,0.000878,-0.136497,0.320396,0.194477,-0.125554,-0.062591,-0.025323,0.134121,-0.169850
institution,0.127030,-0.057029,-0.129690,-0.106229,0.000946,-0.079298,-0.005814,-0.040527,-0.017939,0.038488
industrial,0.107205,-0.054788,-0.084718,-0.062243,-0.003994,-0.058786,-0.021031,0.024450,-0.035019,-0.048527
...,...,...,...,...,...,...,...,...,...,...
prayer,-0.088638,-0.119535,0.052962,0.005177,0.019096,0.004244,0.010821,-0.051868,0.006015,-0.055710
mistress,-0.105519,0.111578,-0.112290,-0.034084,-0.044393,0.080472,-0.043547,0.095203,0.008573,-0.195710
pray,-0.129212,-0.134530,0.070107,-0.012741,0.029498,0.002308,0.035547,-0.049162,0.013908,-0.047070
mother,-0.170102,0.038267,-0.086786,-0.035572,-0.001504,-0.084334,0.342512,-0.262760,0.195899,-0.117234


In [67]:
DCM

Unnamed: 0_level_0,pc_id,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9
book_id,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12,1,-0.214693,0.191143,-0.206195,-0.071350,-0.035322,0.153718,0.147319,-0.006519,0.000591,-0.191112
12,2,-0.102800,0.152339,-0.030413,-0.027922,-0.105751,0.026136,0.048435,-0.048649,-0.000275,-0.015575
12,3,-0.174178,0.163774,-0.112500,-0.057952,-0.073443,0.135795,0.009903,-0.014095,-0.011319,-0.046993
12,4,-0.093579,-0.170018,0.042017,0.061723,0.051087,0.188499,0.158635,-0.020374,-0.149314,-0.074351
12,5,-0.081950,0.026909,-0.047985,0.003377,-0.152580,0.088717,-0.157170,-0.022229,-0.026418,0.186467
...,...,...,...,...,...,...,...,...,...,...,...
291,6,0.201239,-0.081691,-0.011107,0.003057,0.014373,0.036788,0.030083,0.127153,-0.040132,0.039527
291,7,0.074682,-0.027695,-0.032437,-0.003834,0.011045,-0.052062,0.055708,0.108336,-0.056269,0.095156
291,8,0.117103,-0.078477,-0.062533,-0.031499,-0.066854,0.078687,-0.032472,0.021015,-0.072584,0.063656
291,9,0.066514,-0.105045,-0.045486,0.072320,-0.007043,-0.039732,0.098270,0.124200,0.033419,0.064272


In [68]:
COMPINFO

Unnamed: 0_level_0,eig_val,visit,hour,open,ground,close,moment,second,stand,sure,...,magnificent,date,british,proceed,bottom,rejoicing,blind,soldier,determination,exp_var
pc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PC0,0.033977,0.017318,-0.016662,-0.023701,-0.014635,-0.013981,-0.021591,0.018672,-0.023075,-0.016232,...,0.02302,0.003578,0.056184,0.00784,0.00607,-0.014393,-0.007048,0.024109,0.012696,3.92
PC1,0.025524,-0.013072,-0.007655,-0.007732,0.011443,-0.022826,0.033857,-0.015982,-0.017594,-0.020407,...,0.000874,-0.00876,0.016026,0.020405,0.001666,-0.004327,-0.002794,0.017829,0.02062,2.94
PC2,0.020065,-0.00541,0.002369,0.018476,-0.019384,0.005424,0.045043,0.011477,0.009186,0.001298,...,0.008449,0.001785,0.027638,0.004986,-0.007164,-0.002457,0.020589,0.053116,0.026905,2.31
PC3,0.015384,0.022932,0.008205,-0.00461,0.01351,-0.016453,-0.007477,0.022916,-0.010653,0.006324,...,0.003183,0.004537,0.150917,0.039843,-0.001578,0.006276,0.011791,-0.018729,-3.8e-05,1.77
PC4,0.014694,0.025327,-0.003325,0.006588,-0.018033,0.018427,-0.005877,-0.037859,-0.00015,0.006409,...,-0.003694,0.006292,0.19777,-0.011571,0.000959,0.001536,-0.015527,-0.044737,0.003134,1.69
PC5,0.013412,0.025669,-0.025257,-0.027603,0.011424,-0.007223,-0.018975,0.012628,-0.001736,-0.017691,...,-0.010044,0.01556,0.080641,0.004453,-0.014534,0.026348,-0.005098,-0.018006,-0.01895,1.55
PC6,0.012116,-0.04812,-0.023832,-0.008035,0.008803,-0.022874,-0.023558,0.002047,-0.012843,-0.022967,...,-0.022209,0.013308,-0.050352,-0.05184,-0.017285,0.000291,-0.012418,-0.00853,0.02139,1.4
PC7,0.01144,-0.00795,0.020898,0.021242,0.014496,0.037977,-0.011907,0.052475,0.016316,0.009298,...,-0.008684,0.001811,-0.009672,-0.037309,-0.002506,-0.014689,-0.027276,0.042025,-0.00051,1.32
PC8,0.010665,-0.032624,0.007104,-0.008457,-0.003935,-0.026379,-0.013515,0.005825,0.010165,0.001585,...,-0.021863,-0.02042,0.127448,-0.023857,0.001855,0.005127,-0.014542,0.032898,-0.018648,1.23
PC9,0.010124,-0.042197,-0.006805,-0.019545,-0.02243,-0.010227,-0.009092,-0.007264,-0.014074,-0.009111,...,-0.007311,-0.002452,-0.005628,0.046164,-0.013097,-0.026266,0.013625,-0.023872,-0.00746,1.17


In [69]:
COMP_GLOSS

1,pos,neg
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
PC0,"[race, school, coloured, institution, industri...","[master, mother, pray, mistress, prayer, child..."
PC1,"[master, french, overseer, slave, mistress, is...","[meeting, school, colored, praise, pastor, pra..."
PC2,"[french, island, army, camp, war, liberty, gua...","[school, master, overseer, coloured, instituti..."
PC3,"[coloured, antislavery, school, british, socie...","[race, colored, problem, institution, master, ..."
PC4,"[antislavery, british, coloured, american, sla...","[camp, french, army, guard, school, island, mi..."
PC5,"[colored, preach, overseer, pastor, preacher, ...","[school, french, coloured, whilst, building, t..."
PC6,"[mother, school, overseer, father, child, love...","[boat, river, journey, captain, road, steamer,..."
PC7,"[camp, class, guard, examination, military, of...","[mother, race, boat, island, river, french, jo..."
PC8,"[camp, class, mother, antislavery, race, guard...","[master, district, committee, school, session,..."
PC9,"[boat, ye, whilst, river, captain, journey, do...","[master, mistress, french, coloured, meeting, ..."


In [70]:
COMP_GLOSS.

SyntaxError: invalid syntax (1839536369.py, line 1)

In [None]:
for i in range(len(COMP_GLOSS)):
    print(COMP_GLOSS.iloc[i][0], end = ' ')
    print('')

In [None]:
VOCAB_PCA = pd.concat([VOCAB_PCA, LOADINGS], axis=1)
VOCAB_PCA

In [None]:
DOCS_PCA = DCM.join(LIB_F2[['Author','Date']])

In [None]:
DOCS_PCA['disp'] = DOCS_PCA.Date.astype(str)+ ' ' + DOCS_PCA.Author.astype(str)

In [None]:
DOCS_PCA

In [None]:
COMP_GLOSS.to_csv('{}/COMP_GLOSS.csv'.format(data_out))
LOADINGS.to_csv('{}/LOADINGS.csv'.format(data_out))

VOCAB_PCA.to_csv('{}/VOCAB_PCA.csv'.format(data_out))

DOCS_PCA.to_csv('{}/DOCS_PCA.csv'.format(data_out))

In [None]:
def vis_pcs(M, a, b, label='author', hover_name='Author', symbol=None, size=None):
    fig = px.scatter(M, f"PC{a}", f"PC{b}", color=hover_name, 
                     symbol=symbol, size=size,
                     marginal_x='box', height=800)
    fig.show()

In [None]:
def vis_loadings(X,a=1, b=2, hover_name='term_str'):
    return px.scatter(X.reset_index(), f"PC{a}", f"PC{b}", 
                      text='term_str', size='i', color='max_pos', 
                      marginal_x='box', height=800)

In [None]:
vis_pcs(DOCS_PCA, 0, 1)

In [None]:
vis_loadings(VOCAB_PCA)

In [None]:
# LOOK AT WHICH DOCUMENTS INCLUDE THE DIFFERENT AREAS (FRENCH, AMERICAN, BRITISH)

In [None]:
DOCS_str = CORPUS[CORPUS.pos.str.match(r'^NNS?$')]\
    .groupby(BAG).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'doc_str'})

In [None]:
DOCS_str

In [None]:
def topicmodels(data, bag):
    
    n_terms = 4000
    n_topics = 20
    max_iter = 10
    n_top_terms = 10
    data = data.copy()

    
    count_engine = CountVectorizer(max_features=n_terms, stop_words='english')
    count_model = count_engine.fit_transform(data.doc_str)
    TERMS = count_engine.get_feature_names()

    VOCAB = pd.DataFrame(index=TERMS)
    VOCAB.index.name = 'term_str'

    DTM = pd.DataFrame(count_model.toarray(), index=data.index, columns=TERMS)
    VOCAB['doc_count'] = DTM.astype('bool').astype('int').sum()
    data['term_count'] = DTM.sum(1)
    
    lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
    lda_model = lda_engine.fit_transform(count_model)

    THETA = pd.DataFrame(lda_model, index=data.index)
    THETA.columns.name = 'topic_id'
    THETA['book_id'] = data['book_id']
    THETA['chap_num'] = data['chap_num']
    THETA = THETA.set_index(BAG)
    TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
    THETA.columns = TNAMES

    PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
    PHI.index.name = 'topic_id'
    PHI.columns.name  = 'term_str'

    TOPICS = PHI.stack().to_frame('topic_weight').groupby('topic_id')\
        .apply(lambda x: x.sort_values('topic_weight', ascending=False)\
            .head(n_top_terms).reset_index().drop('topic_id', axis=1)['term_str'])
    TOPICS['label'] = TOPICS.apply(lambda x: x.name + ' ' + ', '.join(x[:n_top_terms]), 1)
    
    TOPICS['doc_weight_sum'] = THETA.sum()
    TOPICS['term_freq'] = PHI.sum(1) / PHI.sum(1).sum()
    
    return {'THETA':THETA, "TNAMES":TNAMES, "PHI":PHI, "TOPICS":TOPICS, "VOCAB":VOCAB}
    

In [None]:
chap_model = topicmodels(DOCS_str.reset_index(), BAG)

In [None]:
DOCS_LDA = chap_model['THETA']

In [None]:
DOCS_LDA.to_csv('{}/DOCS_LDA.csv'.format(data_out))

In [None]:
TOPICS_LDA = chap_model['TOPICS']
TOPICS_LDA.to_csv('{}/TOPICS_LDA.csv'.format(data_out))

In [None]:
PHI_LDA = chap_model['PHI']
PHI_LDA.to_csv('{}/PHI_LDA.csv'.format(data_out))

In [None]:
PHI_LDA

In [None]:
VOCAB_LDA = chap_model['VOCAB'].join(chap_model['PHI'].T)

In [None]:
VOCAB_LDA.to_csv('{}/VOCAB_LDA.csv'.format(data_out))

In [None]:
TOPICS_LDA = chap_model["TOPICS"]

In [None]:
AUTHORS = sorted(LIB.Author.value_counts().index.to_list())

In [None]:
TOPICS_LDA[AUTHORS] = DOCS_LDA.join(LIB, on='book_id').groupby('Author')[chap_model['TNAMES']].mean().T
TOPICS_LDA['author'] = TOPICS_LDA[AUTHORS].idxmax(1)

In [None]:
TOPICS_LDA.to_csv('{}/TOPICS_LDA.csv'.format(data_out))

In [None]:
w2v_params = dict(
    window = 2,
    min_count = 80,
    vector_size = 256
    #,workers = 4
)

In [None]:
CORPUS_w2v = CORPUS[CORPUS.pos.str.fullmatch('(NNS?|VB[A-Z]?)')]
DOCS_w2v = CORPUS_w2v\
    .groupby(BAG)\
    .term_str.apply(lambda  x:  x.tolist())\
    .reset_index()['term_str'].tolist()
DOCS_w2v = [doc for doc in DOCS_w2v if len(doc) > 1] # Lose single word docs

In [None]:
w2v_model = word2vec.Word2Vec(DOCS_w2v, **w2v_params)

In [None]:
W2V = pd.DataFrame(w2v_model.wv.get_normed_vectors(), index=w2v_model.wv.key_to_index.keys())

In [None]:
VOCAB_W2V = VOCAB_F4.join(W2V, how = 'inner')

In [None]:
VOCAB_W2V.to_csv('{}/VOCAB_W2V.csv'.format(data_out))

In [None]:
def get_tsne_coords(model):
    """
    Compute and return the t-SNE coordinates for the given Word2Vec model.
    
    Args:
    - model (Word2Vec): the trained Word2Vec model
    
    Returns:
    - coords (pd.DataFrame): a DataFrame with the t-SNE coordinates for each word in the model's vocabulary
    """
    # Build DataFrame with word vectors and corresponding term strings
    coords = pd.DataFrame(
        dict(
            vector=[model.wv.get_vector(w) for w in model.wv.key_to_index], 
            term_str=model.wv.key_to_index.keys()
        )
    ).set_index('term_str')

    # Compute t-SNE coordinates
    tsne_engine = TSNE(
        learning_rate=200, perplexity=20, n_components=2, init='random', n_iter=1000, random_state=42
    )
    tsne_model = tsne_engine.fit_transform(np.array(coords.vector.to_list()))

    # Add x and y columns to DataFrame
    coords['x'] = tsne_model[:, 0]
    coords['y'] = tsne_model[:, 1]

    return coords

coords = get_tsne_coords(w2v_model)

In [None]:
coords

In [None]:
# TFM = coords.apply(lambda x: pd.Series(x.vector), 1)

## Use ScikitLearn's TSNE library

## Add some vocab features 

In [None]:
if coords.shape[1] == 3:
    coords = coords.merge(VOCAB_F4.reset_index(), on='term_str')
    coords = coords.set_index('term_str')

In [None]:
coords = coords[coords.stop == 0]

In [None]:
coords.max_pos.value_counts()

In [None]:
coords.head()

## Plot the coordinates

In [None]:
coords['pos_grp'] = coords.max_pos.str[:2]
coords['logn'] = np.log(coords.n)

In [None]:
coords.to_csv('{}/coords_W2V.csv'.format(data_out))

In [None]:
px.scatter(coords.reset_index(), 'x', 'y', 
           text='term_str', 
           color='pos_grp', 
           hover_name='term_str',          
           size='logn',
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')

In [None]:
nrc_file = data_in + "/lexicons/sources/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
syuzhet_file = data_in + "/lexicons/sources/syuzhet.csv"

# Import NRC lexicon

In [None]:
nrc = pd.read_csv(nrc_file, sep='\t', header=None)
nrc.columns = ['term_str','nrc_emotion','val']
nrc = nrc.set_index(['term_str','nrc_emotion'])
nrc = nrc.unstack()
nrc.columns = nrc.columns.droplevel(0)
nrc = nrc[nrc.sum(1) > 1]
nrc.columns = ['nrc_'+col for col in nrc.columns]

In [None]:
nrc.sum().sort_values(ascending=False)

In [None]:
nrc.head()

In [None]:
nrc['nrc_sentiment'] =nrc.nrc_positive - nrc.nrc_negative

In [None]:
nrc['nrc_sentiment'].sample(10)

# Import Syuzhet lexicon

In [None]:
syu = pd.read_csv(syuzhet_file)
syu.columns = ['id','term_str','syu_sentiment']
syu = syu.drop('id', axis=1)
syu = syu.set_index('term_str')

In [None]:
syu.head()

In [None]:
gi = pd.read_csv(gi_file, index_col=['term_str'])
gi.columns = ['gi_sentiment']

In [None]:
gi.head()

# Combine all

In [None]:
combo = nrc.join(syu, how='outer')\
    .sort_index()

In [None]:
combo.head()

In [None]:
combo.count().sort_values().plot(kind='barh', figsize=(7,7));

# Save

In [None]:
nrc.to_csv(data_in + '/lexicons/salex_nrc.csv')
syu.to_csv(data_in + '/lexicons/salex_syuzhet.csv')
combo.to_csv(data_in + '/lexicons/salex_combo.csv')        

In [None]:
salex_csv = f'{data_in}/lexicons/salex_combo.csv'

sent_cols = "syu_sentiment nrc_sentiment".split()
emo_cols = "anger anticipation disgust fear joy sadness surprise trust".split()

In [None]:
salex_csv

In [None]:
SALEX = pd.read_csv(salex_csv).set_index('term_str')
SALEX.columns = [col.replace('nrc_','') if 'sentiment' not in col else col for col in SALEX.columns]

In [None]:
SALEX

In [None]:
print(VOCAB_F4.index.duplicated().any()) # check if VOCAB index has any duplicates
print(SALEX.index.duplicated().any()) # check if SALEX index has any duplicates

In [None]:
duplicates = SALEX[SALEX.index.duplicated(keep=False)]
print(duplicates)

In [None]:
SALEX

In [None]:
# Identify the duplicated index value
dup_index = SALEX.index.duplicated(keep=False)

# Calculate the mean of the duplicated rows
mean_row = [SALEX.loc[dup_index].mean()] * sum(dup_index)

# Assign the mean values to the duplicated rows
SALEX.loc[dup_index] = mean_row

# Remove the duplicated rows, keeping the first occurrence
SALEX = SALEX.loc[~SALEX.index.duplicated(keep='first')]

In [None]:
SALEX

In [None]:
VOCAB_sent = pd.concat([VOCAB_F4, SALEX], join='inner', axis=1)

In [None]:
VOCAB_sent.to_csv('{}/VOCAB_sent.csv'.format(data_out))

In [None]:
len(BOW.reset_index().book_id.value_counts())

In [None]:
B = BOW.join(VOCAB_sent[['max_pos'] + sent_cols + emo_cols], on='term_str', rsuffix='_v').dropna()

In [None]:
B

In [None]:
for col in [*sent_cols, *emo_cols]:
    B[col] = B[col] * B.tfidf

In [None]:
len(B.reset_index().book_id.value_counts())

In [None]:
B.max_pos.value_counts().sort_values().plot.barh();

In [None]:
EMO_BOOKS = B.groupby(['book_id'])[[*sent_cols, *emo_cols]].mean()
#EMO_CHAPS = B.groupby(['book_id','chap_num'])[[*sent_cols, *emo_cols]].mean()

In [None]:
EMO_BOOKS = pd.merge(LIB_F2[['Title','Date']], EMO_BOOKS, left_index=True, right_index=True)

# Set the index of the merged DataFrame to 'Title'
EMO_BOOKS.index = EMO_BOOKS['Date'].astype(str) + ": " + EMO_BOOKS['Title'].str[:100]

In [None]:
EMO_BOOKS

In [None]:
EMO_BOOKS.to_csv('{}/BOOKS_sent.csv'.format(data_out))

In [None]:
EMO_BOOKS[[*sent_cols,*emo_cols]].plot.barh(figsize=(15,30));

In [None]:
trust_words = pd.Series(VOCAB_sent[VOCAB_sent['trust'] == 1].index.tolist())