# TFIDF
- Brigitte Hogan (bwh5v@virginia.edu) & Jason Tiezzi (jbt5am@virginia.edu)  
- DS 5001: Exploratory Text Analytics
- April 2020  

<font color = gray>

## Overview

This notebook creates 3 TFIDF tables for our corpus:  
    
    1. TFIDF with bag of Recipe (TFIDF_recipe.csv)  
    
    2. TFIDF with bag of Book (`TFIDF_book.csv`)  
    
    3. TFIDF with bag of Period (`'TFIDF_timeperiod.csv'`)  

---
# Setup

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
from nltk.stem.porter import PorterStemmer
import os

## Configs

In [2]:
#os.chdir('C:/Users/Jason/Documents/Data Science/Spring 2020/Text Analytics/final_project/DS5001-Text-Analytics/')
file_dir = 'C:/Users/Jason/Documents/Data Science/Spring 2020/Text Analytics/final_project/DS5001-Text-Analytics/'
data_dir = 'Tables/'
#os.chdir(file_dir)

## Functions

### Defining the TFIDF Function


In [3]:
def TFIDF(TOKEN, OHCO='book', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str'): 

    #specifying the term type
    TOKEN = TOKEN[~TOKEN.term_str.isna()] #removing blank term_str
    stemmer1 = PorterStemmer()
    TOKEN['stem'] = TOKEN.term_str.apply(stemmer1.stem) #adding stems
    print('Term Type: ', term_type)

    
    #specifying the bag size, customized to our text
    OHCO1 = ['period', 'book_year','book_id', 'vol_num','chap_num', 'recp_num','para_num', 'sent_num', 'token_num']
    sentence = OHCO1[:8]
    paragraph = OHCO1[:7]
    recipe = OHCO1[:6]
    chapter = OHCO1[:5]
    volume = OHCO1[:4]
    book = OHCO1[:3]
    book_year = OHCO1[:2]
    period = OHCO1[:1]
    if OHCO == 'chapter':
        bag = chapter
    elif OHCO == 'sentence':
        bag = sentence
    elif OHCO == 'paragraph':
        bag = paragraph
    elif OHCO == 'book':
        bag = book
    elif OHCO == 'recipe':
        bag = recipe
    elif OHCO == 'volume':
        bag = volume
    elif OHCO == 'period':
        bag = period
    elif OHCO == 'book_year':
        bag = book_year
    print('Bag Level: ', OHCO)

    #creating the document-term matrix
    BOW = TOKEN.groupby(bag+[term_type]).term_str.count()\
    .to_frame().rename(columns={'term_str':'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')
    DTCM = BOW[count_method].unstack().fillna(0).astype('int')
    print('Count method: ', count_method)

    #specifying the TF method and calculating TF
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log':
        TF = np.log10(1 + DTCM.T)
    elif tf_method == 'raw':
        TF = DTCM.T
    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0] 
    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    print('TF method: ', tf_method)
   
    #calculating DF and TFIDF
    DF = DTCM[DTCM > 0].count()
    N = DTCM.shape[0]
    
    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF) 
    elif idf_method == 'smooth':
        IDF = np.log10((1 + N) / (1 + DF)) + 1 
    print('IDF method: ', idf_method)    

    TF = TF.T
    TFIDF = TF * IDF
    
    return TFIDF

---
# Calculating TFIDF for our corpus

### Reading in the data and preprocessing

In [4]:
TOKEN = pd.read_csv(data_dir + 'TOKEN.csv')

In [5]:
TOKEN.sample(20, random_state=2887)

Unnamed: 0,book_id,vol_num,chap_num,recp_num,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str
1002620,54568,2,16,157.0,1,3,10,"('time', 'NN')",NN,time,time
702288,28681,2,8,0.0,12,0,29,"('up', 'RP')",RP,up,up
325684,9938,3,156,269.0,1,1,27,"('all', 'DT')",DT,all,all
113804,9936,2,30,84.0,0,6,5,"(',', ',')",",",",",
747960,28681,3,24,18.0,52,0,12,"('No', 'UH')",UH,No,no
268429,9937,5,208,408.0,0,4,0,"('Besides', 'IN')",IN,Besides,besides
1034432,54568,2,29,289.0,7,1,52,"('aside', 'RB')",RB,aside,aside
281432,9938,1,6,19.0,0,3,14,"('serve', 'VB')",VB,serve,serve
30538,9935,2,51,135.0,0,4,25,"('that', 'IN')",IN,that,that
886083,28681,5,37,37.0,32,0,6,"('light', 'JJ')",JJ,light,light


In [6]:
TOKEN.shape

(1130904, 11)

In [7]:
LIB = pd.read_csv(data_dir + 'LIB.csv')
LIB

Unnamed: 0,book_id,author_last,author_full,book_year,book_title,book_file
0,9935,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 1",Cookbooks/WIDAS1923_WILCV01_pg9935.txt
1,9936,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 2",Cookbooks/WIDAS1923_WILCV02_pg9936.txt
2,9937,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 3",Cookbooks/WIDAS1923_WILCV03_pg9937.txt
3,9938,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 4",Cookbooks/WIDAS1923_WILCV04_pg9938.txt
4,9939,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 5",Cookbooks/WIDAS1923_WILCV05_pg9939.txt
5,10582,Bradley,Alice Bradley,1923,For Luncheon and Supper Guests,Cookbooks/Bradley1923_FLSG_pg10582.txt
6,12519,Randolf,Mary Randolph,1860,The Virginia Housewife,Cookbooks/Randolf1860_VAHousewife_pg12519.txt
7,15464,Goudiss,Alice Bradley,1918,Foods That Will Win The War And How To Cook,Cookbooks/Goudiss1918_War_pg15464.txt
8,19077,Hill,Janet McKenzie Hill,1909,"Salads, Sandwiches and Chafing - Dish Dainties",Cookbooks/Hill1909_SSCDD_pg19077.txt
9,24205,Murrey,Thomas J. Murrey,1885,Breakfast Dainties,Cookbooks/Murrey1885_Bfast_pg24205.txt


### Merging in the library table since it contains the book year

In [8]:
TOKEN1 = pd.merge(TOKEN,LIB,on='book_id', how="inner")
TOKEN1.sample(10)

Unnamed: 0,book_id,vol_num,chap_num,recp_num,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,author_last,author_full,book_year,book_title,book_file
755610,28681,3,25,19.0,60,0,27,"(',', ',')",",",",",,Kitchiner,William Kitchiner,1830,The Cook's Oracle; and Housekeeper's Manual,Cookbooks/Kitchiner1830_TCO_pg28681.txt
243165,9937,3,149,286.0,1,0,19,"('.', '.')",.,.,,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 3",Cookbooks/WIDAS1923_WILCV03_pg9937.txt
625958,19077,2,16,240.0,0,0,1,"('-', ':')",:,-,,Hill,Janet McKenzie Hill,1909,"Salads, Sandwiches and Chafing - Dish Dainties",Cookbooks/Hill1909_SSCDD_pg19077.txt
526670,12519,0,7,96.0,1,0,78,"('scraped', 'JJ')",JJ,scraped,scraped,Randolf,Mary Randolph,1860,The Virginia Housewife,Cookbooks/Randolf1860_VAHousewife_pg12519.txt
1069419,61185,0,3,0.0,5,2,3,"('extravagance', 'NN')",NN,extravagance,extravagance,Payne,Arthur Gay Payne,1877,Common - Sense Papers on Cookery,Cookbooks/Payne1877_CSPC_pg61185.txt
296415,9938,1,53,107.0,0,4,22,"('the', 'DT')",DT,the,the,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 4",Cookbooks/WIDAS1923_WILCV04_pg9938.txt
909603,29519,0,5,113.0,1,4,16,"('a', 'DT')",DT,a,a,Hooper,Mary Hooper,1892,Nelson's Home Comforts,Cookbooks/Hooper1892_NHC_pg29519.txt
784112,28681,3,26,20.0,237,0,45,"('it', 'PRP')",PRP,it,it,Kitchiner,William Kitchiner,1830,The Cook's Oracle; and Housekeeper's Manual,Cookbooks/Kitchiner1830_TCO_pg28681.txt
215349,9937,2,79,134.0,1,6,39,"('done', 'VBN')",VBN,done,done,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 3",Cookbooks/WIDAS1923_WILCV03_pg9937.txt
511858,12519,0,1,2.0,1,1,85,"('three', 'CD')",CD,three,three,Randolf,Mary Randolph,1860,The Virginia Housewife,Cookbooks/Randolf1860_VAHousewife_pg12519.txt


### Adding in a feature for the rough period of time

In [9]:
per = []
for i in range(len(TOKEN1)):
    if TOKEN1.book_year[i] >= 1900: #creating three groups -- those after 1900, those in the late 1800s, and those in the mid 1800s
        value = '1900s'
    elif TOKEN1.book_year[i] >= 1875:
        value = "late1800s"
    else:
        value = "mid1800s"
    per.append(value)
TOKEN1['period'] = per
TOKEN1.sample(10)

Unnamed: 0,book_id,vol_num,chap_num,recp_num,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str,author_last,author_full,book_year,book_title,book_file,period
571656,15464,2,8,59.0,5,0,10,"('cooking', 'NN')",NN,cooking,cooking,Goudiss,Alice Bradley,1918,Foods That Will Win The War And How To Cook,Cookbooks/Goudiss1918_War_pg15464.txt,1900s
1122399,61185,0,18,0.0,7,0,20,"('.', '.')",.,.,,Payne,Arthur Gay Payne,1877,Common - Sense Papers on Cookery,Cookbooks/Payne1877_CSPC_pg61185.txt,late1800s
904301,29519,0,4,81.0,1,0,25,"('follows', 'VBZ')",VBZ,follows,follows,Hooper,Mary Hooper,1892,Nelson's Home Comforts,Cookbooks/Hooper1892_NHC_pg29519.txt,late1800s
334285,9938,3,182,324.0,0,6,0,"('mace', 'NN')",NN,mace,mace,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 4",Cookbooks/WIDAS1923_WILCV04_pg9938.txt,1900s
509806,10582,0,9,124.0,1,0,10,"('1', 'CD')",CD,1,1,Bradley,Alice Bradley,1923,For Luncheon and Supper Guests,Cookbooks/Bradley1923_FLSG_pg10582.txt,1900s
1108853,61185,0,14,0.0,43,0,3,"('the', 'DT')",DT,the,the,Payne,Arthur Gay Payne,1877,Common - Sense Papers on Cookery,Cookbooks/Payne1877_CSPC_pg61185.txt,late1800s
53666,9935,3,121,231.0,0,5,5,"('fat', 'NN')",NN,fat,fat,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 1",Cookbooks/WIDAS1923_WILCV01_pg9935.txt,1900s
529323,12519,0,7,116.0,1,0,4,"(',', ',')",",",",",,Randolf,Mary Randolph,1860,The Virginia Housewife,Cookbooks/Randolf1860_VAHousewife_pg12519.txt,mid1800s
176634,9936,6,190,389.0,1,1,13,"('rather', 'RB')",RB,rather,rather,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 2",Cookbooks/WIDAS1923_WILCV02_pg9936.txt,1900s
424367,9939,3,106,317.0,0,2,17,"('in', 'IN')",IN,in,in,WIDAS,Woman's Institute of Domestic Arts and Sciences,1923,"Woman's Institute Library of Cookery, Vol. 5",Cookbooks/WIDAS1923_WILCV05_pg9939.txt,1900s


## Applying the Function

### Running the function with a bag of book

Note: I defined all of the arguments (even though it's not required by the function) just so it would be clear what specifications we used.

In [10]:
TFIDF_book = TFIDF(TOKEN=TOKEN1, OHCO='book', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str') 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Term Type:  term_str
Bag Level:  book
Count method:  n
TF method:  sum
IDF method:  standard


In [11]:
TFIDF_book.sum().sort_values(ascending=False).head(30)

#looking at the top 30 words in our corpus by TFIDF -- interesting that we got some numbers in there!

term_str
12               0.010532
1                0.009640
curry            0.008969
teaspoon         0.008843
tablespoons      0.007332
tsp              0.006166
antonini         0.005320
cup              0.005207
14               0.005086
c                0.004756
p                0.004661
nelson           0.004642
tb               0.004391
smith            0.003997
metabolism       0.003850
tablespoon       0.003781
teaspoons        0.003732
syrup            0.003596
j                0.003342
4                0.003191
stuffs           0.003133
blazer           0.003121
till             0.003036
calories         0.002940
madras           0.002933
teaspoonful      0.002919
foods            0.002870
gill             0.002839
3                0.002827
tablespoonful    0.002749
dtype: float64

### Running TFIDF with a bag of recipe


In [12]:
TFIDF_recp = TFIDF(TOKEN=TOKEN1, OHCO='recipe', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str') 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Term Type:  term_str
Bag Level:  recipe
Count method:  n
TF method:  sum
IDF method:  standard


In [13]:
TFIDF_recp.sum().sort_values(ascending=False).head(30)
#looking at TFIDF by recipe

term_str
1         25.807714
is        18.428777
cup       18.059645
it        17.931239
them      17.303845
be        16.731512
are       16.177557
c         16.170029
2         16.137305
milk      16.041030
water     15.611171
12        15.214541
as        15.057910
add       15.005058
the       14.919713
sugar     14.177607
butter    13.546018
for       13.489683
that      13.470927
cream     13.446396
of        13.203503
or        13.176509
with      13.100030
salt      13.007485
they      12.901848
a         12.451291
flour     12.357812
salad     12.221847
to        11.958100
sauce     11.916914
dtype: float64

### Running TFIDF with a bag of book period

In [14]:
TFIDF_timeperiod = TFIDF(TOKEN1, OHCO="period", count_method='n', tf_method='sum', idf_method='standard', term_type='term_str')
        



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Term Type:  term_str
Bag Level:  period
Count method:  n
TF method:  sum
IDF method:  standard


In [15]:
TFIDF_timeperiod.sum().sort_values(ascending=False).head(30)


term_str
tsp               0.000777
obs               0.000682
tb                0.000554
flavor            0.000335
teaspoon          0.000299
nelson            0.000291
sirup             0.000282
amount            0.000252
teaspoonful       0.000228
tablespoons       0.000191
yelks             0.000184
nb                0.000172
shown             0.000168
etc               0.000162
tablespoonful     0.000156
tablespoon        0.000149
qt                0.000145
color             0.000142
canned            0.000141
colour            0.000135
protein           0.000133
yelk              0.000132
yolks             0.000127
saltspoon         0.000122
tablespoonfuls    0.000120
tamis             0.000115
¼                 0.000109
mayonnaise        0.000108
drachm            0.000108
savoury           0.000106
dtype: float64

---
# Writing to CSV

In [None]:
TFIDF_book.to_csv(data_dir + 'TFIDF_book.csv')
TFIDF_recp.to_csv(data_dir + 'TFIDF_recp.csv')
TFIDF_timeperiod.to_csv(data_dir + 'TFIDF_timeperiod.csv')