In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px
from nltk.stem.porter import PorterStemmer
import os

# Calculating TFIDF for our corpus

In [2]:
#os.chdir('C:/Users/Jason/Documents/Data Science/Spring 2020/Text Analytics/final_project/DS5001-Text-Analytics/Tables')
TOKEN = pd.read_csv('TOKEN.csv')

### Reading in the data

In [11]:
TOKEN.sample(20, random_state=2887)

Unnamed: 0,book_id,vol_num,chap_num,recp_num,para_num,sent_num,token_num,pos_tuple,pos,token_str,term_str
1036397,54568,2,36,390.0,0,0,20,"(',', ',')",",",",",
411713,9939,2,98,273.0,0,3,22,"('sharp', 'JJ')",JJ,sharp,sharp
846868,28681,4,33,494.0,1,0,4,"('in', 'IN')",IN,in,in
545713,12519,0,15,368.0,1,0,78,"('strain', 'VB')",VB,strain,strain
1003044,54568,2,25,268.0,0,2,32,"('the', 'DT')",DT,the,the
227082,9937,3,125,220.0,1,1,23,"('into', 'IN')",IN,into,into
536739,12519,0,11,265.0,1,0,95,"('paste', 'NN')",NN,paste,paste
909321,32472,0,2,0.0,73,0,60,"('45.0', 'CD')",CD,45.0,450
963565,54568,1,11,71.0,0,0,8,"('of', 'IN')",IN,of,of
818874,28681,3,30,398.0,2,0,29,"('it', 'PRP')",PRP,it,it


### Defining the TFIDF Function


In [21]:
def TFIDF(TOKEN, OHCO='book', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str'): 

    #specifying the term type
    TOKEN = TOKEN[~TOKEN.term_str.isna()] #removing blank term_str
    stemmer1 = PorterStemmer()
    TOKEN['stem'] = TOKEN.term_str.apply(stemmer1.stem) #adding stems
    print('Term Type: ', term_type)

    
    #specifying the bag size, customized to our text
    OHCO1 = ['book_id', 'vol_num','chap_num', 'recp_num','para_num', 'sent_num', 'token_num']
    sentence = OHCO1[:6]
    paragraph = OHCO1[:5]
    recipe = OHCO1[:4]
    chapter = OHCO1[:3]
    volume = OHCO1[:2]
    book = OHCO1[:1]
    if OHCO == 'chapter':
        bag = chapter
    elif OHCO == 'sentence':
        bag = sentence
    elif OHCO == 'paragraph':
        bag = paragraph
    elif OHCO == 'book':
        bag = book
    elif OHCO == 'recipe':
        bag = recipe
    elif OHCO == 'volume':
        bag = volume
    print('Bag Level: ', OHCO)

    #creating the document-term matrix
    BOW = TOKEN.groupby(bag+[term_type]).term_str.count()\
    .to_frame().rename(columns={'term_str':'n'})
    BOW['c'] = BOW.n.astype('bool').astype('int')
    DTCM = BOW[count_method].unstack().fillna(0).astype('int')
    print('Count method: ', count_method)

    #specifying the TF method and calculating TF
    if tf_method == 'sum':
        TF = DTCM.T / DTCM.T.sum()
    elif tf_method == 'max':
        TF = DTCM.T / DTCM.T.max()
    elif tf_method == 'log':
        TF = np.log10(1 + DTCM.T)
    elif tf_method == 'raw':
        TF = DTCM.T
    elif tf_method == 'double_norm':
        TF = DTCM.T / DTCM.T.max()
        TF = tf_norm_k + (1 - tf_norm_k) * TF[TF > 0] 
    elif tf_method == 'binary':
        TF = DTCM.T.astype('bool').astype('int')
    print('TF method: ', tf_method)
   
    #calculating DF and TFIDF
    DF = DTCM[DTCM > 0].count()
    N = DTCM.shape[0]
    
    if idf_method == 'standard':
        IDF = np.log10(N / DF)
    elif idf_method == 'max':
        IDF = np.log10(DF.max() / DF) 
    elif idf_method == 'smooth':
        IDF = np.log10((1 + N) / (1 + DF)) + 1 
    print('IDF method: ', idf_method)    

    TF = TF.T
    TFIDF = TF * IDF
    
    return TFIDF

### Running the function with a bag of book

Note: I defined all of the arguments (even though it's not required by the function) just so it would be clear what specifications we used.

In [22]:
TFIDF_book = TFIDF(TOKEN=TOKEN, OHCO='book', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str') 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Term Type:  term_str
Bag Level:  book
Count method:  n
TF method:  sum
IDF method:  standard


In [23]:
TFIDF_book.sum().sort_values(ascending=False).head(30)

#looking at the top 30 words in our corpus by TFIDF -- interesting that we got some numbers in there!

term_str
12             0.009100
1              0.009049
curry          0.008863
teaspoon       0.006679
tablespoons    0.006457
tsp            0.006200
antonini       0.005337
cup            0.005202
14             0.004794
nelson         0.004729
p              0.004687
c              0.004598
tb             0.004414
smith          0.004010
saladbowl      0.004005
metabolism     0.004002
j              0.003354
teaspoons      0.003243
syrup          0.003195
blazer         0.003173
4              0.003101
till           0.003057
madras         0.002966
foods          0.002885
stuffs         0.002833
gill           0.002828
3              0.002639
halfapint      0.002635
basal          0.002617
calories       0.002561
dtype: float64

### Running TFIDF with a bag of recipe


In [24]:
TFIDF_recp = TFIDF(TOKEN=TOKEN, OHCO='recipe', count_method='n', tf_method='sum', idf_method='standard', term_type='term_str') 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Term Type:  term_str
Bag Level:  recipe
Count method:  n
TF method:  sum
IDF method:  standard


In [25]:
TFIDF_recp.sum().sort_values(ascending=False).head(30)
#looking at TFIDF by recipe

term_str
1         26.508159
it        20.730153
them      20.710833
is        19.997702
cup       19.413025
be        18.383906
are       17.918670
milk      17.331250
2         17.306352
c         17.117471
as        16.767356
water     16.734672
add       16.646113
the       15.848324
sugar     15.476664
or        14.838175
that      14.791185
for       14.780923
of        14.589839
butter    14.578949
with      14.411424
they      14.384563
12        14.159247
no        14.127763
salt      14.027344
cream     14.000373
a         13.826816
salad     13.552913
flour     13.386859
to        13.274817
dtype: float64

### Writing to CSV

In [26]:
TFIDF_book.to_csv('TFIDF_book.csv')
TFIDF_recp.to_csv('TFIDF_recp.csv')