# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Create Tables
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited May 02, 2024)
```

# Set Up

## Packages

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from collections import Counter
from numpy.linalg import norm

# sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM

In [2]:
# Define the base path
base_path = "C:/Users/yaj3ma/Box/DS5001 Final Project/"

## Import Data

In [3]:
LIB = pd.read_csv(base_path + "output/LIB.csv", delimiter = "|").set_index('observationid')
CORPUS = pd.read_csv(base_path + "output/CORPUS.csv", delimiter = "|").set_index(['observationid', 'sentence_num', 'token_num'])
VOCAB = pd.read_csv(base_path + "output/VOCAB.csv", delimiter = "|").set_index('term_str')

In [4]:
# Replace NaN values with an empty string (or another placeholder)
CORPUS['term_str'] = CORPUS['term_str'].fillna('')

# Create tables

## Bag-of-words (BOW)

In [5]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

In [6]:
BOW_document = create_bow(CORPUS, bag=['observationid'])
BOW_document

Unnamed: 0_level_0,Unnamed: 1_level_0,n
observationid,term_str,Unnamed: 2_level_1
35,,1
35,.,1
35,IEP,1
35,Ms.,1
35,Simmons,1
...,...,...
43899,who,1
43899,why,1
43899,work,2
43899,worked,2


In [7]:
BOW_document.to_csv(base_path + "output/BOW_document.csv", sep='|', index=True)

## Create Document-Term Count Matrix (DCTM), DFIDF, and TFIDF

In [8]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
            
    DTCM = BOW.n.unstack() # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count() # Assumes NULLs 
    N_docs = len(DTCM)
    
    if df_method == 'standard':
        IDF = np.log2(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")
    
    TFIDF = TF * IDF
    
    DFIDF = DF * IDF
    
    TFIDF = TFIDF.fillna(0)
    
    DTCM = DTCM.fillna(0)

    return TFIDF, DFIDF, DTCM, TF, DF, IDF

In [9]:
TFIDF, DFIDF, DTCM, TF, DF, IDF = get_tfidf(BOW = BOW_document, tf_method = "max")

In [10]:
TFIDF.to_csv(base_path + "output/TFIDF.csv", sep='|', index=True)

In [11]:
DFIDF.to_csv(base_path + "output/DFIDF.csv", sep='|', index=True)

In [12]:
DTCM.to_csv(base_path + "output/DTCM.csv", sep='|', index=True)

In [13]:
VOCAB['df'] = DF
VOCAB['idf'] = IDF
VOCAB['dfidf'] = DFIDF
VOCAB['mean_tfidf'] = TFIDF.mean()

In [14]:
VOCAB.to_csv(base_path + "output/VOCAB.csv", sep='|', index=True)

In [15]:
# Sorting by DFIDF to find the top 20 significant words
top_20_significant = VOCAB.sort_values(by='dfidf', ascending=False).head(20)
top_20_significant

Unnamed: 0_level_0,n,porter_stem,stop,max_pos,max_pos_group,ngram_length,df,idf,dfidf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
were,12611,were,True,VBD,VERB,1,6770.0,1.488088,10074.35668,0.109376
as,12849,as,True,IN,OTHER,1,6721.0,1.498568,10071.875971,0.106611
you,32847,you,True,PRP,OTHER,1,6681.0,1.50718,10069.468984,0.194305
-,12347,-,False,HYPH,OTHER,1,6424.0,1.563772,10045.671822,0.116169
student,12205,student,False,NN,NOUN,1,6412.0,1.56647,10044.202717,0.114385
are,11637,are,True,VBP,VERB,1,6276.0,1.597399,10025.273723,0.109802
that,15048,that,True,IN,OTHER,1,7815.0,1.280998,10011.000124,0.111746
well,9083,well,False,RB,ADVERB,1,6173.0,1.621272,10008.113157,0.10322
was,15365,wa,True,VBD,VERB,1,7936.0,1.258832,9990.090307,0.124074
good,12688,good,False,JJ,ADJECTIVE,1,6007.0,1.660599,9975.22003,0.101964


## Create a reduced and Normalized TFIDF_L2

In [16]:
bag = 'observationid'
vocab_filter = 'dfidf'
n_terms = 1000
pos_list = ['NN', 'VB', 'JJ'] #Limit to nouns, verbs, and ajectives

In [17]:
VIDX = VOCAB.loc[VOCAB.max_pos.isin(pos_list)]\
    .sort_values(vocab_filter, ascending=False)\
    .head(n_terms).index

In [18]:
M = TFIDF[VIDX].fillna(0).groupby('observationid').mean() # MUST FILLNA

In [19]:
TFIDF_L2 = M.apply(lambda x: x / norm(x), 1) # Euclidean

In [20]:
TFIDF_L2

term_str,student,good,classroom,job,teacher,learning,be,class,work,time,...,nothing,tie,calculate,proficiency,deliberate,range,recognition,complex,insure,elaborate
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,0.000000,0.000000,0.138260,0.00000,0.0,0.000000,0.000000,0.000000,0.162713,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.241623,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,0.000000,0.000000,0.080120,0.08254,0.0,0.000000,0.090075,0.000000,0.188579,0.098808,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43878,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43885,0.000000,0.148337,0.149609,0.00000,0.0,0.084075,0.000000,0.171238,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43888,0.000000,0.000000,0.000000,0.00000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43889,0.000000,0.000000,0.095306,0.00000,0.0,0.000000,0.000000,0.109084,0.000000,0.117536,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
TFIDF_L2.to_csv(base_path + "output/TFIDF_L2.csv", sep='|', index=True)