# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Create Tables
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [1]:
# Importing required libraries
import os
from collections import Counter

import numpy as np
import pandas as pd
from numpy.linalg import norm

# sklearn
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM

In [2]:
# Define the base path.
# The project root can be overridden with the DS5001_BASE_PATH environment
# variable so the notebook also runs on other machines; the original location
# is kept as the default when the variable is unset or empty.
base_path = os.environ.get("DS5001_BASE_PATH") or "C:/Users/Andre/Box/DS5001 Final Project/"
# Later cells build file names with `base_path + "output/..."`, so the value
# must end with a path separator.
if not base_path.endswith(("/", "\\")):
    base_path += "/"

## Import Data

In [3]:
# Read the LIB, CORPUS and VOCAB tables from pipe-delimited CSV files and
# index each one by its key column(s).
LIB = (
    pd.read_csv(base_path + "output/LIB.csv", sep="|")
    .set_index('observationid')
)
CORPUS = (
    pd.read_csv(base_path + "output/CORPUS.csv", sep="|")
    .set_index(['observationid', 'sentence_num', 'token_num'])
)
VOCAB = (
    pd.read_csv(base_path + "output/VOCAB.csv", sep="|")
    .set_index('term_str')
)

In [4]:
# Swap missing (NaN) term_str values for an empty string so that every token
# row carries a string term.
# NOTE(review): pandas' default NA parsing in read_csv turns literal tokens
# such as "NA" or "null" into NaN, so those words may be folded into '' here
# — confirm whether that is intended.
term_column = CORPUS['term_str']
CORPUS['term_str'] = term_column.where(term_column.notna(), '')

# Create tables

## Bag-of-words (BOW)

In [5]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """
    Build a bag-of-words table from a token-level corpus.

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table containing the `item_type` column; the names listed in
        `bag` must be resolvable by `groupby` (index levels or columns).
    bag : list of str
        Grouping keys that define one "bag" (e.g. ['observationid']).
    item_type : str, default 'term_str'
        Column whose values are counted within each bag.

    Returns
    -------
    pandas.DataFrame
        A single column 'n' holding the number of non-null `item_type`
        values for each (bag..., item_type) combination.
    """
    group_keys = bag + [item_type]
    item_counts = CORPUS.groupby(group_keys)[item_type].count()
    return item_counts.to_frame('n')

In [6]:
# Count terms per document: each observationid forms one bag.
BOW_document = create_bow(CORPUS, ['observationid'])
BOW_document

Unnamed: 0_level_0,Unnamed: 1_level_0,n
observationid,term_str,Unnamed: 2_level_1
0,%,1
0,'',3
0,'re,1
0,'s,2
0,(,4
...,...,...
19007,questioning,1
19007,students,1
19007,technique,1
19007,the,1


In [7]:
# Write the document-level bag-of-words table as a pipe-delimited CSV,
# keeping the (observationid, term_str) index.
BOW_document.to_csv(path_or_buf=base_path + "output/BOW_document.csv", sep="|", index=True)

## Create Document-Term Count Matrix (DTCM), DFIDF, and TFIDF

In [8]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """
    Compute TF-IDF and related tables from a bag-of-words table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Bag-of-words table with a count column `n` and a two-level index
        (document, term); `BOW.n.unstack()` must give a document x term matrix.
    tf_method : {'sum', 'max', 'log', 'raw', 'bool'}, default 'max'
        Term-frequency weighting:
        'sum'  - count divided by the document's total count;
        'max'  - count divided by the document's largest count;
        'log'  - log2(count + 1);
        'raw'  - the raw count;
        'bool' - 1 if the term occurs in the document, otherwise 0.
    df_method : {'standard', 'textbook', 'sklearn', 'sklearn_smooth'}, default 'standard'
        Inverse-document-frequency formula (see the branches below).
    item_type : str, default 'term_str'
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        (TFIDF, DFIDF, DTCM, TF, DF, IDF) where TFIDF and DTCM have missing
        cells filled with 0, TF is the weighted matrix before filling, and
        DF, IDF and DFIDF are per-term Series.

    Raises
    ------
    ValueError
        If `tf_method` or `df_method` is not one of the supported names.
    """
    # Document-term count matrix; cells are NaN where a term is absent.
    DTCM = BOW.n.unstack()

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(DTCM + 1)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # Casting the NaN-holding matrix straight to bool would turn every
        # absent term into True (NaN is truthy), so test for a positive count.
        TF = (DTCM > 0).astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    # Document frequency: count() skips NaN, so this relies on absent terms
    # still being NaN (i.e. it must run before DTCM is zero-filled).
    DF = DTCM.count()
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log2(N_docs / DF)  # This is what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs / (DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs / DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1) / (DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by term, so it lines up with TF's columns.
    TFIDF = (TF * IDF).fillna(0)

    DFIDF = DF * IDF

    DTCM = DTCM.fillna(0)

    return TFIDF, DFIDF, DTCM, TF, DF, IDF

In [9]:
# Derive the TF-IDF tables from the document-level bag of words, using
# max-normalized term frequency.
TFIDF, DFIDF, DTCM, TF, DF, IDF = get_tfidf(BOW_document, tf_method="max")

In [10]:
# Write the document x term TF-IDF matrix as a pipe-delimited CSV (index kept).
TFIDF.to_csv(path_or_buf=base_path + "output/TFIDF.csv", sep="|", index=True)

In [11]:
# Write the per-term DFIDF values as a pipe-delimited CSV (index kept).
DFIDF.to_csv(path_or_buf=base_path + "output/DFIDF.csv", sep="|", index=True)

In [12]:
# Write the zero-filled document-term count matrix as a pipe-delimited CSV.
DTCM.to_csv(path_or_buf=base_path + "output/DTCM.csv", sep="|", index=True)

In [13]:
# Attach the per-term statistics to VOCAB. Each value is a Series indexed by
# term, so pandas aligns it to VOCAB's term_str index.
VOCAB = VOCAB.assign(
    df=DF,
    idf=IDF,
    dfidf=DFIDF,
    mean_tfidf=TFIDF.mean(),  # column-wise mean, i.e. averaged over documents
)

In [14]:
# Write the enriched VOCAB table back to output/VOCAB.csv; this is the same
# file VOCAB was loaded from, so the file on disk is overwritten.
VOCAB.to_csv(path_or_buf=base_path + "output/VOCAB.csv", sep="|", index=True)

In [15]:
# Rank the vocabulary by DFIDF (highest first) and keep the first 20 rows as
# the most significant words.
vocab_by_dfidf = VOCAB.sort_values(by='dfidf', ascending=False)
top_20_significant = vocab_by_dfidf.iloc[:20]
top_20_significant

Unnamed: 0_level_0,n,porter_stem,stop,max_pos,max_pos_group,ngram_length,df,idf,dfidf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
were,12607,were,True,VBD,VERB,1,6769.0,1.488453,10075.339571,0.11055
as,12852,as,True,IN,OTHER,1,6720.0,1.498935,10072.840937,0.108078
you,32801,you,True,PRP,OTHER,1,6677.0,1.508196,10070.223731,0.197062
student,11985,student,False,NN,NOUN,1,6307.0,1.590442,10030.917461,0.115394
are,11629,are,True,VBP,VERB,1,6275.0,1.59778,10026.072244,0.111555
that,15046,that,True,IN,OTHER,1,7812.0,1.281704,10012.671234,0.112913
was,15364,wa,True,VBD,VERB,1,7934.0,1.259347,9991.663055,0.125271
good,12605,good,False,JJ,ADJECTIVE,1,5970.0,1.669665,9967.89984,0.103054
classroom,8795,classroom,False,NN,NOUN,1,5940.0,1.676933,9960.981814,0.099954
her,15665,her,True,PRP$,OTHER,1,5925.0,1.680581,9957.440898,0.191286


## Create a Reduced and Normalized TFIDF_L2

In [23]:
# Settings for building the reduced TF-IDF matrix.
# presumably the document-level grouping key; it is not referenced by the
# cells visible in this section — TODO confirm it is still needed.
bag = 'observationid'
# VOCAB column used to rank terms when picking the reduced vocabulary.
vocab_filter = 'dfidf'
# Number of top-ranked terms to keep.
n_terms = 1000
# NOTE(review): these tags are matched exactly against VOCAB.max_pos, so
# other tags (e.g. VBD, VBP) are not selected — confirm this is intended.
pos_list = ['NN', 'VB', 'JJ'] # Limit to nouns, verbs, and adjectives

In [24]:
# Select the reduced vocabulary: keep terms whose max_pos tag is in pos_list,
# rank them by the vocab_filter column (descending), and take the index labels
# of the first n_terms rows.
pos_mask = VOCAB.max_pos.isin(pos_list)
ranked_vocab = VOCAB.loc[pos_mask].sort_values(vocab_filter, ascending=False)
VIDX = ranked_vocab.head(n_terms).index

In [25]:
# Restrict TF-IDF to the selected terms and average within each observationid.
M = (
    TFIDF[VIDX]
    .fillna(0)  # MUST FILLNA
    .groupby('observationid')
    .mean()
)

In [26]:
# L2-normalize each document row (Euclidean norm).
# A row that is all zeros has norm 0; dividing by it would give a row of NaN
# (and a RuntimeWarning). Such rows are divided by 1 instead, so they stay
# all-zero in the normalized matrix.
row_norms = M.apply(norm, axis=1)
TFIDF_L2 = M.div(row_norms.where(row_norms > 0, 1.0), axis=0)

In [27]:
# Display the normalized matrix; the rendered view is truncated by pandas.
TFIDF_L2

term_str,student,good,classroom,job,teacher,be,learning,class,work,time,...,pass,rotation,steady,immediate,informative,moon,comfort,spring,volunteer,real-life
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.106177,0.000000,0.000000,0.000000,0.0,0.031508,0.000000,0.000000,0.032964,0.069303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.000000,0.000000,0.000000,0.023661,0.0,0.051749,0.077774,0.026445,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.000000,0.092932,0.093337,0.000000,0.0,0.000000,0.000000,0.107395,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.045063,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.476333,0.0,0.0
4,0.092386,0.096988,0.000000,0.000000,0.0,0.164494,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19003,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.217200,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
19004,0.156218,0.082000,0.082357,0.000000,0.0,0.000000,0.092896,0.000000,0.096999,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
19005,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.433303,0.000000,0.155414,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
19006,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [28]:
# Write the reduced, L2-normalized TF-IDF matrix as a pipe-delimited CSV.
TFIDF_L2.to_csv(path_or_buf=base_path + "output/TFIDF_L2.csv", sep="|", index=True)