# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Create Tables
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from collections import Counter


# sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModelForSeq2SeqLM

In [2]:
# Define the base path
base_path = "C:/Users/Andre/Box/DS5001 Final Project/"

## Import Data

In [3]:
LIB = pd.read_csv(base_path + "output/LIB.csv", delimiter = "|")
CORPUS = pd.read_csv(base_path + "output/CORPUS.csv", delimiter = "|")
VOCAB = pd.read_csv(base_path + "output/VOCAB.csv", delimiter = "|")

# Create tables

## Bag-of-words (BOW)

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

In [5]:
BOW_document = create_bow(CORPUS, bag=['document_id'])
BOW_document

Unnamed: 0_level_0,Unnamed: 1_level_0,n
document_id,term_str,Unnamed: 2_level_1
1,%,1
1,'',3
1,'re,1
1,'s,2
1,(,4
...,...,...
11385,well,1
11385,when,1
11385,with,1
11385,word,1


In [6]:
BOW_document.to_csv(base_path + "output/BOW_document.csv", sep='|', index=True)

In [7]:
BOW_sentence = create_bow(CORPUS, bag=['document_id', 'sentence_num'])
BOW_sentence

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n
document_id,sentence_num,term_str,Unnamed: 3_level_1
1,1,.,1
1,1,:,1
1,1,Europe,1
1,1,Industrial,1
1,1,Objective,1
...,...,...,...
11385,2,testing,1
11385,2,the,1
11385,2,to,1
11385,2,when,1


In [8]:
BOW_sentence.to_csv(base_path + "output/BOW_sentence.csv", sep='|', index=True)

## Create a document-term matrix (DTM)

In [9]:
TF = CORPUS.groupby(['document_id', 'term_str']).size().reset_index(name='n')
DTM = TF.pivot(index='document_id', columns='term_str', values='n').fillna(0)
DTM

term_str,!,#,$,%,&,','','16,'1st,'Alright,...,zip,zombieism,zombies,zone,zones,zoning,zoo,zoophonics,{,}
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11384,1.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
DTM.to_csv(base_path + "output/DTM.csv", sep='|', index=True)

## Create TFIDF

In [11]:
# Grouping tokens back into document-level strings
grouped_corpus = CORPUS.groupby('document_id')['term_str'].apply(' '.join).reset_index()

# Using sklearn's TfidfVectorizer to compute TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(grouped_corpus['term_str'])

# Converting the TF-IDF matrix to a DataFrame
TFIDF = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=grouped_corpus['document_id'])

TFIDF

TypeError: sequence item 115: expected str instance, float found

In [None]:
TFIDF.to_csv(base_path + "output/TFIDF.csv", sep='|', index=True)

## Create a reduced and Normalized TFIDF_L2

In [None]:
# Convert the TFIDF DataFrame to a numpy array for processing
TFIDF_array = TFIDF.values

# Apply L2 normalization
TFIDF_normalized = normalize(TFIDF_array, norm='l2', axis=1)

# Convert the normalized array back to a DataFrame
TFIDF_L2 = pd.DataFrame(TFIDF_normalized, index=TFIDF.index, columns=TFIDF.columns)

#Display
TFIDF_L2

In [None]:
TFIDF_L2.to_csv(base_path + "output/TFIDF_L2.csv", sep='|', index=True)