# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Data Prep
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [7]:
# Importing required libraries
import pandas as pd
import numpy as np
from collections import Counter

#nltk packages
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Downloading necessary data from nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('vader_lexicon')


# English stop words as a set, used later to flag vocabulary terms.
# Needs the NLTK 'stopwords' corpus (see the commented-out download above).
stop_words = set(stopwords.words('english'))

# Single Porter stemmer instance, reused when stems are added to the vocabulary
stemmer = PorterStemmer()

In [2]:
# Project root folder. Input and output paths in this notebook are built by
# string concatenation onto this value, so the trailing slash must be kept.
base_path = "C:/Users/Andre/Box/DS5001 Final Project/"

## Import Data

In [3]:
# Location of the evaluation-text workbook under the project folder
file_path_eval_text = f"{base_path}Data/eval_text.xlsx"

# Load the Excel workbook into a DataFrame
eval_text = pd.read_excel(file_path_eval_text)

In [4]:
# Replace missing feedback in 'overallcomments' with empty strings so every
# document is a str by the time it reaches the tokenizers.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about
# and which leaves eval_text unchanged when Copy-on-Write is active.
eval_text['overallcomments'] = eval_text['overallcomments'].fillna('')

# Parse Data

## Initial Data Cleaning

In [5]:
# Number the documents 1..N in row order and use that number as the index.
# reset_index() keeps the previous index as an ordinary column.
eval_text = (
    eval_text.reset_index()
    .assign(document_id=lambda frame: range(1, len(frame) + 1))
    .set_index('document_id')
)

# Per-supervisor totals: documents written and distinct PSTs evaluated
by_supervisor = eval_text.groupby('supervisor')
n_documents = by_supervisor.size()
n_psts = by_supervisor['uin_deident'].nunique()

# Attach both totals to every document of the matching supervisor
eval_text = eval_text.join(n_documents.rename('n_documents'), on='supervisor')
eval_text = eval_text.join(n_psts.rename('n_psts'), on='supervisor')

## Creating CORPUS

In [6]:
# Bag-of-words pass with scikit-learn: fit a default CountVectorizer on the
# comments and keep the matrix it returns
engine = CountVectorizer()
model = engine.fit_transform(eval_text['overallcomments'])

NameError: name 'CountVectorizer' is not defined

In [None]:
# Define a simple POS grouping function
def pos_group(tag: str) -> str:
    """Collapse a Penn Treebank POS tag into a coarse word class.

    Args:
        tag: Tag string as returned by the POS tagger (for example ``'NNS'``).

    Returns:
        ``'NOUN'``, ``'VERB'``, ``'ADJECTIVE'`` or ``'ADVERB'`` when the tag
        belongs to that family, otherwise ``'OTHER'`` (this includes
        punctuation tags and the empty string).
    """
    if tag.startswith('N'):
        return 'NOUN'
    if tag.startswith('V'):
        return 'VERB'
    if tag.startswith('J'):
        return 'ADJECTIVE'
    # Match 'RB' rather than 'R': the adverb tags are RB / RBR / RBS, whereas
    # 'RP' marks a particle and must not be counted as an adverb.
    if tag.startswith('RB'):
        return 'ADVERB'
    return 'OTHER'

# One dict per token; these become the rows of CORPUS
long_format_data = []

# Walk document -> sentence -> token, numbering sentences and tokens from 1.
# NOTE(review): the key names look inverted relative to the inline intent:
# 'token_str' receives the lowercased form while 'term_str' keeps the raw
# token. The sentence-building cell joins 'term_str', and the VOCAB cell later
# overwrites 'term_str' with the lowercased 'token_str', so any rename here
# has to be made together with those cells.
for document_id, document in eval_text['overallcomments'].items():
    for sentence_num, sentence in enumerate(sent_tokenize(document), start=1):
        # Tag the sentence's tokens with their part of speech
        tagged_tokens = pos_tag(word_tokenize(sentence))
        for token_num, (token, tag) in enumerate(tagged_tokens, start=1):
            long_format_data.append({
                'document_id': document_id,
                'sentence_num': sentence_num,
                'token_num': token_num,
                'token_str': token.lower(),   # lowercased form
                'term_str': token,            # token exactly as tokenized
                'pos': tag,                   # tag from the POS tagger
                'pos_group': pos_group(tag),  # coarse word class
            })

# Long-format token table: one row per token
CORPUS = pd.DataFrame(long_format_data)

In [None]:
CORPUS

In [None]:
# Save the token table as a pipe-delimited file, row index included
CORPUS.to_csv(f"{base_path}output/CORPUS.csv", sep='|', index=True)

## Creating LIB

In [None]:
# Start LIB from the document-level columns already held in eval_text
LIB = eval_text[['supervisor', 'uin_deident', 'order_alt', 'n_documents', 'n_psts', 'overallcomments']].copy()

# Sentences and tokens per document, counted from the token table
sentence_counts = CORPUS.groupby('document_id')['sentence_num'].nunique().rename('sentence_count')
token_counts = CORPUS.groupby('document_id')['token_num'].size().rename('token_count')

# Combine both counts into one frame keyed by document_id
doc_counts = pd.DataFrame({'sentence_count': sentence_counts, 'token_count': token_counts})

# A document with an empty comment contributes no rows to CORPUS, so a plain
# join would leave its counts missing and turn both columns into floats.
# Reindex on LIB's document ids first so such documents get an explicit 0.
doc_counts = doc_counts.reindex(LIB.index, fill_value=0)

# Merge LIB with the per-document counts (aligned on the document_id index)
LIB = LIB.join(doc_counts)

# Length of each comment in characters
LIB['char_count'] = eval_text['overallcomments'].str.len()

In [None]:
LIB

In [None]:
# Mean comment length in characters, counting any missing length as 0
LIB['char_count'].fillna(0).mean()

In [None]:
# Save the document table as a pipe-delimited file, index included
LIB.to_csv(f"{base_path}output/LIB.csv", sep='|', index=True)

## Creating Sentence-Level Data

In [None]:
# Rebuild each sentence by joining its 'term_str' values with single spaces,
# and count how many tokens it holds
grouped = CORPUS.groupby(['document_id', 'sentence_num'])
sentence_terms = grouped['term_str']
sentence_text = sentence_terms.apply(' '.join).rename('sentence')
term_totals = sentence_terms.size().rename('term_count')
SENTENCES = pd.concat([sentence_text, term_totals], axis=1).reset_index()

# Pin the column names and order explicitly
SENTENCES.columns = ['document_id', 'sentence_num', 'sentence', 'term_count']

# Display only: set_index returns a new frame that is not assigned, so
# SENTENCES itself keeps its default integer index
SENTENCES.set_index(['document_id', 'sentence_num'])

In [None]:
# Save the sentence table as a pipe-delimited file, row index included
SENTENCES.to_csv(f"{base_path}output/SENTENCES.csv", sep='|', index=True)

## Creating VOCAB

In [None]:
# Lowercased term for every token; this overwrites CORPUS's existing
# 'term_str' column in place
CORPUS['term_str'] = CORPUS['token_str'].str.lower()

# Term frequency: occurrences of each term across the whole corpus
TF = CORPUS['term_str'].value_counts().rename('n')

# Document frequency: number of distinct documents that contain each term
by_term = CORPUS.groupby('term_str')
DF = by_term['document_id'].nunique().rename('df')

# Inverse document frequency (natural log) and its product with DF.
# total_documents counts the documents that appear in CORPUS.
total_documents = CORPUS['document_id'].nunique()
IDF = np.log(total_documents / DF).rename('idf')
DFIDF = (DF * IDF).rename('dfidf')

# Build the table on TF's index so rows keep TF's ordering; the other
# statistics are aligned to it by term
VOCAB = TF.to_frame().assign(df=DF, idf=IDF, dfidf=DFIDF)

# Porter stem of each term and a flag for English stop words
VOCAB['porter_stem'] = [stemmer.stem(term) for term in VOCAB.index]
VOCAB['stop'] = VOCAB.index.isin(stop_words)


def _most_frequent(values):
    """Return the value that occurs most often in a Series."""
    return values.value_counts().idxmax()


# Most frequent POS tag and POS group observed for each term
max_pos = by_term['pos'].agg(_most_frequent).rename('max_pos')
max_pos_group = by_term['pos_group'].agg(_most_frequent).rename('max_pos_group')
VOCAB = VOCAB.join(max_pos).join(max_pos_group)

# Number of whitespace-separated pieces in each term (kept in case n-gram
# terms are added to the vocabulary)
VOCAB['ngram_length'] = [len(term.split()) for term in VOCAB.index]

In [None]:
VOCAB

In [None]:
# Save the vocabulary table as a pipe-delimited file, term index included
VOCAB.to_csv(f"{base_path}output/VOCAB.csv", sep='|', index=True)

### Top 20 most significant words

In [None]:
# Twenty terms with the highest DFIDF, largest first
top_20_significant = VOCAB.sort_values('dfidf', ascending=False).iloc[:20]
top_20_significant