# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Data Prep
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [1]:
# Importing required libraries
import os
from collections import Counter

import numpy as np
import pandas as pd

#nltk packages
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Downloading necessary data from nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('vader_lexicon')

#spacy for POS tagging
import spacy
# Porter stemmer instance, reused later when the vocabulary table is built
stemmer = PorterStemmer()

# NLTK's English stop-word list, held as a set for fast membership checks
stop_words = set(stopwords.words('english'))

# spaCy's small English pipeline, used for sentence splitting and POS tagging
nlp = spacy.load('en_core_web_sm')

In [2]:
# Define the base path
# The project root can be overridden with the DS5001_PROJECT_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# unset or empty, the original local path is used.
base_path = os.environ.get("DS5001_PROJECT_DIR") or "C:/Users/Andre/Box/DS5001 Final Project/"

# Later cells build file names with plain string concatenation
# (base_path + "data/..."), so the path must end with a separator.
if not base_path.endswith(("/", "\\")):
    base_path += "/"

## Import Data

In [3]:
# Load the evaluation spreadsheet from the data/ folder under base_path.
# Later cells read its 'supervisor_id', 'uin', 'observationid' and
# 'overallcomments' columns.
eval_text = pd.read_excel(base_path + "data/eval_text.xlsx")

# Parse Data

## Initial Data Cleaning

In [4]:
# Group once by supervisor and derive both per-supervisor counts from it
by_supervisor = eval_text.groupby('supervisor_id')

# Number of distinct PSTs ('uin') each supervisor evaluated
n_psts = by_supervisor['uin'].nunique().rename('n_psts')

# Number of documents (rows) each supervisor wrote
n_documents = by_supervisor.size().rename('n_documents')

# Attach the counts to every row, matching the 'supervisor_id' column against
# the index of each count Series (n_psts first, then n_documents)
for counts in (n_psts, n_documents):
    eval_text = eval_text.merge(counts, left_on='supervisor_id', right_index=True)

In [5]:
# Display only: set_index returns a new frame and the result is not assigned,
# so eval_text keeps 'observationid' as a regular column for the cells below.
eval_text.set_index('observationid')

Unnamed: 0_level_0,uin,supervisor_id,observationdate,overallcomments,overallrating_num,observation_order,n_psts,n_documents
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
32408,1,56,2018-01-26,Objective: SW recall reasons why the Industri...,2.0,1,54,54
3204,119007252,56,2013-03-28,Children returned from lunch. Counted backward...,4.0,1,54,54
9355,121001063,56,2014-09-17,Short 3 question test. Good review of yesterd...,4.0,1,54,54
9926,122000541,56,2014-09-29,Actively engage more students during whole gro...,3.0,1,54,54
3072,217006305,56,2013-03-27,I thought the lesson went well despite what yo...,3.0,1,54,54
...,...,...,...,...,...,...,...,...
33158,922008437,949,2018-02-13,The focus of the lesson was composing an intro...,3.0,1,1,1
35065,922009906,994,2018-04-03,Hands out student checking app keys for the wa...,3.0,1,1,1
34388,924000055,1137,2018-03-21,Good lesson! Much better job of handling the c...,3.0,1,1,1
22965,924002403,1061,2016-09-29,Nice observation. Good rapport with the class....,3.0,1,1,1


## Creating CORPUS

In [6]:
# Fit a scikit-learn CountVectorizer (default settings) on the raw comments;
# `model` is the matrix returned by fit_transform.
# NOTE(review): neither `engine` nor `model` is referenced again in the cells
# visible here — confirm whether this cell is still needed.
engine = CountVectorizer()
model = engine.fit_transform(eval_text.overallcomments)

In [7]:
# Load the spaCy English pipeline used for sentence splitting and tagging below.
# NOTE(review): the set-up cell at the top already assigns `nlp` from the same
# 'en_core_web_sm' model, so this second load looks redundant — confirm before removing.
nlp = spacy.load("en_core_web_sm")

# Define a simple POS grouping function adapted for spaCy tags
def pos_group(tag):
    """Collapse a fine-grained (Penn Treebank style) tag into a coarse group.

    The group is decided by the two-letter tag family rather than the first
    letter alone, so that tags which merely share an initial are not
    mis-grouped: 'NFP' (superfluous punctuation) is not a noun and 'RP'
    (particle) is not an adverb.

    Parameters
    ----------
    tag : str
        Fine-grained tag such as 'NNS', 'VBD', 'JJR' or 'RB'.

    Returns
    -------
    str
        'NOUN' for NN*, 'VERB' for VB*, 'ADJECTIVE' for JJ*, 'ADVERB' for RB*,
        and 'OTHER' for everything else (including the empty string).
    """
    family_to_group = {
        'NN': 'NOUN',
        'VB': 'VERB',
        'JJ': 'ADJECTIVE',
        'RB': 'ADVERB',
    }
    return family_to_group.get(tag[:2], 'OTHER')

# Initialize the list to collect token data
long_format_data = []

# Column order of the token table; also used so that an empty result still
# has the expected columns.
corpus_columns = ['observationid', 'sentence_num', 'token_num',
                  'token_str', 'term_str', 'pos', 'pos_group']

# Walk the documents in row order, pairing each comment with its observation id.
# This relies on 'observationid' still being a regular column of eval_text.
for observation_id, document in zip(eval_text['observationid'], eval_text['overallcomments']):
    # The spaCy pipeline only accepts text; a missing comment (e.g. NaN from
    # the spreadsheet) would raise, so such rows contribute no tokens.
    if not isinstance(document, str):
        continue
    doc = nlp(document)
    # Sentence and token positions are 1-based; token_num restarts in every sentence.
    for sentence_num, sentence in enumerate(doc.sents, start=1):
        for token_num, token in enumerate(sentence, start=1):
            # NOTE(review): as written, 'token_str' holds the lower-cased text
            # and 'term_str' the original text. The sentence-level cell joins
            # 'term_str' and the VOCAB cell later overwrites it, so the values
            # are left as they were — confirm whether the names are intended.
            long_format_data.append({
                'observationid': observation_id,
                'sentence_num': sentence_num,
                'token_num': token_num,
                'token_str': token.text.lower(),   # lower-cased token text
                'term_str': token.text,            # token text exactly as it appears
                'pos': token.tag_,                 # spaCy fine-grained tag
                'pos_group': pos_group(token.tag_) # coarse group derived from the tag
            })

# Create DataFrame from long-format data (one row per token)
CORPUS = pd.DataFrame(long_format_data, columns=corpus_columns)

In [8]:
# Display only: the re-indexed frame is not assigned, so CORPUS itself keeps
# its default integer index and the three key columns.
CORPUS.set_index(['observationid', 'sentence_num', 'token_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token_str,term_str,pos,pos_group
observationid,sentence_num,token_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,1,objective,Objective,JJ,ADJECTIVE
0,1,2,:,:,:,OTHER
0,1,3,,,_SP,OTHER
0,1,4,sw,SW,NNP,NOUN
0,1,5,recall,recall,NN,NOUN
...,...,...,...,...,...,...
18721,24,24,or,or,CC,OTHER
18721,24,25,pencil,pencil,NN,NOUN
18721,24,26,and,and,CC,OTHER
18721,24,27,paper,paper,NN,NOUN


In [9]:
# Write CORPUS as a pipe-delimited file under output/. index=True writes the
# frame's current index as the first column.
CORPUS.to_csv(base_path + "output/CORPUS.csv", sep='|', index=True)

## Creating LIB

In [10]:
# Creating the new DataFrame LIB from eval_text
LIB = eval_text.copy()

# Per-document counts derived from CORPUS, both indexed by observationid
tokens_by_document = CORPUS.groupby('observationid')

# Count the number of sentences per document
sentence_counts = tokens_by_document['sentence_num'].nunique().rename('sentence_count')

# Count the number of tokens per document
token_counts = tokens_by_document['token_num'].size().rename('token_count')

# Combine sentence and token counts into a single DataFrame
doc_counts = pd.DataFrame({'sentence_count': sentence_counts, 'token_count': token_counts})

# Merge LIB with document count information.
# doc_counts is indexed by observationid, while LIB still carries eval_text's
# row index (observationid is an ordinary column). A plain index-on-index join
# would pair row labels with observation ids, so join on the column instead.
# Documents with no tokens in CORPUS get NaN counts.
LIB = LIB.join(doc_counts, on='observationid')

# Get a count of the characters in the comments (NaN where the comment is missing)
LIB['char_count'] = eval_text['overallcomments'].str.len()

In [11]:
# Display only: the re-indexed frame is not assigned, so LIB keeps
# 'observationid' as a regular column.
LIB.set_index('observationid')

Unnamed: 0_level_0,uin,supervisor_id,observationdate,overallcomments,overallrating_num,observation_order,n_psts,n_documents,sentence_count,token_count,char_count
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32408,1,56,2018-01-26,Objective: SW recall reasons why the Industri...,2.0,1,54,54,20,442,2198
3204,119007252,56,2013-03-28,Children returned from lunch. Counted backward...,4.0,1,54,54,13,127,687
9355,121001063,56,2014-09-17,Short 3 question test. Good review of yesterd...,4.0,1,54,54,3,20,94
9926,122000541,56,2014-09-29,Actively engage more students during whole gro...,3.0,1,54,54,3,44,237
3072,217006305,56,2013-03-27,I thought the lesson went well despite what yo...,3.0,1,54,54,1,50,264
...,...,...,...,...,...,...,...,...,...,...,...
33158,922008437,949,2018-02-13,The focus of the lesson was composing an intro...,3.0,1,1,1,6,93,494
35065,922009906,994,2018-04-03,Hands out student checking app keys for the wa...,3.0,1,1,1,21,141,764
34388,924000055,1137,2018-03-21,Good lesson! Much better job of handling the c...,3.0,1,1,1,22,183,924
22965,924002403,1061,2016-09-29,Nice observation. Good rapport with the class....,3.0,1,1,1,27,219,1160


In [12]:
# Mean comment length in characters, counting a missing length (NaN) as 0.
LIB['char_count'].fillna(0).mean()

651.1416771885522

In [13]:
# Write LIB as a pipe-delimited file under output/. index=True writes the
# frame's current index as the first column.
LIB.to_csv(base_path + "output/LIB.csv", sep='|', index=True)

## Creating Sentence-Level Data

In [14]:
# Rebuild every sentence from its tokens: one row per (observationid, sentence_num)
grouped = CORPUS.groupby(['observationid', 'sentence_num'])
sentence_terms = grouped['term_str']

# 'sentence' is the space-joined 'term_str' values of the sentence and
# 'term_count' is how many tokens it contains. Resetting the index turns the
# two grouping keys back into ordinary columns, which already gives the column
# order observationid, sentence_num, sentence, term_count.
SENTENCES = pd.concat(
    [
        sentence_terms.apply(' '.join).rename('sentence'),
        sentence_terms.size().rename('term_count'),
    ],
    axis=1,
).reset_index()

# Display only: the re-indexed view is not assigned back to SENTENCES
SENTENCES.set_index(['observationid', 'sentence_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence,term_count
observationid,sentence_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,Objective : SW recall reasons why the Indust...,19
0,2,Class began with you posing a question to the ...,21
0,3,( Car ),4
0,4,You introduced the Industrial Revolution and t...,24
0,5,The fact that most students arrive at school e...,30
...,...,...,...
19005,1,The class schedule was switched today because ...,23
19005,2,The University Supervisor was thus not able ...,25
19006,1,Leslie was not in the field today .,8
19007,1,Great lesson ; all students engaged throughout...,11


In [15]:
# Write SENTENCES as a pipe-delimited file under output/. index=True writes the
# frame's current index as the first column.
SENTENCES.to_csv(base_path + "output/SENTENCES.csv", sep='|', index=True)

## Creating VOCAB

In [16]:
# Overwrite CORPUS['term_str'] in place with the lower-cased 'token_str', then
# count how many times each term occurs across the whole corpus.
CORPUS['term_str'] = CORPUS['token_str'].str.lower()
TF = CORPUS['term_str'].value_counts().rename('n')

# Number of distinct documents in the corpus
total_documents = CORPUS['observationid'].nunique()

# TODO: document frequency, IDF and DFIDF are still disabled. The earlier draft
# grouped on a 'document_id' column, which CORPUS does not have.

# VOCAB has one row per term, indexed by the term, starting from its count 'n'
VOCAB = TF.to_frame()

# Porter stem of each term, and whether the term is in the stop-word set
VOCAB['porter_stem'] = [stemmer.stem(term) for term in VOCAB.index]
VOCAB['stop'] = VOCAB.index.isin(stop_words)


def _most_common(values):
    """Return the value that occurs most often in a Series."""
    return values.value_counts().idxmax()


# Most frequent fine-grained tag and coarse POS group observed for each term
tokens_by_term = CORPUS.groupby('term_str')
max_pos = tokens_by_term['pos'].agg(_most_common).rename('max_pos')
max_pos_group = tokens_by_term['pos_group'].agg(_most_common).rename('max_pos_group')

VOCAB = VOCAB.join(max_pos).join(max_pos_group)

# Number of whitespace-separated pieces in the term (0 for a whitespace-only term)
VOCAB['ngram_length'] = [len(term.split()) for term in VOCAB.index]

In [17]:
# Display the vocabulary table (indexed by term).
VOCAB

Unnamed: 0_level_0,n,porter_stem,stop,max_pos,max_pos_group,ngram_length
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
.,129667,.,False,.,OTHER,1
the,104907,the,True,DT,OTHER,1
and,74281,and,True,CC,OTHER,1
to,69038,to,True,TO,OTHER,1
,66793,,False,_SP,OTHER,0
...,...,...,...,...,...,...
aubfey,1,aubfey,False,NNP,NOUN,1
a.was,1,a.wa,False,NNP,NOUN,1
b.why,1,b.whi,False,NN,NOUN,1
c.why,1,c.whi,False,NNPS,NOUN,1


In [18]:
# Write VOCAB as a pipe-delimited file under output/. index=True writes the
# term index as the first column.
VOCAB.to_csv(base_path + "output/VOCAB.csv", sep='|', index=True)