# Metadata

```yaml
Course:   DS5001: Exploratory Text Analytics
Topic:    Final Project, Data Prep
Author:   Andrew Avitabile
Date:     24 March 2024 (Edited April 25, 2024)
```

# Set Up

## Packages

In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
from collections import Counter

#nltk packages
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

#Sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Downloading necessary data from nltk
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('vader_lexicon')

#spacy for POS tagging
import spacy
# Load the English NLP model
nlp = spacy.load('en_core_web_sm')

# Creating a list of stop words for later use
stop_words = set(stopwords.words('english'))

# Initialize Porter Stemmer
stemmer = PorterStemmer()

In [2]:
# Define the base path
base_path = "C:/Users/yaj3ma/Box/DS5001 Final Project/"

## Import Data

In [3]:
eval_text = pd.read_excel(base_path + "data/eval_text.xlsx")

# Parse Data

## Initial Data Cleaning

In [4]:
# Counting documents written by each supervisor
n_documents = eval_text.groupby('supervisor_id').size().rename('n_documents')

# Counting PSTs evaluated by each supervisor
n_psts = eval_text.groupby('supervisor_id')['uin'].nunique().rename('n_psts')

# Merge the counts back to the original DataFrame, preserving the original index
eval_text = eval_text.merge(n_psts, left_on='supervisor_id', right_index=True)
eval_text = eval_text.merge(n_documents, left_on='supervisor_id', right_index=True)

In [5]:
eval_text.set_index('observationid')

Unnamed: 0_level_0,uin,supervisor_id,observationdate,overallcomments,overallrating_num,observation_order,program_cl,gradelevel_cl,n_psts,n_documents
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3863,620009838,612,2013-09-26,"Stephanie, you did a nice job of leading discu...",3.0,2,SPED,Missing,285,612
4778,821002819,612,2013-10-17,"Haley, you did a nice job of establishing the ...",3.0,2,SPED,Missing,285,612
5363,121006491,612,2013-10-31,"Amanda, you stated your lesson's objective cle...",3.0,2,SPED,Missing,285,612
4317,221004921,612,2013-10-08,"Julie, we talked after your lesson about what ...",3.0,1,SPED,Missing,285,612
5060,621003468,612,2013-10-24,"Faye, you did a terrific job using time and vi...",3.0,1,SPED,Missing,285,612
...,...,...,...,...,...,...,...,...,...,...
15229,420003595,94,2015-04-07,Great lesson and great response from students....,4.0,3,KINE,All,5,20
12754,719007523,94,2015-02-08,Ryan showed that he is growing more and more c...,3.0,4,KINE,All,5,20
12749,420003595,94,2015-02-04,Lindsay did a great job! Her confidence in th...,4.0,1,KINE,All,5,20
10744,119005662,94,2014-10-28,Damion prepared a great lesson and is learning...,3.0,2,HLTH,All,5,20


## Creating CORPUS

In [6]:
#Tokenize with SciKitLearn
engine = CountVectorizer()
model = engine.fit_transform(eval_text.overallcomments)

In [7]:
# Load your spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a simple POS grouping function adapted for spaCy tags
def pos_group(tag):
    if tag.startswith('N'):
        return 'NOUN'
    elif tag.startswith('V'):
        return 'VERB'
    elif tag.startswith('J'):
        return 'ADJECTIVE'
    elif tag.startswith('R'):
        return 'ADVERB'
    else:
        return 'OTHER'

# Initialize the list to collect token data
long_format_data = []

# Ensure eval_text['observationid'] is indeed the column you want to iterate with the text
# Iterate through each row in the DataFrame using itertuples for better performance and direct column access
for row in eval_text.itertuples():
    document = getattr(row, 'overallcomments')  # Replace 'overallcomments' with the actual column name if different
    observation_id = getattr(row, 'observationid')  # Replace 'observationid' with the actual column name if different
    doc = nlp(document)
    for sentence_num, sentence in enumerate(doc.sents):
        for token_num, token in enumerate(sentence):
            long_format_data.append({
                'observationid': observation_id,
                'sentence_num': sentence_num + 1,
                'token_num': token_num + 1,
                'token_str': token.text.lower(),   # Terms are stored in lower case
                'term_str': token.text,            # Original token as it appears
                'pos': token.tag_,                 # POS tag using spaCy's fine-grained tags
                'pos_group': pos_group(token.tag_) # Grouped POS tag
            })

# Create DataFrame from long-format data
CORPUS = pd.DataFrame(long_format_data)

In [8]:
CORPUS.set_index(['observationid', 'sentence_num', 'token_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token_str,term_str,pos,pos_group
observationid,sentence_num,token_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3863,1,1,stephanie,Stephanie,NNP,NOUN
3863,1,2,",",",",",",OTHER
3863,1,3,you,you,PRP,OTHER
3863,1,4,did,did,VBD,VERB
3863,1,5,a,a,DT,OTHER
...,...,...,...,...,...,...
10985,2,5,maximize,maximize,VB,VERB
10985,2,6,on,on,IN,OTHER
10985,2,7,task,task,NN,NOUN
10985,2,8,behavior,behavior,NN,NOUN


In [9]:
CORPUS.to_csv(base_path + "output/CORPUS.csv", sep='|', index=True)

## Creating LIB

In [10]:
# Creating the new DataFrame LIB from eval_text
LIB = eval_text.copy()

# Count the number of sentences per document
sentence_counts = CORPUS.groupby('observationid')['sentence_num'].nunique().rename('sentence_count')

# Count the number of tokens per document
token_counts = CORPUS.groupby('observationid')['token_num'].size().rename('token_count')

# Combine sentence and token counts into a single DataFrame
doc_counts = pd.DataFrame({'sentence_count': sentence_counts, 'token_count': token_counts})

#Merge LIB with document count information
LIB = LIB.join(doc_counts)

# Get a count of the characters in the comments
LIB['char_count'] = eval_text['overallcomments'].str.len()

In [11]:
LIB.set_index('observationid')

Unnamed: 0_level_0,uin,supervisor_id,observationdate,overallcomments,overallrating_num,observation_order,program_cl,gradelevel_cl,n_psts,n_documents,sentence_count,token_count,char_count
observationid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3863,620009838,612,2013-09-26,"Stephanie, you did a nice job of leading discu...",3.0,2,SPED,Missing,285,612,,,752
4778,821002819,612,2013-10-17,"Haley, you did a nice job of establishing the ...",3.0,2,SPED,Missing,285,612,,,467
5363,121006491,612,2013-10-31,"Amanda, you stated your lesson's objective cle...",3.0,2,SPED,Missing,285,612,,,315
4317,221004921,612,2013-10-08,"Julie, we talked after your lesson about what ...",3.0,1,SPED,Missing,285,612,,,630
5060,621003468,612,2013-10-24,"Faye, you did a terrific job using time and vi...",3.0,1,SPED,Missing,285,612,,,244
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15229,420003595,94,2015-04-07,Great lesson and great response from students....,4.0,3,KINE,All,5,20,9.0,126.0,142
12754,719007523,94,2015-02-08,Ryan showed that he is growing more and more c...,3.0,4,KINE,All,5,20,6.0,120.0,188
12749,420003595,94,2015-02-04,Lindsay did a great job! Her confidence in th...,4.0,1,KINE,All,5,20,3.0,25.0,188
10744,119005662,94,2014-10-28,Damion prepared a great lesson and is learning...,3.0,2,HLTH,All,5,20,15.0,487.0,267


In [12]:
LIB['char_count'].fillna(0).mean()

651.7162866621031

In [13]:
LIB.to_csv(base_path + "output/LIB.csv", sep='|', index=True)

## Creating Sentence-Level Data

In [14]:
# Step 1: Aggregate terms to form sentences and count terms
grouped = CORPUS.groupby(['observationid', 'sentence_num'])
SENTENCES = pd.DataFrame({
    'sentence': grouped['term_str'].apply(' '.join),
    'term_count': grouped['term_str'].size()
}).reset_index()

# Flatten the column multi-levels generated by agg
SENTENCES.columns = ['observationid', 'sentence_num', 'sentence', 'term_count']

SENTENCES.set_index(['observationid', 'sentence_num'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence,term_count
observationid,sentence_num,Unnamed: 2_level_1,Unnamed: 3_level_1
35,1,Ms. Simmons led a group lesson addressing writ...,12
36,1,Ms. De Luna worked with students individually ...,20
36,2,Observation comments come from observations of...,17
36,3,Comments below are not from one lesson alone .,9
37,1,Ms. Brecheen led morning circle time instructi...,9
...,...,...,...
43899,9,She also praised the class as a group .,10
43899,10,She did give specific praise to a student who ...,19
43899,11,Ms Early redirected the students by acknowledg...,19
43899,12,"Ms Early had good visuals for the I Do , We Do...",17


In [15]:
SENTENCES.to_csv(base_path + "output/SENTENCES.csv", sep='|', index=True)

## Creating VOCAB

In [16]:
# Calculate Term Frequency across the corpus
CORPUS['term_str'] = CORPUS['token_str'].str.lower()  # normalize to lowercase
TF = CORPUS['term_str'].value_counts().rename('n')

# Calculate Document Frequency
#DF = CORPUS.groupby('term_str')['document_id'].nunique().rename('df')

# Calculate IDF using log scaling
total_documents = CORPUS['observationid'].nunique()
#IDF = np.log(total_documents / DF).rename('idf')

# Calculate DFIDF
#DFIDF = (DF * IDF).rename('dfidf')

# Stemming and identifying stopwords
VOCAB = pd.DataFrame(index=TF.index)
VOCAB['n'] = TF
#VOCAB['df'] = DF
#VOCAB['idf'] = IDF
#VOCAB['dfidf'] = DFIDF
VOCAB['porter_stem'] = VOCAB.index.map(lambda x: stemmer.stem(x))
VOCAB['stop'] = VOCAB.index.isin(stop_words)

# Get max POS and POS group for each term
max_pos = CORPUS.groupby('term_str')['pos'].agg(lambda x: x.value_counts().idxmax()).rename('max_pos')
max_pos_group = CORPUS.groupby('term_str')['pos_group'].agg(lambda x: x.value_counts().idxmax()).rename('max_pos_group')

VOCAB = VOCAB.join(max_pos)
VOCAB = VOCAB.join(max_pos_group)

# Assuming handling of ngrams if applicable
# Here we assume unigram as example; modify if you have actual ngrams data
VOCAB['ngram_length'] = VOCAB.index.map(lambda x: len(x.split()))

In [17]:
VOCAB

Unnamed: 0_level_0,n,porter_stem,stop,max_pos,max_pos_group,ngram_length
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
.,129666,.,False,.,OTHER,1
the,104906,the,True,DT,OTHER,1
and,74281,and,True,CC,OTHER,1
to,69038,to,True,TO,OTHER,1
,66791,,False,_SP,OTHER,0
...,...,...,...,...,...,...
reume,1,reum,False,NNP,NOUN,1
reprieval,1,repriev,False,NN,NOUN,1
8.connected,1,8.connect,False,VBD,VERB,1
1-,1,1-,False,CD,OTHER,1


In [18]:
VOCAB.to_csv(base_path + "output/VOCAB.csv", sep='|', index=True)