# Topic modelling from Ireland schools inspection reports

## Imports

In [99]:
import pandas as pd
import os
from nltk.corpus import stopwords 
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## text preprocessing function

In [100]:
def clean(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
    lowercased = text.lower() # Lower Case
    tokenized = word_tokenize(lowercased) # Tokenize
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    stop = set(stopwords.words('english')) # Make stopword list
    
    newStopWords = set(['school','schools','learning','student','students','pupil','pupils','teacher','teachers','management','managements','teaching','support','suppports', 'lesson','lessons','boards', 'board'])
    stop = stop.union(newStopWords)
 
    without_stopwords = [word for word in words_only if word not in stop] # Remove Stop Words
    
    lemma=WordNetLemmatizer() # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords] # Lemmatize
   
    return lemmatized

## from PDF to raw text

## starting with a small sample of 5 texts from 5 PDFs

In [101]:
list_sample = os.listdir('../../IIPE-data/IIPE/data_sample/plain_text_sample')
list_sample = [file for file in list_sample if file[-4:] == '.txt']
list_sample

['Reports_Plain text_05933G_08_10_2020.txt',
 'Reports_Plain text_03220F_15_11_2019.txt',
 'Reports_Plain text_07518E_15_12_2020.txt',
 'Reports_Plain text_03917V_23_09_2020.txt',
 'Reports_Plain text_01300Q_08_10_2020.txt']

## putting the text into a DataFrame

In [102]:
cols , dfs = [], []
for file in list_sample:
    cols.append('../../IIPE-data/IIPE/data_sample/plain_text_sample/'+file)
for file in list_sample:
    dfs.append(pd.read_csv('../../IIPE-data/IIPE/data_sample/plain_text_sample/'+file,header=None,sep='\t',))
    
sample_df = pd.concat(dfs,)
sample_df.reset_index(inplace=True)   
sample_df.drop(columns='index',inplace=True)
sample_df.rename(columns={0:'text'},inplace=True)
sample_df



Unnamed: 0,text
0,"Whole-School Evaluation – Management, Leadersh..."
1,"Whole-School Evaluation – Management, Leadersh..."
2,"Whole-School Evaluation – Management, Leadersh..."
3,"Whole-School Evaluation – Management, Leadersh..."
4,"Whole-School Evaluation – Management, Leadersh..."


## preprocessing the text

In [103]:
# Apply to all texts
sample_df['clean_text'] = sample_df['text'].apply(clean)

sample_df.drop(columns='text',inplace=True)

sample_df.head()

Unnamed: 0,clean_text
0,"[whole, evaluation, leadership, date, inspecti..."
1,"[whole, evaluation, leadership, date, inspecti..."
2,"[whole, evaluation, leadership, date, inspecti..."
3,"[whole, evaluation, leadership, date, inspecti..."
4,"[whole, evaluation, leadership, date, inspecti..."


In [104]:
sample_df.clean_text[0]

['whole',
 'evaluation',
 'leadership',
 'date',
 'inspection',
 'inspection',
 'activity',
 'undertaken',
 'meeting',
 'principal',
 'leadership',
 'team',
 'meeting',
 'representative',
 'meeting',
 'parent',
 'representative',
 'meeting',
 'review',
 'relevant',
 'document',
 'analysis',
 'parent',
 'questionnaire',
 'observation',
 'examination',
 'work',
 'interaction',
 'feedback',
 'principal',
 'deputy',
 'principal',
 'parent',
 'representative',
 'context',
 'presentation',
 'primary',
 'george',
 'hill',
 'located',
 'dublin',
 'caters',
 'boy',
 'infant',
 'first',
 'class',
 'girl',
 'infant',
 'sixth',
 'class',
 'operates',
 'patronage',
 'catholic',
 'archbishop',
 'dublin',
 'participates',
 'band',
 'delivering',
 'equality',
 'opportunity',
 'deis',
 'action',
 'plan',
 'department',
 'education',
 'skill',
 'educational',
 'inclusion',
 'time',
 'evaluation',
 'enrolled',
 'overall',
 'attendance',
 'rate',
 'good',
 'actively',
 'addressing',
 'improvement',
 'expe

## feature engineering?

## vectorizing the data: TF-IDF  


In [105]:
# the argument passed to the TF-IDF vectorizer must be a list of strings
corpus = []
for text_ in list(sample_df.clean_text):
    corpus.append(' '.join(text_))
corpus

['whole evaluation leadership date inspection inspection activity undertaken meeting principal leadership team meeting representative meeting parent representative meeting review relevant document analysis parent questionnaire observation examination work interaction feedback principal deputy principal parent representative context presentation primary george hill located dublin caters boy infant first class girl infant sixth class operates patronage catholic archbishop dublin participates band delivering equality opportunity deis action plan department education skill educational inclusion time evaluation enrolled overall attendance rate good actively addressing improvement experienced significant turnover staff recent year current staffing includes administrative principal eight mainstream class four special educational need sen access home community liaison hscl co ordinator shared another summary main finding recommendation finding overall quality good demonstrate commendable dispo

In [106]:
tf_idf_vectorizer = TfidfVectorizer(max_df=0.8,min_df=0.1,ngram_range=(2,3))

X = tf_idf_vectorizer.fit_transform(corpus)

In [107]:
pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

Unnamed: 0,abairtí iomlána,abairtí iomlána úsáid,abairtí simplí,abairtí simplí thuiscint,ability apply,ability apply knowledge,ability highly,ability highly effective,ability irish,ability irish good,...,ábalta cumarsáid,ábalta cumarsáid fhiúntach,éagsúlacht cur,éagsúlacht cur bhfeidhm,úsáid ach,úsáid ach téann,úsáid agus,úsáid agus ceisteanna,úsáid mar,úsáid mar bhunús
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.026876,0.026876,0.0,0.0,0.0,0.0,0.0,0.0,0.026876,0.026876
2,0.0,0.0,0.025656,0.025656,0.025656,0.025656,0.025656,0.025656,0.0,0.0,...,0.0,0.0,0.0,0.0,0.025656,0.025656,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.025187,0.025187,0.0,0.0,0.0,0.0,0.0,0.0,0.025187,0.025187,...,0.0,0.0,0.025187,0.025187,0.0,0.0,0.025187,0.025187,0.0,0.0


In [119]:
lda_model = LatentDirichletAllocation(n_components=3).fit(X)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-11:-1]])
        

print_topics(lda_model, tf_idf_vectorizer)

Topic 0:
[('overall quality', 0.46894118846435257), ('good overall', 0.44221359816395506), ('mainstream setting', 0.43186019055022784), ('communication skill', 0.4160063878522351), ('self assessment', 0.410134826534921), ('provided opportunity', 0.410134826534921), ('good practice', 0.40333445765524845), ('overall quality good', 0.40166171384688765), ('quality good overall', 0.398860395515905), ('leadership role', 0.3969425833205068)]
Topic 1:
[('priority need', 0.41995798281331564), ('assessment data', 0.41995269625893683), ('sse process', 0.38724952923649136), ('questionnaire returned', 0.38695396395125015), ('skill progressed', 0.38695396395125015), ('sharing good practice', 0.38695396395125015), ('address priority need', 0.38695396395125015), ('gathered used consistently', 0.38695396395125015), ('gathered used', 0.38695396395125015), ('address priority', 0.38695396395125015)]
Topic 2:
[('approach assessment', 0.41774286726073573), ('require development', 0.4166461677266475), ('good

In [118]:
example = ["please have a positive attitude towards assessment data and written work"]

example_vectorized = tf_idf_vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)
print(lda_vectors)

#print("topic 0 :", lda_vectors[0][0])
#print("topic 1 :", lda_vectors[0][1])

[[0.26693105 0.73306895]]


## vectorizing the data: n-grams