# Preprocessing from package

## Imports packages and sample .txt

Checking virtualenv with `pyenv`

In [1]:
!pyenv virtualenvs

  3.7.7/envs/IIPE (created from /Users/alexisgourdol/.pyenv/versions/lewagon)
  3.7.7/envs/article2db (created from /Users/alexisgourdol/.pyenv/versions/3.7.7)
  3.7.7/envs/lewagon (created from /Users/alexisgourdol/.pyenv/versions/3.7.7)
* IIPE (created from /Users/alexisgourdol/.pyenv/versions/lewagon)
  article2db (created from /Users/alexisgourdol/.pyenv/versions/3.7.7)
  lewagon (created from /Users/alexisgourdol/.pyenv/versions/3.7.7)


Importing necessary packages

In [2]:
#!pip install nltk

In [3]:
import os
import pandas as pd
from IIPE.preproc import make_contents_df, make_tokens
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

Select the data_samples to read .txt files

In [4]:
print(os.getcwd())
os.chdir(os.path.join('..', 'IIPE', 'data_sample', 'plain_text_sample'))
print(os.getcwd())

/Users/alexisgourdol/code/alexisgourdol/IIPE-data/notebooks
/Users/alexisgourdol/code/alexisgourdol/IIPE-data/IIPE/data_sample/plain_text_sample


In [5]:
file_names = os.listdir()
file_names

['Reports_Plain text_03220F_15_11_2019.txt',
 'Reports_Plain text_01300Q_08_10_2020.txt',
 'Results\\Word count\\Whole sample.png',
 'Reports_Plain text_03917V_23_09_2020.txt',
 'Reports_Plain text_05933G_08_10_2020.txt',
 'Reports_Plain text_07518E_15_12_2020.txt']

## Use our preprocessing functions

In [6]:
df_contents = make_contents_df(file_names)
df_contents

Unnamed: 0,date,reference,text
0,2019-11-15,03220F,"Whole-School Evaluation – Management, Leadersh..."
1,2020-10-08,01300Q,"Whole-School Evaluation – Management, Leadersh..."
2,2020-09-23,03917V,"Whole-School Evaluation – Management, Leadersh..."
3,2020-10-08,05933G,"Whole-School Evaluation – Management, Leadersh..."
4,2020-12-15,07518E,"Whole-School Evaluation – Management, Leadersh..."


In [7]:
tokens = make_tokens(df_contents)
print(len(tokens), ' tokens available. Here are the 5 first in no particular order: ')
tokens[:5]

4040  tokens available. Here are the 5 first in no particular order: 


['evaluation', 'leadership', 'date', 'inspection', 'inspection']

In [10]:
from IIPE.constants import ALL_STOP_WORDS
from sklearn.feature_extraction import text

In [14]:
all_stop_words = text.ENGLISH_STOP_WORDS.union(ALL_STOP_WORDS)
type(all_stop_words), len(all_stop_words)

(frozenset, 398)

In [15]:
vectorizer = TfidfVectorizer().fit(df_contents['text'])

data_vectorized = vectorizer.transform(df_contents['text'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('the', 0.5043432782910858), ('of', 0.5043235898301934), ('and', 0.5043173740137139), ('to', 0.504246364895143), ('in', 0.5042323008293276), ('school', 0.504212296386216), ('pupils', 0.5041910954025344), ('is', 0.5041532723032424), ('learning', 0.5040995702859548), ('for', 0.5038913292821663)]
Topic 1:
[('the', 2.8700750994892403), ('of', 2.608140402596899), ('and', 2.4265474828374383), ('to', 1.7005286524812306), ('in', 1.65505920474929), ('school', 1.5144521845212247), ('pupils', 1.4617358444676518), ('is', 1.3782118639161736), ('learning', 1.2396058079979895), ('for', 0.997152044021224)]


In [16]:
vectorizer = TfidfVectorizer(stop_words=all_stop_words).fit(df_contents['text'])

data_vectorized = vectorizer.transform(df_contents['text'])

lda_model = LatentDirichletAllocation(n_components=2).fit(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)


Topic 0:
[('good', 0.5030952616665783), ('quality', 0.5030606191072694), ('leadership', 0.5029480532086316), ('development', 0.5029146845710577), ('skills', 0.5028311104615628), ('overall', 0.5028187117684683), ('principal', 0.5028096494995531), ('ensure', 0.5027500126708212), ('needs', 0.502745050318159), ('education', 0.5027438637019254)]
Topic 1:
[('good', 1.7637517043053446), ('quality', 1.5970291165339665), ('leadership', 1.2259758741460305), ('development', 1.0960112639191455), ('skills', 1.0449508574621453), ('principal', 1.0150251404910915), ('overall', 0.9982457526773381), ('needs', 0.9528415819557876), ('range', 0.9355884494672351), ('ensure', 0.929247922107451)]


## Topic modelling