# 3. Text preprocessing

## Setup

In [1]:
%run __init__.py

In [2]:
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()



In [30]:
RANDOM_SEED = 42

## Common pipeline

## COVID-19
bla bla bla

### Loading the dataframe

In [None]:
CORD_DATASET_DIR = os.path.join(DATA_DIR, 'cord19')
CORD19_FILE_PATH = os.path.join(CORD_DATASET_DIR, 'cord19_dataframe.pkl')

cord19_df = pd.read_pickle(CORD19_FILE_PATH)

## Agriculture

### Loading the dataframe

In [4]:
AGRICULTURE_DATASET_DIR = os.path.join(DATA_DIR, 'agriculture')
PMC_FILE_PATH = os.path.join(AGRICULTURE_DATASET_DIR, 'pmc_dataframe.pkl')

pmc_df = pd.read_pickle(PMC_FILE_PATH)

### Preprocessing text

In [5]:
publications = pmc_df['text_cleaned'].values

In [6]:
import numpy as np
import multiprocessing as mp

import string
import spacy 
import en_core_web_sm
from sklearn.base import TransformerMixin, BaseEstimator

nlp = en_core_web_sm.load()
nlp.Defaults.stop_words |= {"et","al", "introduction", "Fig", "fig", "figure"}

class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, n_jobs=1):
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        return self

    def transform(self, X, *args, **kwargs):
        return [self._preprocess_text(text) for text in X]

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        doc = nlp(text)
        return [t.lemma_ for t in doc if len(t.text) > 2 and
                not t.is_stop and t.text not in string.punctuation
                and t.is_alpha and not t.is_digit]

    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]
    
    def _remove_digits(self, doc):
        return [t for t in doc if t.is_alpha and not t.is_digit]

    def _lemmatize(self, doc):
        return [t.lemma_ for t in doc if len(t.text) > 2]
    

In [7]:
preprocessed_texts = TextPreprocessor().fit_transform(publications)

### Common functions
TODO: move to herc_common

In [9]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()



### LDA

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

def dummy(doc):
    return doc


tf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 1))
dtm_tf = tf_vectorizer.fit_transform(preprocessed_text)

In [23]:
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim


def base_scoring_function(vectorizer, texts, model, X, top_n=20):
    return metric_coherence_gensim(measure='c_v', top_n=top_n, 
                                   topic_word_distrib=model.components_, 
                                   dtm=X, 
                                   vocab=np.array([x for x in tf_vectorizer.vocabulary_.keys()]), 
                                   texts=preprocessed_text, return_mean=True)

In [25]:
from functools import partial

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV


lda_scoring_func = lambda clf, X: base_scoring_function(vectorizer=tf_vectorizer,
                                        texts=preprocessed_text, model=clf, X=X)

search_params = {
    'n_components': [5, 10, 15, 20, 25, 30],
    'learning_decay': [.5, .7, .9]
}

search = RandomizedSearchCV(LatentDirichletAllocation(random_state=RANDOM_SEED), 
                            param_distributions=search_params, n_iter=15,
                            scoring=lda_scoring_func)
search.fit(dtm_tf)

RandomizedSearchCV(estimator=LatentDirichletAllocation(random_state=42),
                   n_iter=1,
                   param_distributions={'learning_decay': [0.7],
                                        'n_components': [10]},
                   scoring=<function <lambda> at 0x7f3cedafb950>)

In [26]:
best_lda_model = search.best_estimator_

print(f"Best pipeline parameters: {search.best_params_}")
print(f"Best Topic coherence: {search.best_score_}")

Best pipeline parameters: {'n_components': 10, 'learning_decay': 0.7}
Best Topic coherence: 0.7470491784003583


Visualization. Do this with the best model obtained before.

In [27]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(best_lda_model, tf_feature_names, 10)

Topic #0: plant gene stress expression transgenic protein control line show level
Topic #1: plant soil rice endophyte treatment root sample study increase high
Topic #2: plant gene expression level show protein leave mutant acid type
Topic #3: plant heat metabolite stress ABA light high group temperature yield
Topic #4: plant stress increase soil water rice root high level activity
Topic #5: crop system soil yield base practice high food rice study
Topic #6: genotype gliadin high group LABE show type protein osteoclast modern
Topic #7: plant medicinal take informant decoction Herb orally MED Wild report
Topic #8: plant system food gene sequence fungus fagaceous fungal specie sample
Topic #9: plant growth root cell phytolith sensor node different acid Arabidopsis



In [29]:
pyLDAvis.sklearn.prepare(best_lda_model, dtm_tf, tf_vectorizer, mds='tsne')

In [32]:
from sklearn.pipeline import Pipeline

from sklearn.decomposition import TruncatedSVD

lda_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                         ('vectorizer', CountVectorizer(preprocessor=dummy, tokenizer=dummy)),
                         ('model', LatentDirichletAllocation(**best_lda_model.get_params()))])

lda_pipeline.fit_transform(publications)

### Latent Semantic Analysis (LSA)

In [30]:
from sklearn.decomposition import TruncatedSVD

lsa_scoring_func = lda_scoring_func

lsa_search_params = {
    'n_components': [5, 10, 15, 20, 25, 30],
    'algorithm': ["arpack", "randomized"]
}

lsa_search = RandomizedSearchCV(TruncatedSVD(random_state=RANDOM_SEED),
                                param_distributions=lsa_search_params, n_iter=15,
                                scoring=lsa_scoring_func)
lsa_search.fit(publications)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('preprocessing', TextPreprocessor()),
                ('vectorizer',
                 TfidfVectorizer(preprocessor=<function dummy at 0x7f01195fd7b8>,
                                 tokenizer=<function dummy at 0x7f01195fd7b8>)),
                ('model', TruncatedSVD(n_components=10))]) does not.

In [None]:
best_lsa_model = lsa_search.best_estimator_

print(f"LSA model")
print("-" * 10)
print(f"Best pipeline parameters: {best_lsa_model.best_params_}")
print(f"Best Topic coherence: {best_lsa_model.best_score_}")

In [21]:
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(best_lsa_model, tf_feature_names, 10)

Topic #0: plant stress gene rice soil expression transgenic root increase protein
Topic #1: rice soil crop season emission residue system farm farmer winter
Topic #2: food farmer farm system household farming livestock crop climate diversity
Topic #3: stress heat salt tolerance drought NaCl food ROS salinity ALDH
Topic #4: transgenic gene rice expression emission crop line season protein drought
Topic #5: medicinal salt heat yield plant stress informant NaCl hypertension season
Topic #6: transgenic salt rice sensor food NaCl iot emission node system
Topic #7: emission sensor rice node season iot temperature heat defense Las
Topic #8: medicinal gene mutant informant hypertension wild decoction hydrophilin protein RNAi
Topic #9: endophyte yield sensor node grain root iot bean soil irrigation



Visualization

In [None]:
tf_feature_names = lsa_pipeline['vectorizer'].get_feature_names()
print_top_words(lsa_pipeline['model'], tf_feature_names, 10)

In [16]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

lsa_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                 ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                 ('model', TruncatedSVD(**lsa_search.get_params()))])
lsa_pipeline.fit_transform(publications)

### NMF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = CountVectorizer(preprocessor=dummy, tokenizer=dummy, ngram_range=(1, 1))
dtm_tfidf = tfidf_vectorizer.fit_transform(preprocessed_text)

In [None]:
from sklearn.decomposition import NMF

nmf_scoring_func = lambda clf, X: base_scoring_function(vectorizer=tfidf_vectorizer,
                                                        texts=preprocessed_text, model=clf, X=X)

nmf_search_params = {
    'n_components': [5, 10, 15, 20, 25, 30]
}

nmf_search = RandomizedSearchCV(NMF(random_state=RANDOM_SEED),
                                param_distributions=nmf_search_params, n_iter=15,
                                scoring=nmf_scoring_func)

In [None]:
best_nmf_model = nmf_search.best_estimator_

print(f"NMF model")
print("-" * 10)
print(f"Best pipeline parameters: {best_nmf_model.best_params_}")
print(f"Best Topic coherence: {best_nmf_model.best_score_}")

In [23]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(best_nmf_model, tfidf_feature_names, 10)

Topic #0: plant endophyte defense herbivore metabolite volatile GSL insect touch expression
Topic #1: rice soil emission season crop residue paddy straw winter treatment
Topic #2: food farmer farm system household CSA climate farming crop livestock
Topic #3: stress plant salt heat tolerance NaCl drought gene expression treatment
Topic #4: transgenic plant gene expression line protein sequence transcript overexpressor RNAi
Topic #5: metal toxicity soil plant root concentration heavy acid uptake increase
Topic #6: sensor iot node IoT wireless network communication Smart power Campus
Topic #7: plant infect CMV Las virus infected infection NahG citri symptomatic
Topic #8: medicinal plant informant hypertension decoction Herb traditional specie orally ailment
Topic #9: crop yield soil legume grain bird bean habitat residue wheat



In [58]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

nmf_pipeline = Pipeline([('preprocessing', TextPreprocessor()),
                         ('vectorizer', TfidfVectorizer(preprocessor=dummy, tokenizer=dummy)),
                         ('model', NMF(**best_nmf_model.get_params()))])

nmf_pipeline.fit_transform(publications)