ktrain/tests/test_lda.py

#!/usr/bin/env python3
"""
Tests of ktrain LDA topic modeling flows (topic prediction, document scoring, recommendation)
"""
import testenv
import IPython
from unittest import TestCase, main, skip
import ktrain
from ktrain.imports import ACC_NAME, VAL_ACC_NAME

class TestLDA(TestCase):
    """End-to-end checks of the ktrain LDA topic model on the 20 Newsgroups corpus."""

    def test_qa(self):
        """Exercise topic prediction, save/load, document scoring and recommendation.

        The method name is kept as-is so existing test selectors keep working,
        even though the body tests the topic model rather than question answering.
        """
        rawtext = """
            Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees
            the development and manufacturing of advanced rockets and spacecraft for missions
            to and beyond Earth orbit.
            """
        # Topic label expected for `rawtext`, both before and after a save/load cycle.
        expected_topic = 'space nasa earth data launch surface solar moon mission planet'

        # collect data
        import os
        import tempfile
        import numpy as np
        import pandas as pd
        from sklearn.datasets import fetch_20newsgroups
        remove = ('headers', 'footers', 'quotes')
        newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
        newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
        texts = newsgroups_train.data + newsgroups_test.data

        # build and test LDA topic model
        tm = ktrain.text.get_topic_model(texts, n_features=10000)
        tm.build(texts, threshold=0.25)
        texts = tm.filter(texts)
        tags = tm.topics[np.argmax(tm.predict([rawtext]))]
        self.assertEqual(tags, expected_topic)

        # Round-trip the model through disk. A private temporary directory is
        # used instead of a fixed /tmp path so the test is portable, leaves no
        # artifacts behind, and cannot collide with a concurrent run.
        with tempfile.TemporaryDirectory() as tmpdir:
            model_path = os.path.join(tmpdir, 'tm')
            tm.save(model_path)
            tm = ktrain.text.load_topic_model(model_path)
        tm.build(texts, threshold=0.25)
        tags = tm.topics[np.argmax(tm.predict([rawtext]))]
        self.assertEqual(tags, expected_topic)

        # document similarity
        tech_topics = [51, 85, 94, 22]
        # The next two results are not inspected; the calls are retained so
        # these accessors are still exercised by the test.
        tech_probs = tm.get_doctopics(topic_ids=tech_topics)
        doc_ids = [doc['doc_id'] for doc in tm.get_docs(topic_ids=tech_topics)]
        tm.train_scorer(topic_ids=tech_topics)
        other_topics = [i for i in range(tm.n_topics) if i not in tech_topics]
        other_texts = [d['text'] for d in tm.get_docs(topic_ids=other_topics)]
        other_scores = tm.score(other_texts)
        # rank the remaining documents by score, highest first, in a Pandas dataframe
        other_preds = [int(score > 0) for score in other_scores]
        data = sorted(zip(other_preds, other_scores, other_texts),
                      key=lambda item: item[1], reverse=True)
        df = pd.DataFrame(data, columns=['Prediction', 'Score', 'Text'])
        # assertIn reports both operands on failure, unlike assertTrue(x in y)
        self.assertIn('recommendations for a laser printer', df['Text'].values[0])

        # recommender
        tm.train_recommender()
        results = tm.recommend(text=rawtext, n=1)
        self.assertTrue(results[0]['text'].startswith('Archive-name'))


# Allow running this module directly as a script: hands control to the
# `main` function imported from unittest at the top of the file.
if __name__ == "__main__":
    main()