In [1]:
#TextHero
#Under the hood, Texthero makes use of multiple NLP and machine learning toolkits such as Gensim, NLTK, SpaCy and scikit-learn. You don't need to install them all separately; pip will take care of that.

#Texthero includes tools for:

#Preprocess text data: it offers out-of-the-box solutions but is also flexible enough for custom solutions.
#Natural Language Processing: keyphrases and keywords extraction, and named entity recognition.
#Text representation: TF-IDF, term frequency, and custom word-embeddings (wip)
#Vector space analysis: clustering (K-means, Meanshift, DBSCAN and Hierarchical), topic modelling (wip) and interpretation.
#Text visualization: vector space visualization, place localization on maps (wip).

In [2]:
!pip install TextHero



In [8]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
import pandas as pd
import texthero as hero

In [15]:
# Read reuters.csv from the working directory into `data`, then preview the
# first five rows.
data = pd.read_csv('reuters.csv')
data.head(n=5)

Unnamed: 0,headline_text,category
0,Southern European bond yields hit multi-week lows,3
1,BRIEF-LG sells its entire stake in unit LG Lif...,6
2,BRIEF-Golden Wheel Tiandi says unit confirms s...,3
3,BRIEF-Sunshine 100 China Holdings Dec contract...,3
4,Euro zone stocks start 2017 with new one-year ...,3


In [17]:
# Lowercase every headline. The result is only displayed; nothing is assigned
# back to `data`.
data['headline_text'].pipe(hero.preprocessing.lowercase)

0      southern european bond yields hit multi-week lows
1      brief-lg sells its entire stake in unit lg lif...
2      brief-golden wheel tiandi says unit confirms s...
3      brief-sunshine 100 china holdings dec contract...
4      euro zone stocks start 2017 with new one-year ...
                             ...                        
994    brief-htm international to issue 1st series se...
995    brief-revlon says expects to eliminate about 3...
996                          twitter's china chief quits
997    peru demands cash from odebrecht ahead of plea...
998            corrected-forex-dollar resumes its ascent
Name: headline_text, Length: 999, dtype: object

In [19]:
# Strip punctuation from the headlines. The result is only displayed; nothing
# is assigned back to `data`.
data['headline_text'].pipe(hero.preprocessing.remove_punctuation)

0      Southern European bond yields hit multi week lows
1      BRIEF LG sells its entire stake in unit LG Lif...
2      BRIEF Golden Wheel Tiandi says unit confirms s...
3      BRIEF Sunshine 100 China Holdings Dec contract...
4      Euro zone stocks start 2017 with new one year ...
                             ...                        
994    BRIEF HTM International to issue 1st series se...
995    BRIEF Revlon says expects to eliminate about 3...
996                          Twitter s China chief quits
997    Peru demands cash from Odebrecht ahead of plea...
998            CORRECTED FOREX Dollar resumes its ascent
Name: headline_text, Length: 999, dtype: object

In [21]:
# Drop stop words from the headlines. The result is only displayed; nothing is
# assigned back to `data`.
data['headline_text'].pipe(hero.preprocessing.remove_stop_words)

0      Southern European bond yields hit multi-week lows
1      BRIEF-LG sells  entire stake  unit LG Life Sci...
2      BRIEF-Golden Wheel Tiandi says unit confirms s...
3      BRIEF-Sunshine 100 China Holdings Dec contract...
4      Euro zone stocks start 2017  new one-year high...
                             ...                        
994    BRIEF-HTM International  issue 1st series secu...
995    BRIEF-Revlon says expects  eliminate  350 posi...
996                           Twitter' China chief quits
997    Peru demands cash  Odebrecht ahead  plea deal ...
998               CORRECTED-FOREX-Dollar resumes  ascent
Name: headline_text, Length: 999, dtype: object

In [23]:
# Remove digits from the headlines. In the output below, a standalone number
# such as "100" disappears while "1st" is kept. The result is only displayed;
# nothing is assigned back to `data`.
data['headline_text'].pipe(hero.preprocessing.remove_digits)

0      Southern European bond yields hit multi-week lows
1      BRIEF-LG sells its entire stake in unit LG Lif...
2      BRIEF-Golden Wheel Tiandi says unit confirms s...
3      BRIEF-Sunshine China Holdings Dec contracted s...
4      Euro zone stocks start with new one-year high....
                             ...                        
994    BRIEF-HTM International to issue 1st series se...
995    BRIEF-Revlon says expects to eliminate about p...
996                          Twitter's China chief quits
997    Peru demands cash from Odebrecht ahead of plea...
998            CORRECTED-FOREX-Dollar resumes its ascent
Name: headline_text, Length: 999, dtype: object

In [25]:
# Demonstrate remove_square_brackets on a one-element Series: the square
# brackets and everything between them are removed.
s = pd.Series(["Texthero [is not a superhero!]"])
s.pipe(hero.preprocessing.remove_square_brackets)

0    Texthero 
dtype: object

In [14]:
# Build a 'pca' column from the headlines: clean -> TF-IDF -> PCA.
data['pca'] = (
    data['headline_text']
    .pipe(hero.clean)       # clean the raw headline text
    .pipe(hero.do_tfidf)    # vectorize the cleaned text
    .pipe(hero.do_pca)      # PCA over the TF-IDF vectors
)
# The data is loaded from reuters.csv, so the title names Reuters headlines
# rather than "BBC Sport news".
hero.scatterplot(data, 'pca', color='category', title="PCA Reuters news headlines")

In [13]:
# Vectorize the cleaned headlines once and keep the result in data['tfidf'] so
# both the clustering and the PCA step below can reuse it.
data['tfidf'] = (
    data['headline_text']
    .pipe(hero.clean)
    .pipe(hero.do_tfidf)
)

# K-means with 5 clusters on the TF-IDF vectors; labels are cast to str
# (presumably so the plot treats them as discrete groups — TODO confirm).
# NOTE(review): no random seed is passed, so cluster labels may differ between
# runs — confirm whether do_kmeans accepts one.
data['kmeans_labels'] = (
    data['tfidf']
    .pipe(hero.do_kmeans, n_clusters=5)
    .astype(str)
)

# PCA coordinates used for plotting; this overwrites any existing 'pca' column.
data['pca'] = data['tfidf'].pipe(hero.do_pca)

# The data is loaded from reuters.csv, so the title names Reuters headlines
# rather than "BBC Sport news".
hero.scatterplot(data, 'pca', color='kmeans_labels', title="K-means Reuters news headlines")