In [None]:
import textvisualizer as tv
import pandas as pd
import re
import plotly.graph_objects as go
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords

In [None]:
# Load the 'train' subset of the 20 Newsgroups dataset through scikit-learn.
# The returned object is used below via its .data, .target and .target_names attributes.
newsgroups = fetch_20newsgroups(subset='train')

In [None]:
def remove_url(string):
    """Strip http/https URLs from ``string``.

    Parameters
    ----------
    string : object
        Text to clean. Anything that is not a ``str`` (e.g. ``None`` or NaN
        coming out of a DataFrame column) is returned unchanged, so the
        function can be passed straight to ``Series.apply``.

    Returns
    -------
    object
        The text with every ``http://...`` / ``https://...`` token removed,
        or the original value when it is not a string.
    """
    if not isinstance(string, str):
        return string
    # Require the "://" scheme separator so that ordinary words which merely
    # start with "http" (e.g. "httpd") are not deleted along with real URLs.
    return re.sub(r'https?://\S+', '', string)

def strip_html_tags(text):
    """Remove every ``<...>`` span from ``text``.

    The match is non-greedy and ``.`` does not cross newlines, so a span must
    open and close on the same line. Anything wrapped in angle brackets is
    removed, not only real HTML tags.

    Parameters
    ----------
    text : object
        Text to clean. Non-``str`` values are returned unchanged, matching the
        behaviour of ``remove_url`` and ``remove_numbers`` so all three
        cleaners can be applied to the same column safely.

    Returns
    -------
    object
        The text without angle-bracketed spans, or the original value when it
        is not a string.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'<.*?>', '', text)

def remove_numbers(string):
    """Delete every run of ASCII digits from ``string``.

    Values whose type is not exactly ``str`` are handed back untouched.
    """
    if type(string) is not str:
        return string
    digit_runs = re.compile('[0-9]+')
    return digit_runs.sub('', string)

In [None]:
# One row per post; the raw message text lives in the 'text' column.
df = pd.DataFrame({'text': newsgroups.data})
df.head()

In [None]:
# Run the three cleaners in order (URLs, then <...> spans, then digits) and
# store the result next to the raw text.
df['text_clean'] = (
    df['text']
    .apply(remove_url)
    .apply(strip_html_tags)
    .apply(remove_numbers)
)

In [None]:
# NLTK's English stop words followed by extra tokens to ignore for this corpus.
mystopwords = [
    *stopwords.words('english'),
    'ax', 'edu', 'com', 'would', 'nntp', 'ac', 'co', 'gv', 'bf', 'db',
    'tin', 'apr', 'gmt', 'na', 'pl', 'di', 'inc', 'gov', 'max', 'acs', 'cs',
    'subject', 'lines', 'organization', 'writes', 'article', 'one',
]

In [None]:
# Attach the newsgroup name of every post.
# `newsgroups.target` holds one integer class id per document, so each id
# indexes `target_names` directly. Indexing `target` a second time would look
# up the class of an unrelated document and mislabel every row.
df['labels'] = [newsgroups.target_names[class_id] for class_id in newsgroups.target]

# Using functions

## Frequencies

In [None]:
# Term-frequency plot over the cleaned posts with the library's default
# n-gram range, skipping the custom stop-word list.
fig = tv.frequencyPlot(
    listText=df['text_clean'].tolist(),
    stopwords=mystopwords,
)
fig.show()

In [None]:
# Same plot with ngramRange=(2, 2) (presumably two-word phrases only; confirm
# against the textvisualizer docs).
fig = tv.frequencyPlot(
    listText=df['text_clean'].tolist(),
    ngramRange=(2, 2),
    stopwords=mystopwords,
)
fig.show()

In [None]:
# Same plot with ngramRange=(3, 3) (presumably three-word phrases only; confirm
# against the textvisualizer docs).
fig = tv.frequencyPlot(
    listText=df['text_clean'].tolist(),
    ngramRange=(3, 3),
    stopwords=mystopwords,
)
fig.show()

## Phrase net

In [None]:
# Phrase net built around the single connector "and".
# NOTE(review): this reads the raw `text` column, not `text_clean` as the
# frequency plots do. Confirm that is intended.
fig = tv.phraseNet(
    connectors=["and"],
    listText=df['text'].tolist(),
)
fig.show()

In [None]:
# Phrase net using several connectors at once.
# NOTE(review): this reads the raw `text` column, not `text_clean`. Confirm
# that is intended.
fig = tv.phraseNet(
    connectors=["at", "for", "to", "from"],
    listText=df['text'].tolist(),
)
fig.show()

In [None]:
# Phrase net with a two-word connector.
# NOTE(review): this reads the raw `text` column, not `text_clean`. Confirm
# that is intended.
fig = tv.phraseNet(
    connectors=["for the"],
    listText=df['text'].tolist(),
)
fig.show()

## WordCloud

In [None]:
# Word cloud over all posts joined into one space-separated string; the second
# positional argument is the custom stop-word list.
# NOTE(review): this reads the raw `text` column, not `text_clean`. Confirm
# that is intended.
tv.wordcloudPlot(
    ' '.join(df['text'].tolist()),
    mystopwords,
)

# Using the Corpus class

## Corpus

In [None]:
# Wrap the cleaned posts and their newsgroup labels in a Corpus object, so the
# plots below can be filtered with `labels=`.
c = tv.Corpus(
    df['text_clean'].tolist(),
    df['labels'].tolist(),
)

In [None]:
# Frequency plot for the corpus with no `labels` filter passed.
c.frequencyPlot(stopwords=mystopwords)

In [None]:
# Frequency plot with `labels` given as a single newsgroup name.
c.frequencyPlot(
    stopwords=mystopwords,
    labels="sci.space",
)

In [None]:
# Frequency plot with `labels` given as a list of newsgroup names.
c.frequencyPlot(
    stopwords=mystopwords,
    labels=["rec.autos", "rec.motorcycles"],
)

In [None]:
# Phrase net around "at" with number_of_pairs=15 and no `labels` filter passed.
c.phraseNet(
    connectors=["at"],
    number_of_pairs=15,
)

In [None]:
# Same phrase net with `labels` given as a single newsgroup name.
c.phraseNet(
    connectors=["at"],
    number_of_pairs=15,
    labels="sci.space",
)

In [None]:
# Same phrase net with `labels` given as a list of newsgroup names.
c.phraseNet(
    connectors=["at"],
    number_of_pairs=15,
    labels=["rec.autos", "rec.motorcycles"],
)

In [None]:
# Word cloud for the corpus, ignoring the custom stop-word list.
c.wordcloudPlot(stopwords=mystopwords)

In [None]:
# Number of posts per newsgroup label.
df['labels'].value_counts()