In [None]:
import operator
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from wordcloud import WordCloud

import spacy
# load spaCy's large English pipeline; calling nlp(text) turns a string into a spaCy doc
nlp = spacy.load("en_core_web_lg")

import string
# ASCII punctuation characters, used further down to drop punctuation tokens
punctuation = string.punctuation

In [None]:
%matplotlib inline

## install spacy

### bash

`pip install spacy`

`python -m spacy download en`

`python -m spacy download en_core_web_lg`

### python

`import spacy`

`nlp = spacy.load("en")`

`nlp = spacy.load("en_core_web_lg")`

In [None]:
# fetch the training split of the 20 newsgroups corpus
data = fetch_20newsgroups(subset='train')

# newsgroup names, indexed by integer target
target_names = data.target_names

# one row per post: raw text plus its integer target
df = pd.DataFrame({'text': data.data, 'target': data.target})

# human-readable newsgroup name for every row
df['label'] = [target_names[t] for t in df['target']]

In [None]:
# Keep only the three newsgroups used in the rest of the notebook.
# .copy() makes the subset an independent frame, so the columns added to it
# later (spacy, lemmatized, cluster) do not trigger SettingWithCopyWarning.
keep_labels = ['rec.autos', 'sci.med', 'rec.sport.baseball']
df = df[df['label'].isin(keep_labels)].copy()

In [None]:
# sanity check: only the selected newsgroups should remain
pd.unique(df['label'])

### At this stage `df` has `text`, `target`, and `label` columns. Everything below should work on any DataFrame that has these three columns.

## preprocessing

Add corpus-specific words to sklearn's built-in stop words.

Lemmatize the text as well, so that inflected forms of a word are treated as the same token.

In [None]:
# make spacy docs: run the nlp pipeline on every post and keep the resulting doc
# NOTE(review): this calls the pipeline once per row; batching with nlp.pipe(...) may be faster - confirm before changing

df['spacy'] = df['text'].apply(nlp)

In [None]:
# custom stop words: sklearn's English list plus words that are noise in this corpus
# (mail/news header vocabulary and a few very common verbs)
my_stop_words = text.ENGLISH_STOP_WORDS | frozenset((
    "edu", "ca", "com", "gov",
    "university", "posting", "line", "lines",
    "host", "nntp", "write", "subject",
    "organization", "article", "like", "think",
    "know", "do", "just", "use",
    "say", "from",
))

In [None]:
# lemmatize and filter out stop words
def lemmatize(spacy_doc):
    """Return the lemmas of a spacy doc joined by single spaces.

    Tokens are skipped when they are punctuation (per spaCy's ``is_punct`` or
    because their text is found in ``punctuation``), when their text contains a
    newline, or when their lemma is in ``my_stop_words``.
    """

    def keep(token):
        # same checks, in the same order, expressed as early exits
        if token.is_punct or token.text in punctuation or "\n" in token.text:
            return False
        return token.lemma_ not in my_stop_words

    return " ".join(token.lemma_ for token in spacy_doc if keep(token))

# one lemmatized string per post
df['lemmatized'] = [lemmatize(doc) for doc in df['spacy']]

## clustering

In [None]:
# 2- to 4-gram counts of the lemmatized posts, used as clustering features.
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and only accept 'english', a list or None (a frozenset is rejected).
count_vect = CountVectorizer(stop_words=sorted(my_stop_words),
                             ngram_range=(2, 4))
tfidf_transformer = TfidfTransformer()

# raw n-gram counts, then tf-idf weighting of those counts
counts = count_vect.fit_transform(df['lemmatized'])
tfidf = tfidf_transformer.fit_transform(counts)

In [None]:
from sklearn.cluster import KMeans

# one cluster per selected newsgroup
num_clusters = 3

# fixed random_state so the cluster ids are the same on every run;
# n_init is spelled out so the number of restarts does not depend on the installed version's default
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

km.fit(tfidf)

# cluster id of every row of tfidf, in row order
clusters = km.labels_.tolist()

In [None]:
# attach the cluster id to each post (clusters is in the same row order as df)
df = df.assign(cluster=clusters)

In [None]:
# the labels the clusters are compared against below
pd.unique(df['label'])

In [None]:
# for each newsgroup, show how its posts are spread over the clusters
for newsgroup, posts in df.groupby('label', sort=False):
    print(newsgroup)
    print(posts.cluster.value_counts())
    print()

## classification

In [None]:
# features are the lemmatized posts, targets are the integer newsgroup ids
X, y = df['lemmatized'], df['target']

## bernoulli naive bayes classifier
Used for binary features, i.e., whether a word occurs in a document, _not_ how many times it occurs.

In [None]:
# create vectorizer with the custom stop words (unigrams only; pass ngram_range to add n-grams)
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and only accept 'english', a list or None (a frozenset is rejected)

count_vect = CountVectorizer(stop_words=sorted(my_stop_words))

# no tfidf because bernoulli does binary, tfidf relies on counts

In [None]:
# create count vectors: learn the vocabulary from X and build the document-term count matrix
counts = count_vect.fit_transform(X)

In [None]:
# train test split (25% held out); fixed random_state so the split, and the score below, are reproducible
X_train, X_test, y_train, y_test = train_test_split(counts, y, test_size=.25, random_state=42)

In [None]:
# fit a Bernoulli naive Bayes model on the training split
# (the features passed in are counts; BernoulliNB's default binarize=0.0 thresholds them to presence/absence)
bern_clf = BernoulliNB().fit(X_train, y_train) # use binary occurrence for bernoulli

In [None]:
# get score on test data (mean accuracy on the held-out split)

bern_clf.score(X_test, y_test)

In [None]:
# get top 10 features for each class and their weights
# weights are log probabilities, which are negative since log of everything in the interval (0,1) is negative

def get_top_10_features(vectorizer, clf, class_labels, class_names):
    """Return the ten highest-weighted features of every class.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older ``get_feature_names()``.
    clf : fitted classifier
        Must expose per-class feature weights as ``feature_log_prob_`` (naive
        Bayes) or ``coef_``, with one row per entry of ``class_labels``.
    class_labels : iterable of int
        The classes the model was trained on, e.g. ``clf.classes_``.
    class_names : sequence of str
        Names indexed by class label, e.g. the full ``target_names`` list.

    Returns
    -------
    dict
        ``{class name: {feature name: weight}}``; each inner dict holds at most
        ten entries ordered from lowest to highest weight.
    """
    # newer scikit-learn only offers get_feature_names_out(); older releases only get_feature_names()
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # naive Bayes models in newer scikit-learn no longer expose coef_; prefer feature_log_prob_ when present
    weights = getattr(clf, "feature_log_prob_", None)
    if weights is None:
        weights = clf.coef_

    top10_features = {}

    for class_weights, class_label in zip(weights, class_labels):
        # look the name up by the class label itself: the labels are not
        # necessarily 0..n-1, so the position in class_labels is the wrong key
        name = class_names[class_label]

        top10_indices = np.argsort(class_weights)[-10:]
        top10_features[name] = {feature_names[j]: class_weights[j] for j in top10_indices}

    return top10_features
        
# top features of the Bernoulli model
top10 = get_top_10_features(
    vectorizer=count_vect,
    clf=bern_clf,
    class_labels=bern_clf.classes_,  # classes (as ints)
    class_names=target_names,        # labels (as strs)
)

In [None]:
# print out features: class name first, then its features from highest to lowest weight

for class_name, weights_by_feature in top10.items():
    print(class_name)
    ascending = sorted(weights_by_feature.items(), key=operator.itemgetter(1))
    for feature, weight in reversed(ascending):
        print(feature, weight)
    print()

## multinomial naive bayes classifier
Used for word counts in each document, not simply occurrence. Note that this notebook feeds the model tf-idf weighted counts rather than raw counts.

In [None]:
# unigram counts with the custom stop words (pass ngram_range=(1, 2) to add bigrams).
# stop_words is passed as a list: recent scikit-learn releases validate this
# parameter and only accept 'english', a list or None (a frozenset is rejected).
count_vect = CountVectorizer(stop_words=sorted(my_stop_words))
tfidf_transformer = TfidfTransformer()

In [None]:
# document-term counts, then tf-idf weighting of those counts
# NOTE(review): both transformers are fitted on all of X before the train/test split below
counts = count_vect.fit_transform(X)
tfidf = tfidf_transformer.fit_transform(counts)

In [None]:
# 25% held out; fixed random_state so the split, and the score below, are reproducible
X_train, X_test, y_train, y_test = train_test_split(tfidf, y, test_size=.25, random_state=42)

In [None]:
# fit a multinomial naive Bayes model; X_train here holds the tf-idf weighted counts from the split above
mn_clf = MultinomialNB().fit(X_train, y_train) # multinomial models per-document term weights, not just occurrence

In [None]:
# mean accuracy of the multinomial model on the held-out split
mn_clf.score(X_test, y_test)

In [None]:
# print_top10 is only defined in a later cell ("old print top 10"), so calling it
# here fails with NameError on a fresh Restart & Run All. Use get_top_10_features,
# which is defined above, and print its result the same way as for the Bernoulli model.
mn_top10 = get_top_10_features(count_vect, mn_clf, mn_clf.classes_, target_names)

for class_name, weights_by_feature in mn_top10.items():
    print(class_name)
    for feature, weight in sorted(weights_by_feature.items(), key=operator.itemgetter(1), reverse=True):
        print(feature, weight)
    print()

## old print top 10

In [None]:
def print_top10(vectorizer, clf, class_labels, class_names=None):
    """Prints features with the highest coefficient values, per class

    For every class three lines are printed: the class name, then
    ``"<class label>: <feature>, <feature>, ..."`` with up to ten features
    ordered from lowest to highest weight, then an empty line.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older ``get_feature_names()``.
    clf : fitted classifier
        Must expose per-class feature weights as ``feature_log_prob_`` (naive
        Bayes) or ``coef_``, with one row per entry of ``class_labels``.
    class_labels : iterable of int
        The classes the model was trained on, e.g. ``clf.classes_``.
    class_names : sequence of str, optional
        Names indexed by class label. Defaults to ``data.target_names``.
    """
    if class_names is None:
        class_names = data.target_names

    # newer scikit-learn only offers get_feature_names_out(); older releases only get_feature_names()
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # naive Bayes models in newer scikit-learn no longer expose coef_; prefer feature_log_prob_ when present
    weights = getattr(clf, "feature_log_prob_", None)
    if weights is None:
        weights = clf.coef_

    for class_weights, class_label in zip(weights, class_labels):
        # index the names by the class label itself: the labels are not
        # necessarily 0..n-1, so the position in class_labels is the wrong key
        print(class_names[class_label])
        top10 = np.argsort(class_weights)[-10:]
        print("%s: %s" % (class_label,
              ", ".join(feature_names[j] for j in top10)))
        print()

# top features of the Bernoulli model, in the old output format
print_top10(vectorizer=count_vect, clf=bern_clf, class_labels=bern_clf.classes_)

# ngrams visualization

In [None]:
# baseball posts only; .copy() gives an independent frame so the columns added
# to bb below do not trigger SettingWithCopyWarning
bb = df[df['label'] == "rec.sport.baseball"].copy()

In [None]:
# make spacy docs
# bb is a subset of df, which already carries a 'spacy' column with the parsed
# posts, so the expensive parsing step only runs if that column is missing

if 'spacy' not in bb.columns:
    bb = bb.assign(spacy=bb['text'].apply(nlp))

In [None]:
# preprocess/clean docs

def preprocess(spacy_doc):
    '''
    Reduce a spacy doc to a space-separated string of lemmas.

    A token is dropped when its text is found in `punctuation`, when its
    part-of-speech tag is one of the excluded tags below (punctuation, symbols,
    pronouns, determiners, numbers, proper nouns, ...), or when its text or its
    lemma is in `my_stop_words`. The lemma of every remaining token is kept.
    '''

    excluded_pos = {'PUNCT', 'SYM', 'X', 'NIL', 'PRON',
                    'SPACE', 'DET', 'NUM', 'PROPN'}

    def wanted(token):
        # same three checks as nested ifs would do, written as early exits
        if token.text in punctuation:
            return False
        if token.pos_ in excluded_pos:
            return False
        return not (token.text in my_stop_words or token.lemma_ in my_stop_words)

    return " ".join(token.lemma_ for token in spacy_doc if wanted(token))

# one cleaned string per baseball post
bb['cleaned'] = [preprocess(doc) for doc in bb['spacy']]

In [None]:
# get bigrams and trigrams

def find_bigrams(s):
    """Return every pair of adjacent whitespace-separated words in ``s`` as a list of 2-tuples."""
    tokens = s.split()
    return [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]


def find_trigrams(s):
    """Return every run of three adjacent whitespace-separated words in ``s`` as a list of 3-tuples."""
    tokens = s.split()
    return [(tokens[i], tokens[i + 1], tokens[i + 2]) for i in range(len(tokens) - 2)]

# general n-gram helper: slide a window of width n over the input
def find_ngrams(input_list, n):
    """Return all runs of ``n`` consecutive items of ``input_list`` as a list of tuples.

    The result is empty when ``n`` is not positive or exceeds the input length.
    """
    if n <= 0:
        return []
    window_count = len(input_list) - n + 1
    return [tuple(input_list[start:start + n]) for start in range(window_count)]

# per-post lists of bigram / trigram tuples
bb['bigrams'] = bb['cleaned'].map(find_bigrams)
bb['trigrams'] = bb['cleaned'].map(find_trigrams)

In [None]:
# flatten the per-post lists into one list of all bigrams / trigrams in the collection

bigrams = [gram for post_grams in bb['bigrams'] for gram in post_grams]

trigrams = [gram for post_grams in bb['trigrams'] for gram in post_grams]

In [None]:
# get bigram and trigram counts
# Counter is a dict subclass mapping each n-gram tuple to its number of occurrences,
# with keys kept in first-seen order, so it replaces the hand-written counting loops

bigram_counts = Counter(bigrams)

trigram_counts = Counter(trigrams)

In [None]:
# order the (n-gram, count) pairs from most to least frequent:
# sort ascending by count, then flip the list

sorted_bigrams = sorted(bigram_counts.items(), key=lambda pair: pair[1])
sorted_bigrams.reverse()

sorted_trigrams = sorted(trigram_counts.items(), key=lambda pair: pair[1])
sorted_trigrams.reverse()

In [None]:
# word cloud of most frequent bigrams

# Map each of the 200 most frequent bigrams (joined with "_") to its count and let
# the cloud size words by that count. A string that lists every bigram once, fed
# to .generate(), would not carry the counts into the picture.
bigram_frequencies = {"_".join(gram): count for gram, count in sorted_bigrams[:200]}

wordcloud = WordCloud(max_font_size=80).generate_from_frequencies(bigram_frequencies)

plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# word cloud of most frequent trigrams

# Map each of the 200 most frequent trigrams (joined with "_") to its count and let
# the cloud size words by that count. A string that lists every trigram once, fed
# to .generate(), would not carry the counts into the picture.
trigram_frequencies = {"_".join(gram): count for gram, count in sorted_trigrams[:200]}

wordcloud = WordCloud(max_font_size=80).generate_from_frequencies(trigram_frequencies)

plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()