In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
import nltk

# Create a simple classifier that predicts the topic of a text entry

## Currently implemented using the 20 Newsgroups dataset and its categories

In [2]:
# Training split only; shuffled with a fixed seed so the document order is reproducible.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

### The possible classification groups

In [3]:
# Label names; the prediction cell below indexes this list with the predicted class id.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

### Word counts for all the articles

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training posts and build their document-term
# count matrix in one pass. count_vect is kept because new documents must be
# vectorized with this same fitted vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

### Calculate TF-IDF weights to down-weight common, unimportant words

In [5]:
# Fit the TF-IDF weighting on the training counts and apply it to them.
# tfidf_transformer is kept so new documents can be reweighted the same way.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

### Actually create the classifier (Naive Bayes)

In [6]:
from sklearn.naive_bayes import MultinomialNB
# Train on the TF-IDF features; twenty_train.target holds one class id per training post.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

### Test on some data

In [7]:
# A single unseen sentence to classify.
docs_new = ['Im a starman waiting in the sun.']

# Reuse the vectorizer and TF-IDF transformer fitted on the training set so the
# new text is mapped into the same feature space the classifier was trained on.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# predicted holds class ids; translate each one back to its newsgroup name.
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))

'Im a starman waiting in the sun.' => rec.motorcycles


# Create a word recommender using bigrams of a corpus

In [13]:
def filter_stopwords(wordset, stop):
    """Return the tokens of ``wordset`` that do not appear in ``stop``.

    Matching is exact and case-sensitive: with a lower-case stop list,
    ``'The'`` is kept while ``'the'`` is dropped.

    Parameters
    ----------
    wordset : iterable of hashable
        Tokens to filter, e.g. the words of a corpus.
    stop : iterable of hashable
        Tokens to exclude. Any iterable is accepted, including a one-shot
        iterator.

    Returns
    -------
    list
        The surviving tokens in their original order, duplicates included.
    """
    # Build the lookup once. Testing membership against a list rescans it for
    # every token, which is needlessly slow on corpus-sized input, and a
    # one-shot iterator would be consumed by the first few membership tests.
    stop_lookup = stop if isinstance(stop, (set, frozenset)) else frozenset(stop)
    return [word for word in wordset if word not in stop_lookup]

In [26]:
from nltk.corpus import stopwords

# Fetch the stopword lists only when they are not installed yet, so this cell
# also runs on a machine where the NLTK data has never been downloaded
# (otherwise stopwords.words() raises LookupError).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# English stopwords plus the punctuation tokens that should be dropped as well.
stop = stopwords.words('english')
stop += [',', '.', '--']

### Generate bigrams and a frequency distribution for each of the words used in the corpus

In [27]:
from nltk.corpus import brown

# Fetch the Brown corpus only when it is not installed yet, so this cell also
# runs on a machine where the NLTK data has never been downloaded
# (otherwise brown.words() raises LookupError).
try:
    nltk.data.find('corpora/brown')
except LookupError:
    nltk.download('brown')

# For each word, count which word comes next. Stopwords are removed *before*
# pairing, so a "next" word is the next kept token and is not necessarily
# adjacent to it in the original text.
cfreq = nltk.ConditionalFreqDist(
    nltk.bigrams(filter_stopwords(brown.words(), stop))
)

### Generate a probability distribution for this data

In [28]:
# Turn each word's follower counts in cfreq into a probability distribution,
# using NLTK's maximum-likelihood estimator (MLEProbDist).
cprob = nltk.ConditionalProbDist(cfreq, nltk.MLEProbDist)

In [29]:
# Sanity check: the candidate next words recorded for the condition 'cat'.
cprob['cat'].samples()

dict_keys(['dusty', 'monkey', 'gave', 'kittens', '(', 'dominant', 'bag', 'leaped', 'came', 'suddenly', 'even', 'confronted', 'But', 'cat', 'rubbing', 'second', 'thin', 'rather', 'guard', 'somehow'])

In [30]:
import tensorflow