# Datasets

### Newsgroups

In [134]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups corpus, shuffled with a fixed seed so the sample is
# reproducible, with headers, footers and quoted text removed from each post.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=2,
    remove=('headers', 'footers', 'quotes'),
)

# Work with the first 1000 posts only; `text` is a single post used by the
# tokenizing example.
text_samples = dataset.data[:1000]
text = text_samples[0]
text

u"Something about how Koresh had threatened to cause local \nproblems with all these wepaons he had and was alleged to\nhave.  \n\nSomeone else will post more details soon, I'm sure.\n\nOther News:\nSniper injures 9 outside MCA buildling in L.A.  Man arrested--suspect\nwas disgruntled employee of Universal Studios, which\nis a division of M.C.A.\n\n\nQUESTION:\nWhat will Californians do with all those guns after the Reginald\ndenny trial?"

### Reuters

In [1]:
# Load the Reuters sample data exposed by the `lda` package's datasets module.
import lda.datasets
# Passed to model.fit() in the lda example; presumably a document-term count matrix -- TODO confirm.
reuters_word_vector = lda.datasets.load_reuters()
# Presumably the vocabulary and document titles that go with that matrix -- TODO confirm.
reuters_vocab = lda.datasets.load_reuters_vocab()
reuters_titles = lda.datasets.load_reuters_titles()

# Tokenizing

In [4]:
import nltk

# Break the sample post into sentences, then each sentence into word tokens,
# collecting everything into one flat list.
tokens = []
for sentence in nltk.sent_tokenize(text):
    tokens.extend(nltk.word_tokenize(sentence))

# Stemming

In [5]:
import nltk
from nltk.stem.snowball import SnowballStemmer

# Reduce every token to its stem with the English Snowball stemmer.
stemmer = SnowballStemmer("english")
stems = list(map(stemmer.stem, tokens))

# Stop Words

In [82]:
# English stop-word list from NLTK; the word2vec example filters tokens against it.
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

# Vectorizing
### CountVectorizer

In [47]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: ignore terms found in more than 95% of the documents or
# in fewer than 2 documents, drop English stop words, keep at most 1000 terms.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000,
                                stop_words='english')
tf_matrix = tf_vectorizer.fit_transform(text_samples)

# vocabulary_ maps term -> column index. Sorting the terms by that index gives
# a real list whose position i names column i of tf_matrix (plain .keys() comes
# back in arbitrary dict order, and is only a view on Python 3).
tf_vocab = sorted(tf_vectorizer.vocabulary_, key=tf_vectorizer.vocabulary_.get)
tf_matrix

<1000x1000 sparse matrix of type '<type 'numpy.int64'>'
	with 23656 stored elements in Compressed Sparse Row format>

### TfidfVectorizer

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights: keep terms that appear in at least 10% and at most 80% of the
# documents, capped at 20000 terms.
# Other options worth trying: use_idf=True, tokenizer=nltk.word_tokenize, ngram_range=(1,3)
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=20000,
                                   min_df=0.1)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_samples)  # fit on the newsgroup posts

# vocabulary_ maps term -> column index. Sorting the terms by that index gives
# a real list whose position i names column i of tfidf_matrix (plain .keys()
# comes back in arbitrary dict order, and is only a view on Python 3).
tfidf_vocab = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)
tfidf_matrix

<1000x108 sparse matrix of type '<type 'numpy.float64'>'
	with 25613 stored elements in Compressed Sparse Row format>

# Topic Modeling
## LDA: Latent Dirichlet Allocation

[Lecture ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/01-LDA/LDA.ipynb)

### sklearn

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Number of topics to extract (the same value the lda-package example in this
# notebook uses).
n_topics = 20

# NOTE: `n_topics` is the keyword in the scikit-learn release this notebook was
# written against; later releases renamed it to `n_components`.
# The estimator gets its own name so it does not shadow the `lda` package,
# which this notebook imports as a module.
sklearn_lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)
# LDA models raw term counts, so fit it on the CountVectorizer output.
sklearn_lda.fit(tf_matrix)

### lda

In [2]:
import lda

# Fit a 20-topic LDA model from the `lda` package on the Reuters matrix,
# running 1500 iterations with a fixed seed.
model = lda.LDA(
    n_topics=20,
    n_iter=1500,
    random_state=1,
)
# model.fit_transform(X) is also available
model.fit(reuters_word_vector)

<lda.lda.LDA instance at 0x103894050>

## Vectorization
### Gensim

word2vec with gensim
[lecture ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/02-word2vec/WORD2VEC_GENSIM.ipynb)

In [102]:
import gensim

# The type of input that Word2Vec is looking for: a list of token lists.
# Each document is lower-cased, split on whitespace and stripped of stop words.
texts = []
for document in text_samples:
    kept_tokens = [token for token in document.lower().split()
                   if token not in stoplist]
    texts.append(kept_tokens)

# 100-dimensional vectors, context window of 5, every word kept (min_count=1),
# 4 worker threads, sg=1.
model = gensim.models.Word2Vec(
    texts,
    size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

In [133]:
# Some useful attributes.
# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas bare `print x` statements are a SyntaxError on Python 3.

# Vocabulary entry (count, index, ...) for a few words.
for word in (u'koresh', u'cult', u'waco', u'motherboard'):
    print(model.vocab[word])

print(model.most_similar('usenet', topn=4))  # the 4 most similar words
print(model.similarity('woman', 'man'))  # similarity between two words
print(model.n_similarity(['woman', 'girl'], ['man', 'boy']))  # similarity between two word sets
print(model.doesnt_match("waco koresh cult motherboard".split()))  # this doesn't work very well

Vocab(count:11, index:1408, sample_int:4294967296)
Vocab(count:6, index:2670, sample_int:4294967296)
Vocab(count:5, index:3018, sample_int:4294967296)
Vocab(count:9, index:1696, sample_int:4294967296)
[(u'engine', 0.9990893006324768), (u'explained', 0.9990804195404053), (u'property', 0.9990782141685486), (u'swap', 0.9990752935409546)]
0.997880684459
0.998612564475
koresh


# Dimensionality Reduction
Lecture about PCA and SVD [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week07-fletcher1/04-dim_reduct/pca_svd.ipynb)  
## PCA

In [72]:
from sklearn.decomposition import PCA

# The sparse TF-IDF matrix is converted to a dense array and then projected
# onto 20 principal components.
pca = PCA(n_components=20)
dense_tfidf = tfidf_matrix.toarray()
X = pca.fit_transform(dense_tfidf)

## SVD

In [76]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 20 components with a seeded truncated SVD; the
# sparse matrix is passed in directly.
svd = TruncatedSVD(
    n_components=20,
    random_state=42,
)
X = svd.fit_transform(tfidf_matrix)

# Clustering
## KMeans

In [78]:
# Optional preprocessing: standardise the features first, e.g.
#   from sklearn.preprocessing import scale
#   X = scale(tfidf_matrix.toarray())

from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so pin the seed (like the
# other estimators in this notebook) to get the same clusters on every run.
model = KMeans(random_state=42).fit(X)
# Cluster label for every row of X.
clusters = model.predict(X)

## Hierarchical clustering
Lecture [PDF](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/03-more_clustering/Other_Clustering_Algorithms.pdf)  

Hierarchical clustering requires two steps
1. Compute distances
2. Compute linkage

Once these are applied, you can plot a dendrogram.

### AgglomerativeClustering
This function can do it all

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage on Euclidean distances, run on the dense reduced matrix X from
# the dimensionality-reduction section. n_clusters=8 mirrors the default
# cluster count that the KMeans example relies on.
# NOTE: newer scikit-learn releases rename `affinity` to `metric`.
agglomerative = AgglomerativeClustering(n_clusters=8, linkage='ward', affinity='euclidean')
# Cluster label for every row of X.
agglomerative_clusters = agglomerative.fit_predict(X)

## Distance

In [58]:
# Pairwise cosine distances between the documents' TF-IDF rows.
# NOTE(review): this should be a square n x n matrix rather than the condensed
# 1-D form scipy's linkage functions expect -- confirm before passing it on.
from sklearn.metrics.pairwise import cosine_distances
dist = cosine_distances(tfidf_matrix)

In [39]:
# Pairwise Euclidean distances between the documents' TF-IDF rows.
# NOTE(review): this should be a square n x n matrix rather than the condensed
# 1-D form scipy's linkage functions expect -- confirm before passing it on.
from sklearn.metrics.pairwise import euclidean_distances
dist = euclidean_distances(tfidf_matrix)

In [49]:
# Pairwise distances computed with scipy on the densified TF-IDF matrix.
# NOTE(review): pdist should return the condensed (1-D) distance vector;
# squareform is imported but not used in this cell.
from scipy.spatial.distance import pdist, squareform
dist = pdist(tfidf_matrix.toarray(), metric='jaccard') # other metrics to try: cosine, euclidean, jaccard, cityblock

## Linkage

In [54]:
# Average-linkage clustering of the distances held in `dist`.
# Note: this crashes when `dist` contains NaNs, which is the case for the
# Jaccard distances computed in the Distance section.
from scipy.cluster.hierarchy import average  # also available: ward, single, complete
linkage_matrix = average(dist)

In [59]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

# linkage() reads a 1-D input as condensed pairwise distances but a 2-D input
# as raw observation vectors. A square distance matrix (as produced by the
# sklearn pairwise functions) must therefore be collapsed to condensed form
# first, otherwise its rows would be clustered as if they were feature vectors.
# checks=False skips the symmetry / zero-diagonal validation, which tiny
# floating-point noise could otherwise trip.
dist_condensed = dist if dist.ndim == 1 else squareform(dist, checks=False)

# NOTE(review): scipy documents 'ward' as well-defined only for Euclidean
# distances -- prefer 'average' or 'complete' for cosine/Jaccard input.
linkage_matrix = linkage(dist_condensed, 'ward')  # ward, single, complete, average

## Dendrogram

In [None]:
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right");

plt.tick_params(\
    axis= 'x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom='off',      # ticks along the bottom edge are off
    top='off',         # ticks along the top edge are off
    labelbottom='off')

plt.tight_layout() #show plot with tight layout

#uncomment below to save figure
plt.savefig('ward_clusters.png', dpi=200) #save figure as ward_clusters

### Choosing clusters
You can also cut the hierarchy into flat clusters (for example with `scipy.cluster.hierarchy.fcluster`), but I haven't researched the methods yet.

# Resources
Lecture about PCA and SVD [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week07-fletcher1/04-dim_reduct/pca_svd.ipynb)  
Solution to pair programming exercise about yelp reviews [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week07-fletcher1/05-unsup_kmeans/pair.ipynb)  
NLP lecture: [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week07-fletcher1/03-nlp/NLP_nltk.ipynb)  
Topic modeling lecture [PDF](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/01-LDA/Topic_Modeling.pdf)  
LDA lecture: [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/01-LDA/LDA.ipynb)  
Solution to name/gender pair programming exercise [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week07-fletcher1/03-nlp/NLP_Notebook.ipynb)  
Word2vec with gensim [ipynb](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/02-word2vec/WORD2VEC_GENSIM.ipynb)  
Other clustering [PDF](https://github.com/thisismetis/nyc16_ds8/blob/master/class_lectures/week08-fletcher2/03-more_clustering/Other_Clustering_Algorithms.pdf)  