In [339]:
# Standard library
import os
import random
import itertools
from pathlib import Path
# from collections import Counter
import unicodedata
import string
import copy

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# GENSIM
from gensim.models.phrases import Phrases, Phraser
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora
from gensim.matutils import corpus2csc

### Load Data

 <div class="alert alert-block alert-info">
    <b>Note:</b> Change here to read/load data from S3 instead.
</div>

In [174]:
class MyCorpus:
    """ Corpus generator."""
    def __init__(self, filenames):
        self.filenames = filenames
            
    def __iter__(self):
        for file in self.filenames:
            p = Path('txt-human') / file
            doc = p.read_text()
            yield doc

In [308]:
# Choose 10 random files
filenames = random.choices(os.listdir('txt-human'), k=10)
print(filenames)

# Create corpus iterator
corpus = MyCorpus(filenames)

['20003343_human.txt', '20006021_human.txt', '20008375_human.txt', '20013334_human.txt', '20001009_human.txt', '20013594_human.txt', '20009076_human.txt', '20000888_human.txt', '20000996_human.txt', '20002641_human.txt']


### Basic preprocessing

In [309]:
STOPWORDS = set(stopwords.words('english'))

In [310]:
# Helper functions

def is_stop(token):
    """ Check if token is a stopwords. """
    return token.lower() in STOPWORDS

def is_punct(token):
    """ Check if token is a punctation."""
    return all(
            unicodedata.category(char).startswith('P') for char in token
        )

def weird_token(token):
    """ Check if token is a weird word that starts with "-end" or "(figure". """
    return token.startswith(tuple(punct for punct in string.punctuation))

def normalize_documents(corpus):
    """ Tokenize, lemmatize, and return only tokens with length > 2. """
    tokenizer = RegexpTokenizer('\w+')
    lemmatizer = WordNetLemmatizer()
    
    for doc in corpus:
        tokenized_doc = tokenizer.tokenize(doc)
        normalized_doc = [
            lemmatizer.lemmatize(token.lower()) 
            for token in tokenized_doc
                if not is_punct(token) 
                    and not is_stop(token) 
                    and not weird_token(token)
                    and (len(token) > 2)
        ]
        yield normalized_doc

In [344]:
# Create two iterators coz used twice
docs_normalized_1, docs_normalized_2 = itertools.tee(normalize_documents(corpus))

### Bigrams and trigrams

In [345]:
def train_ngrams(docs_normalized):
    """ Train bigram and trigram language models. """
    bigrams = Phrases(docs_normalized, min_count=3, threshold=1, delimiter=b' ')
    bigram_model = Phraser(bigrams)

    trigrams = Phrases(bigrams[docs_normalized], min_count=2, threshold=1, delimiter=b' ')
    trigram_model = Phraser(trigrams)
    
    return bigram_model, trigram_model



def create_ngrams(docs_normalized, bigram_model, trigram_model):
    """ Create ngram dataset using trained ngram language models. 
    Return list coz needed needed twice.
    """
    documents = list()
    for doc in docs_normalized:
        doc_bigrams = bigram_model[list(doc)]
        doc_ngram = trigram_model[doc_bigrams]
        documents.append(doc_ngram)
        
    return documents

In [346]:
bigram_model, trigram_model = train_ngrams(docs_normalized_1)
docs_ngram = create_ngrams(docs_normalized_2, bigram_model, trigram_model)

# Should be size of your corpus
len(docs_ngram)

10

### Doc2vec

In [348]:
def tag_documents(docs_ngram):
    """ Create TaggedDocument object required by gensim.Doc2Vec """
    tagged_docs = list()
    for i, doc in enumerate(docs_ngram):
        tagged_document = TaggedDocument(doc, [i])
        tagged_docs.append(tagged_document)
    return tagged_docs
        
    
# tagged documents
documents = tag_documents(docs_ngram)

# create doc2vec
model = Doc2Vec(
    documents,
    vector_size=300, 
    window=4, 
    min_count=2, 
    workers=8, 
    dbow_words=1, 
    dm=0, 
    dm_concat=1, 
    max_epochs=30
)

print('Done!')

Done!


In [239]:
# model.save('doc2vec.gensim')

### Anchor terms

In [349]:
ANCHOR_DICT = {'Important Minerals' : [ 

        # associated with Au deposits
        ['pyrrhotite', 'pyrite', 'chalcopyrite', 'sphalerite', 'galena', 'arsenopyrite', 'molybdenite'],
        # associated with CuAu deposits
        ['sphalerite', 'galena', 'arsenopyrite', 'molybdenite', 'marcasite', 'fluorite', 'bornite', 
        'chalcocite', 'covellite'],
        # associated with hydrothermal alteration 
        ['biotite', 'albite', 'chlorite', 'sericite', 'epidote', 'actinolite', 'calcite', 'ankerite',
        'plagioclase', 'hornblende', 'muscovite'],
        # asssociated with metamorphism
        ['corundum', 'diopside', 'topaz', 'forsterite', 'gypsum'],
        # toxic or cause problems with processing
        ['millerite', 'cobaltite', 'barite', 'cassiterite','gersdorffite', 'hessite', 'cinnabar', 
        'tennantite', 'acanthite', 'orpiment']
    ]
}

In [350]:
# list of lists
anchor_terms = ANCHOR_DICT['Important Minerals']

In [351]:
keyed_vecs = model.wv
print(keyed_vecs)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x000002D71D2AAD68>


**Expanding anchor terms:**

In [352]:
expanded_anchor_terms = []
expand_anchors = 0.25

for sublist in anchor_terms:
    expand_list = []
    for item in sublist:
        try:
            candidates = keyed_vecs.most_similar_cosmul(item)
        except KeyError:
            continue
        for candidate in candidates:
            if candidate[1] >= 0.75:
                expand_list.append(candidate[0])
        expanded_sublist = list(set(expand_list + sublist))
        expanded_anchor_terms.append(expanded_sublist)

### Build TF matrix

In [353]:
id2word = corpora.Dictionary(docs_ngram)
print(id2word)

Dictionary(4176 unique tokens: ['accessible', 'activation laboratory', 'activity', 'actually', 'adam']...)


In [354]:
# If using small corpus, no need to filter
# id2word.filter_extremes(no_below=1, no_above=0.30, keep_n=35000, keep_tokens=None)

Peek at first 10 words in vocabulary:

In [355]:
i = 0
for term in id2word.items():
    print(term)
    i += 1
    if i == 10: break

(0, 'accessible')
(1, 'activation laboratory')
(2, 'activity')
(3, 'actually')
(4, 'adam')
(5, 'additional')
(6, 'adjacent')
(7, 'adjoining')
(8, 'agreement')
(9, 'aid')


Create new corpus object (replaces above):

In [358]:
corpus = [id2word.doc2bow(doc) for doc in docs_ngram]
vocab = [term[1] for term in id2word.items()]
len(corpus), len(vocab)

(10, 4176)

In [359]:
corpus[0][:10]

[(0, 1),
 (1, 4),
 (2, 2),
 (3, 2),
 (4, 1),
 (5, 4),
 (6, 1),
 (7, 1),
 (8, 3),
 (9, 2)]

In [360]:
tf_matrix = corpus2csc(corpus, num_terms=len(vocab)).T   # <----- transposed
tf_matrix

<10x4176 sparse matrix of type '<class 'numpy.float64'>'
	with 8449 stored elements in Compressed Sparse Row format>

Create binary matrix (required by corex):

In [361]:
binary_matrix = tf_matrix.astype('bool')
binary_matrix = binary_matrix.astype('int')

### CorEx model

In [362]:
from corextopic import corextopic as ct



In [363]:
len(anchor_terms)

5

In [364]:
ct_model = ct.Corex(n_hidden=len(anchor_terms), max_iter=50, seed=42, verbose=True)

corex, rep size: 5


 <div class="alert alert-block alert-info">
    <b>Note:</b> It's okay if several anchor terms not found coz a small corpus was chosen.
</div>

In [365]:
ct_model.fit(binary_matrix, words=vocab, anchors=anchor_terms)

word counts [2. 3. 9. ... 1. 1. 1.]
[80.933 77.345 49.584 76.506 84.966]
[205.413 158.487 189.51  191.828 155.633]
[202.999 158.487 183.312 187.188  56.394]
[205.458 158.756 190.173 191.85  155.724]
[203.388 158.756  65.439 191.302 148.862]
[202.999 158.756  98.102 186.64   89.057]
[205.458 158.756 190.173 191.85  155.724]
[202.999 158.487 183.312 187.188  56.394]
[205.458 158.756 190.173 191.85  155.724]
[202.999 158.487 183.312 187.188  56.394]
[202.999 158.756 183.312 187.188  56.394]
[205.458 158.756 190.173 191.85  155.724]
[205.458 158.756 190.173 191.85  155.723]
[205.458 158.756 190.173 191.85  155.723]
[205.458 158.756 190.173 191.85  155.723]
[205.457 158.756 190.173 191.85  155.723]
[202.998 158.487  98.102 186.639  89.057]
[202.997 158.756 183.312 187.187  56.394]
[202.997 158.756 183.311 187.187  56.394]
[205.456 158.756 190.172 191.848 155.723]
[202.996 158.487 183.311 187.186  56.394]
[205.454 158.756 190.172 191.846 155.723]
[205.452 158.756 190.171 191.845 155.72 ]
[20

<corextopic.corextopic.Corex at 0x2d71c86f8d0>

In [366]:
ct_model.get_topics(n_words=100)[0][:10]

[('employed', 0.9115924128610783),
 ('smaller', 0.5544965692873933),
 ('suite', 0.5544965692873933),
 ('detailed', -0.5544965692873932),
 ('design', -0.5544965692873932),
 ('recommend', -0.5544965692873932),
 ('lithological', 0.5067720616814062),
 ('aware', 0.5067720616814062),
 ('extensive', 0.5067720616814062),
 ('black', 0.5067720616814062)]

In [367]:
topics = ct_model.get_topics()
for topic_n, topic in enumerate(topics):
    words, mis = zip(*topic)
    topic_str = str(topic_n+1) + ': ' + ', '.join(words)
    print(topic_str)

1: employed, smaller, suite, detailed, design, recommend, lithological, aware, extensive, black
2: sheet, obtain, northing, although, page, relative, specific, technique, september, background
3: either, sample taken, possible, designed, shipping, gold analysis, overall, geochemical, ont, exposed
4: required, offer, analyzed, top, determined, swamp, per, around, geophysical survey, nominal
5: identify, throughout, rock type, outcrop, statistical, essentially, secondary, drive, comparison, cultural


In [368]:
top_docs = ct_model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    docs,probs = zip(*topic_docs)

    topic_str = str(topic_n+1.0)+ ': ' + str(docs)
    print(topic_str)

NOTE: 'docs' not provided to CorEx. Returning top docs as lists of row indices
1.0: (2, 4, 8, 9, 7, 0, 3, 5, 1, 6)
2.0: (0, 3, 4, 5, 7, 2, 9, 6, 8, 1)
3.0: (0, 6, 7, 9, 8, 1, 2, 5, 4, 3)
4.0: (1, 2, 4, 7, 9, 0, 8, 6, 5, 3)
5.0: (1, 2, 3, 5, 7, 9, 8, 0, 6, 4)


In [370]:
# from corextopic import vis_topic as vt
# vt.vis_rep(ct_model, column_label=vocab, prefix='topic-model-example')

Print topics in text file


In [372]:
ct_model.clusters

array([0, 4, 3, ..., 2, 0, 2], dtype=int64)

In [373]:
ct_model.p_y_given_x.shape

(10, 5)

In [374]:
ct_model.p_y_given_x # no. of docs x no. of topics

array([[0., 1., 1., 0., 0.],
       [0., 0., 0., 1., 1.],
       [1., 0., 0., 1., 1.],
       [0., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 1., 1., 1., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 1., 0., 1.]])

In [375]:
ct_model.labels

array([[False,  True,  True, False, False],
       [False, False, False,  True,  True],
       [ True, False, False,  True,  True],
       [False,  True, False, False,  True],
       [ True,  True, False,  True, False],
       [False,  True, False, False,  True],
       [False, False,  True, False, False],
       [False,  True,  True,  True,  True],
       [ True, False, False, False, False],
       [ True, False,  True, False,  True]])

In [376]:
ct_model.log_z

array([[ 163.083, -145.629,   96.634,  155.554,   83.416],
       [ 211.735,  225.753,  162.916,  -99.729,   20.496],
       [  84.792,  175.644,  166.231,   30.477,   41.063],
       [ 191.295,  465.908,  217.255,  197.598,   92.533],
       [ 401.258,  159.631,  177.851,  485.727,  112.847],
       [ 213.473,  272.235,  199.782,  200.208,   24.286],
       [ 218.777,  165.382, -140.482,  185.087,   92.759],
       [ 168.325, -115.477,  344.515,  337.864,    9.922],
       [-155.675,  217.818,  143.034,  181.091,   30.503],
       [ 485.397,  166.191,  458.798,  153.964,   54.054]])

# Gensim

In [124]:
from gensim import corpora
from gensim import models

In [127]:
dictionary = corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

In [123]:
print(dictionary)

Dictionary(2022 unique tokens: ['(the', 'accepts', 'accessible', 'area', 'bay']...)


In [128]:
tfidf = models.TfidfModel(bow_corpus)

In [140]:
corpus_tfidf = tfidf[bow_corpus]

In [141]:
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
corpus_lsi = lsi_model[corpus_tfidf]
lsi_model.print_topics(2)

[(0,
  '0.233*"field" + 0.175*"survey" + 0.151*"precession" + 0.146*"prospect" + 0.144*"snowdon" + 0.124*"township," + 0.106*"larder" + 0.103*"em" + 0.100*"services" + 0.097*"signal"'),
 (1,
  '-0.240*"gold" + -0.167*"jackfish" + -0.139*"g/t" + -0.137*"prospected" + -0.107*"excavation" + -0.102*"twp." + -0.100*"patent" + 0.094*"precession" + 0.093*"field" + 0.092*"snowdon"')]

In [49]:
# ct_model.__dict__

In [40]:
# for token in doc:
#     print(token, '--->', token.pos_)

In [51]:
keywords = doc._.textrank
{key:keywords[key] for key in list(keywords.keys())[:10]}

{'gabbro': 3.58274470901244,
 'sulphide': 3.46089932485539,
 'lake': 3.4120833049871466,
 'property': 3.285412386830057,
 'diabase': 3.1680234315156883,
 'sample': 3.1069113027312683,
 'pardee': 2.8982177155859388,
 'zone': 2.837039679539933,
 'disseminate': 2.581838230299123,
 'crystal': 2.549048435533453}

In [28]:
# displacy.render(doc, style='ent', jupyter=True)

***

In [51]:
import nltk
import string

def tokenize(text):
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        if token in string.punctuation: continue
        yield stem.stem(token)

In [52]:
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
corpus = tfidf.fit_transform(corpus)

print(corpus)

# LDA Tutorial

In [64]:
corpus = [
    'I like bananas',
    'Frogs and fish live in ponds',
    'Kittens and puppies are fluffy',
    'I had a spinach and apple smoothie',
    'My kitten loves kale'
]

In [68]:
docs = nlp.pipe(corpus)
corpus = tokenize_documents(docs)
corpus

[['like', 'banana'],
 ['frog', 'fish', 'live', 'pond'],
 ['kitten', 'puppy', 'fluffy'],
 ['spinach', 'apple', 'smoothie'],
 ['kitten', 'love', 'kale']]

In [80]:
bigram = Phrases(docs, min_count=1)
for idx in range(len(corpus)):
    for token in bigram[corpus[idx]]:
        if '_' in token:
            corpus[idx].append(token)

In [82]:
corpus

[['like', 'banana'],
 ['frog', 'fish', 'live', 'pond'],
 ['kitten', 'puppy', 'fluffy'],
 ['spinach', 'apple', 'smoothie'],
 ['kitten', 'love', 'kale']]

In [77]:
bigram[corpus[4]]

['kitten', 'love', 'kale']

In [86]:
from gensim.corpora import Dictionary
dictionary = Dictionary(corpus)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x2312fdda048>

In [87]:
print(dictionary)

Dictionary(14 unique tokens: ['banana', 'like', 'fish', 'frog', 'live']...)


In [88]:
corpus = [dictionary.doc2bow(doc) for doc in corpus]
corpus

[[(0, 1), (1, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(7, 1), (12, 1), (13, 1)]]

In [105]:
dictionary.token2id

{'banana': 0,
 'like': 1,
 'fish': 2,
 'frog': 3,
 'live': 4,
 'pond': 5,
 'fluffy': 6,
 'kitten': 7,
 'puppy': 8,
 'apple': 9,
 'smoothie': 10,
 'spinach': 11,
 'kale': 12,
 'love': 13}

In [109]:
id2word = {v:k for k, v in dictionary.token2id.items()}

In [110]:
from gensim.models import LdaModel

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    iterations=30,
    num_topics=2
)

In [111]:
model.top_topics(corpus)

[([(0.10749391, 'pond'),
   (0.106098294, 'frog'),
   (0.10473225, 'fish'),
   (0.10241872, 'live'),
   (0.076010086, 'like'),
   (0.0716033, 'banana'),
   (0.06602298, 'fluffy'),
   (0.061876055, 'kitten'),
   (0.05738665, 'puppy'),
   (0.05699933, 'spinach'),
   (0.051828057, 'apple'),
   (0.04708853, 'kale'),
   (0.046053573, 'smoothie'),
   (0.0443882, 'love')],
  -21.492061795003995),
 ([(0.13846493, 'kitten'),
   (0.08966731, 'love'),
   (0.08826458, 'smoothie'),
   (0.08739277, 'kale'),
   (0.08340056, 'apple'),
   (0.07904477, 'spinach'),
   (0.07871855, 'puppy'),
   (0.071444064, 'fluffy'),
   (0.06674365, 'banana'),
   (0.06303176, 'like'),
   (0.040787507, 'live'),
   (0.03883874, 'fish'),
   (0.037688162, 'frog'),
   (0.036512636, 'pond')],
  -21.54538080889322)]

In [112]:
import pandas as pd
import pickle

data = pd.read_pickle('dtm_stop.pkl')
data

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo,éclair
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,1,1,1,0,0
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,2,1,0,1,0,0,0,0,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,2,0,0,0,0,0,0


In [113]:
from gensim import matutils, models
import scipy.sparse

In [115]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,ali,anthony,bill,bo,dave,hasan,jim,joe,john,louis,mike,ricky
aaaaah,0,0,1,0,0,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,0,0,0,1,0,0,0,0,0,0,0


In [116]:
sparse_counts = scipy.sparse.csr_matrix(tdm)
sparse_counts

<7468x12 sparse matrix of type '<class 'numpy.int64'>'
	with 16367 stored elements in Compressed Sparse Row format>

In [117]:
corpus = matutils.Sparse2Corpus(sparse_counts)
corpus

<gensim.matutils.Sparse2Corpus at 0x23151579ac8>

In [118]:
cv = pickle.load(open('cv_stop.pkl', 'rb'))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())



In [119]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=20)
lda.print_topics()

[(0,
  '0.009*"fucking" + 0.006*"shit" + 0.005*"didnt" + 0.005*"went" + 0.005*"fuck" + 0.005*"going" + 0.005*"hes" + 0.005*"say" + 0.005*"want" + 0.005*"theyre"'),
 (1,
  '0.006*"say" + 0.005*"fuck" + 0.005*"fucking" + 0.005*"life" + 0.005*"theyre" + 0.004*"shit" + 0.004*"love" + 0.004*"good" + 0.004*"dad" + 0.004*"going"')]

In [121]:
# lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=20)
# lda.print_topics()

In [122]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    is_noun = lambda pos: pos[:2] == "NN"
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [123]:
data_clean = pd.read_pickle('data_clean.pkl')
data_clean.head()

Unnamed: 0,transcript
ali,ladies and gentlemen please welcome to the sta...
anthony,thank you thank you thank you san francisco th...
bill,all right thank you thank you very much thank...
bo,bo what old macdonald had a farm e i e i o and...
dave,this is dave he tells dirty jokes for a living...


In [124]:
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns.head()

Unnamed: 0,transcript
ali,ladies gentlemen stage ali hi thank hello na s...
anthony,thank thank people i em i francisco city world...
bill,thank thank pleasure georgia area oasis i june...
bo,macdonald farm e i o farm pig e i i snort macd...
dave,jokes living stare work profound train thought...


In [128]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aah,abc,abcs,ability,abortion,abortions,abuse,...,yummy,ze,zealand,zee,zeppelin,zillion,zombie,zombies,zoo,éclair
ali,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
anthony,0,0,0,0,0,0,0,2,0,0,...,0,0,10,0,0,0,0,0,0,0
bill,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,1,1,0,0
bo,1,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dave,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joe,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
louis,0,0,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
data_dtmn.transpose()

Unnamed: 0,ali,anthony,bill,bo,dave,hasan,jim,joe,john,louis,mike,ricky
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aah,0,0,0,0,0,0,0,0,0,3,0,0
abc,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
zillion,0,0,1,0,0,0,0,0,0,0,0,0
zombie,1,0,1,0,0,0,0,0,0,0,0,0
zombies,0,0,1,0,0,0,0,0,0,0,0,0
zoo,0,0,0,0,0,0,0,0,0,0,0,1


In [138]:
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [139]:
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

[(0,
  '0.011*"shit" + 0.009*"thing" + 0.009*"man" + 0.008*"life" + 0.008*"hes" + 0.008*"day" + 0.008*"fuck" + 0.007*"gon" + 0.006*"lot" + 0.006*"way"'),
 (1,
  '0.009*"day" + 0.007*"dad" + 0.007*"cause" + 0.007*"thing" + 0.006*"way" + 0.006*"shes" + 0.005*"life" + 0.005*"house" + 0.005*"night" + 0.005*"guy"')]

In [141]:
word1 = nlp('the')
word1

the

In [142]:
word2 = nlp('The')
word2

The

In [144]:
word1 == word2

False

In [145]:
nlp.pipe_names

['tagger', 'parser', 'ner']