In [183]:
docs = ['aa aa bb cc.',
        'cc dd ee ff.',
        'ff ff gg aa']

In [184]:
from sklearn.feature_extraction.text import CountVectorizer

count_model = CountVectorizer(ngram_range=(1,1)) # default unigram model
X = count_model.fit_transform(docs)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below)
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
Xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(Xc.todense()) # print out matrix in dense format

[[0 2 2 0 0 2 1]
 [2 0 1 0 0 0 0]
 [2 1 0 1 1 1 0]
 [0 0 1 0 1 1 0]
 [0 0 1 1 0 1 0]
 [2 0 1 1 1 0 2]
 [1 0 0 0 0 2 0]]


In [185]:
count_model.get_feature_names()

['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg']

In [186]:
count_model.vocabulary_

{'aa': 0, 'bb': 1, 'cc': 2, 'dd': 3, 'ee': 4, 'ff': 5, 'gg': 6}

In [187]:
# note this method of computing bigrams treats the document as a bag of words and does not
# preserve distance
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2)) # by saying 2,2 you are telling you only want pairs of 2 words

X = bigram_vectorizer.fit_transform(docs)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below)
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
Xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(Xc.todense()) # print out matrix in dense format

[[0 1 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 1 0 0 0]
 [0 0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 1 1 0]]


In [188]:
bigram_vectorizer.vocabulary_

{'aa aa': 0,
 'aa bb': 1,
 'bb cc': 2,
 'cc dd': 3,
 'dd ee': 4,
 'ee ff': 5,
 'ff ff': 6,
 'ff gg': 7,
 'gg aa': 8}

In [189]:
# note this method of computing bigrams treats the document as a bag of words and does not
# preserve distance
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2)) # by saying 2,2 you are telling you only want pairs of 2 words

X = bigram_vectorizer.fit_transform(docs)
# X[X > 0] = 1 # run this line if you don't want extra within-text cooccurence (see below)
Xc = (X.T * X) # this is co-occurrence matrix in sparse csr format
Xc.setdiag(0) # sometimes you want to fill same word cooccurence to 0
print(Xc.todense()) # print out matrix in dense format

[[0 1 1 0 0 0 0 0 0]
 [1 0 1 0 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 0]
 [0 0 0 0 1 1 0 0 0]
 [0 0 0 1 0 1 0 0 0]
 [0 0 0 1 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 0 1 0 1]
 [0 0 0 0 0 0 1 1 0]]


In [190]:
# finding only adjacent bigrams
# defualt pattern : '(?u)\\b\\w\\w+\\b'
# new patterns : 
# starting at first word all sucessive pairs: \w+(?:\W+)\w+
# starting at second word all sucsessive pairs: r'(?u)(?:\w+\W+)(\w+(?:\W+)\w+)'
adjacent_word_pattern = r'(?u)\w+(?:\W+)\w+' # starting from first 1st word
adjacent_word_pattern_offset_1 = r'(?u)(?:\w+\W+)(\w+(?:\W+)\w+)' # starting from first 2md word

A = CountVectorizer(token_pattern=adjacent_word_pattern)
A.fit_transform(docs)
print(A.vocabulary_)

B = CountVectorizer(token_pattern=adjacent_word_pattern_offset_1)
B.fit_transform(docs)
print(B.vocabulary_)

# combine the two sets
from sklearn.pipeline import FeatureUnion
adjacent_bigram_vectorize = FeatureUnion([('CountVectorizer', A),('CountVect', B)])
adjacent_bigram_vectorize.fit_transform(docs)

#print(adjacent_bigram_vectorize)

{'aa aa': 0, 'bb cc': 1, 'cc dd': 2, 'ee ff': 3, 'ff ff': 4, 'gg aa': 5}
{'aa bb': 0, 'dd ee': 1, 'ff gg': 2}


<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [191]:
bigram_vectorizer.vocabulary_

{'aa aa': 0,
 'aa bb': 1,
 'bb cc': 2,
 'cc dd': 3,
 'dd ee': 4,
 'ee ff': 5,
 'ff ff': 6,
 'ff gg': 7,
 'gg aa': 8}

In [192]:
count_model.vocabulary_

{'aa': 0, 'bb': 1, 'cc': 2, 'dd': 3, 'ee': 4, 'ff': 5, 'gg': 6}

In [193]:
#unique_words = list(count_model.vocabulary_.keys())
unique_words = count_model.get_feature_names()
unique_words

['aa', 'bb', 'cc', 'dd', 'ee', 'ff', 'gg']

In [194]:
word_2_index = dict(zip(unique_words,range(len(unique_words))))
word_2_index

{'aa': 0, 'bb': 1, 'cc': 2, 'dd': 3, 'ee': 4, 'ff': 5, 'gg': 6}

In [195]:
index_2_word = {v:k for k,v in word_2_index.items()}
index_2_word

{0: 'aa', 1: 'bb', 2: 'cc', 3: 'dd', 4: 'ee', 5: 'ff', 6: 'gg'}

In [196]:
# build empty co occurence matrix
n = len(unique_words)
co_mat = np.zeros(shape=(n,n))
co_mat

array([[0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.]])

In [197]:
# fill cooccurence matrix
for bigram,num_occurences_minus_1 in bigram_vectorizer.vocabulary_.items():
    
    # correct for counted number of occurences being 1 less than desired value
    num_occurences = num_occurences_minus_1 + 1
    
    # the bigram is tokens seperated by a space, num_occurences is the number of occurences of that bigram
    bigram_tokens = bigram.split(' ')

    # get the index of each token
    index_token_1 = word_2_index[bigram_tokens[0]]
    index_token_2 = word_2_index[bigram_tokens[1]]
    
    # lazy make symetric
    co_mat[index_token_1, index_token_2] += num_occurences
    co_mat[index_token_2, index_token_1] += num_occurences
    
    
co_mat
    

array([[ 2.,  2.,  0.,  0.,  0.,  0.,  9.],
       [ 2.,  0.,  3.,  0.,  0.,  0.,  0.],
       [ 0.,  3.,  0.,  4.,  0.,  0.,  0.],
       [ 0.,  0.,  4.,  0.,  5.,  0.,  0.],
       [ 0.,  0.,  0.,  5.,  0.,  6.,  0.],
       [ 0.,  0.,  0.,  0.,  6., 14.,  8.],
       [ 9.,  0.,  0.,  0.,  0.,  8.,  0.]])

In [198]:
import numpy as np
Y = bigram_vectorizer.fit_transform(docs)
sum_occ = np.sum(Y.todense(),axis=0)
#Yc = (Y.T * Y) # this is co-occurrence matrix in sparse csr format
#Yc.todense()
sum_occ

matrix([[1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)