In [15]:
# Install the third-party packages this notebook imports, plus spaCy's English model data.
!pip install spacy
!python -m spacy.en.download
!pip install autocorrect
!pip install seaborn
# NOTE(review): Jinja2 is never imported directly in this notebook; presumably it is
# required by pandas' DataFrame.style used in later cells -- confirm.
!pip install Jinja2

In [8]:

import warnings
import spacy
import autocorrect

from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns
# Light-green colormap object (as_cmap=True returns a colormap rather than a list of colours).
green_cm = sns.light_palette("green", as_cmap=True)
# Build a colormap named 'diverge' from seaborn's "coolwarm" palette colours;
# it is used later to shade the regression coefficient table.
diverging_colors = sns.color_palette("coolwarm")
diverging_cm = LinearSegmentedColormap.from_list('diverge', diverging_colors)


# Silence all warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')




def highlight_pos(val):
    """
    Pick a CSS background colour for a single table cell.

    Parameters
    ----------
    val: number
        The cell value being styled.

    Returns
    ----------
    str
        'background-color: white' when val is below 1,
        'background-color: yellow' otherwise.
    """
    if val < 1:
        shade = 'white'
    else:
        shade = 'yellow'
    return 'background-color: {}'.format(shade)

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of three short sentences; note that 'playedd' is misspelled.
documents = [
    'He playedd baseball',
    'He plays football',
    'He had a sandwich'
]

# Build a document-term matrix straight from the raw text, keeping the original casing.
toDTM = CountVectorizer(lowercase=False)
DTM = toDTM.fit_transform(documents).todense()
words = toDTM.get_feature_names()

# One row per document, one column per term; cells with a count of 1 or more are highlighted.
summary = pd.DataFrame(DTM, columns = words, index = documents)
summary.style.applymap(highlight_pos)

Unnamed: 0,He,baseball,football,had,playedd,plays,sandwich
He playedd baseball,1,1,0,0,1,0,0
He plays football,1,0,1,0,0,1,0
He had a sandwich,1,0,0,1,0,0,1


### Reducing the number of columns in a Document-Term Matrix:
* Normalization and lemmatization
* Spelling correction
* Setting vocabulary size thresholds

In [10]:
#create the processing function
def process(document):
    """
    Replace every token of a document with its lemma.

    Parameters
    ----------
    document: str
        The document we want to process

    Returns
    ----------
    str
        The lemmas of all tokens in the document, joined by single spaces
    """
    # Run the text through the module-level `nlp` pipeline. parse=False and
    # entity=False are passed through to that call (presumably to skip
    # dependency parsing and entity recognition -- confirm for the installed spaCy).
    parsed = nlp(unicode(document), parse=False, entity=False)

    # Collect one lemma per token and glue them back into a single string.
    lemmas = [tok.lemma_ for tok in parsed]
    return " ".join(lemmas)


#Create the Custom tokenizer
class SpellTokenizer(object):
    """
    Callable that tokenizes text, spell-corrects each token and returns a
    spaCy Doc built from the corrected words.

    Parameters
    ----------
    nlp:
        The spaCy language object whose vocabulary and tokenizer are used.
    """

    def __init__(self, nlp):
        self.vocab = nlp.vocab
        # Keep a handle on the tokenizer of the pipeline we were given, so that
        # __call__ does not depend on a module-level variable named `nlp`.
        self.tokenizer = nlp.tokenizer

    def __call__(self, text):
        """
        Parameters
        ----------
        text: str
            Raw text to tokenize

        Returns
        ----------
        spacy.tokens.Doc
            Doc whose words are the spell-corrected tokens of `text`
        """
        tokens = self.tokenizer(unicode(text))
        # autocorrect.spell is applied to the verbatim text (orth_) of each token
        words = [autocorrect.spell(token.orth_) for token in tokens]
        return spacy.tokens.Doc(self.vocab, words = words)

#create a language model that uses the custom tokenizer
# `nlp` has to live at module level: process() looks it up by name when it is called
nlp = spacy.load('en')
nlp.make_doc = SpellTokenizer(nlp)        
    
#pass the process function to sklearn's vectorizer as its preprocessor
toDTM = CountVectorizer(preprocessor=process 
                        , min_df = 0. #minimum document frequency for a token; 0. presumably keeps every token -- confirm
                       )

# Rebuild the document-term matrix; this overwrites toDTM, DTM, words and summary from the earlier cell.
DTM = toDTM.fit_transform(documents).todense()
words = toDTM.get_feature_names()

summary = pd.DataFrame(DTM, columns = words, index = documents)
summary.style.applymap(highlight_pos)

Unnamed: 0,baseball,football,have,he,play,sandwich
He playedd baseball,1,0,0,1,1,0
He plays football,0,1,0,1,1,0
He had a sandwich,0,0,1,1,0,1


### Retrieving Documents in a DTM

0) Encode documents as a DTM

1) Encode the query

2) Compute the similarity (or distance) between the query vector and each row of the DTM

3) Pick the argmax (for similarities) or the argmin (for distances)

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def getMostSimilarSentence(query, documents):
    """
    Return the document that is closest to the query by cosine similarity.

    A CountVectorizer that uses `process` as its preprocessor is fitted on
    `documents`; the query is encoded with that same vectorizer and compared
    against every document vector.

    Parameters
    ----------
    query:     (string) text to look up
    documents: (indexable collection of strings) candidate documents,
               also the corpus the vectorizer is fitted on

    Returns
    -------
    most similar document (string)
    """
    # fit the vectorizer on the corpus and keep the dense document-term matrix
    fitted = CountVectorizer(preprocessor=process)
    doc_term_matrix = fitted.fit_transform(documents).todense()

    # encode the query and score it against every document
    scores = computeSimilarities(encodeQuery(query, fitted), doc_term_matrix)

    # hand back the document sitting at the best-scoring position
    return documents[getMostSimilarIdx(scores)]

def encodeQuery(query, vectorizer):
    """
    Encode a single query with an already-fitted vectorizer.

    Parameters
    ----------
    query:      the query text
    vectorizer: object exposing transform(list_of_documents)

    Returns
    -------
    The dense form (via .todense()) of the vectorizer's output for [query]
    """
    # the vectorizer expects a collection of documents, so wrap the query in a list
    return vectorizer.transform([query]).todense()

def computeSimilarities(query_vector, dtm):
    """
    Cosine similarity between one query vector and every row of a
    document-term matrix.

    Parameters
    ----------
    query_vector: dense vector for a single query (one row, or 1-D)
    dtm:          dense document-term matrix, one row per document

    Returns
    -------
    1-D array with one similarity per row of `dtm`, in row order
    """
    # Work on plain ndarrays: .todense() yields np.matrix objects, which
    # recent scikit-learn releases refuse to accept.
    query_row = np.atleast_2d(np.asarray(query_vector))
    doc_rows = np.asarray(dtm)

    # Compare the query against the documents only, instead of building the
    # full pairwise matrix of all vectors just to read its last row.
    return cosine_similarity(query_row, doc_rows)[0]

def getMostSimilarIdx(similarities):
    """
    Locate the highest similarity score.

    Parameters
    ----------
    similarities: array-like of similarity scores

    Returns
    -------
    Index of the largest value, as returned by np.argmax
    """
    best_position = np.argmax(similarities)
    return best_position
    
# Demo: look up a misspelled query against the toy corpus.
query = 'Foootball'
getMostSimilarSentence(query, documents)

'He plays football'

In [12]:
#make labels 0/1 about sports
# the dict keys are the document strings, i.e. the same values used as summary's index
about_sports = {
    'He playedd baseball':1,
    'He plays football':1,
    'He had a sandwich':0
}
# adds the label as a new column of summary (mutates the frame built in the earlier cell)
summary['about_sports'] = pd.Series(about_sports)

# features are the term-count columns only; the label column is excluded by selecting `words`
X = summary[words].values
y = summary['about_sports']

#regress labels on elements of DTM
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C = 100000.)
model.fit(X, y)

#show coefficients, one per term, in ascending order
pd.Series(model.coef_[0], index = words).sort_values()

have       -5.474318
sandwich   -5.474318
he          0.584732
baseball    3.029525
football    3.029525
play        6.059050
dtype: float64

In [13]:
# Fit a logistic regression of the sports label on the term counts and show
# the per-term coefficients as a colour-graded table.
from sklearn.linear_model import LogisticRegression
mod = LogisticRegression(C = 100000.)
X = summary[words].values
y = summary['about_sports']

mod.fit(X, y)
# DataFrame.sort() is deprecated and later removed from pandas; sort_values is
# the supported spelling and matches the Series.sort_values() call used above.
coefficients = pd.DataFrame(mod.coef_[0], index = words, columns = ['coefficient']).sort_values('coefficient')
s = coefficients.style.background_gradient(cmap=diverging_cm)
s

Unnamed: 0,coefficient
have,-5.47432
sandwich,-5.47432
he,0.584732
baseball,3.02952
football,3.02952
play,6.05905


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

#dataframe of pairwise cosine similarities between terms
# transposing the term columns makes each row a term, so similarities are term-vs-term
df = pd.DataFrame(cosine_similarity(summary[words].T.values), index = words, columns = words)

#apply some style to the dataframe
# green_cm is re-created here with the same call used in the setup cell
green_cm = sns.light_palette("green", as_cmap=True)
s = df.style.background_gradient(cmap=green_cm)
s

Unnamed: 0,baseball,football,have,he,play,sandwich
baseball,1.0,0.0,0.0,0.57735,0.707107,0.0
football,0.0,1.0,0.0,0.57735,0.707107,0.0
have,0.0,0.0,1.0,0.57735,0.0,1.0
he,0.57735,0.57735,0.57735,1.0,0.816497,0.57735
play,0.707107,0.707107,0.0,0.816497,1.0,0.0
sandwich,0.0,0.0,1.0,0.57735,0.0,1.0
