In [229]:
import pandas as pd
import numpy as np
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
import os, glob, re


def df_tf(doc_lst = None):
    """gets doc freq and term frequency per document
    params:
        optional: a list of file names, if None uses glob to read all txt in directory 
    returns:
        word_freq: frequency of each word
        all_words: set of all words"""
        
    word_lst = [] # list of all words returned as set
    doc_dict = {} # each doc paired with a list of ALL words in it as list
    
    if doc_lst is None:
        path = os.getcwd() # get the directory we're in
        doc_lst = glob.glob(os.path.join(path, '*.txt')) # change this if using different file extensions
        
    for f in doc_lst:
        doc_name = os.path.split(f)[-1] # grab filename
        doc_title = doc_name.split('.')[0]
        with open(f, 'r') as read_file:
            text = read_file.readlines() # read all lines, stored as list of lines
            text = [x.lower().strip() for x in text] # lowercase and strip \n, spacing
            all_lines = " ".join(text) # combine lines to one string
            clean_text = re.sub(r'[^\w\s]','',all_lines) # pull out punctuation
            word_lst.append(clean_text) # add new string to giant word list
            doc_dict[doc_title] = sorted(clean_text.split()) # add sorted list of words per doc to doc_dict value

    words = " ".join(word_lst) # join each list of doc words to one giant string
#     word_freq = {k : words.count(k) for k in set(words.split())} # slower than nltk
    word_freq = FreqDist(words.split()) # get freq dist for entire vocab
    all_words = sorted(set(x for x in words.split())) # split and sort vocab

    return all_words, word_freq, doc_dict


def make_term_doc_incidence(doc_dict, all_words):
    """
    parameters: 
        doc_dict: dict with list of text per doc
        all_words: set(vocab of all docs)
    returns:
        term document incidence matrix
        one hot df if word in doc for all vocab
    """
    
    d = {"word" : all_words}
    df = pd.DataFrame(d)
    # append df with each doc and one hot encoding
    for i, word in enumerate(all_words):
        for doc in doc_dict.keys():
            df.loc[i, doc] = np.where(word in doc_dict[doc], 1, 0)

    return df


def make_tf(doc_dict, all_words):
    """
    parameters: 
        doc_dict: dict created earlier with list of text in each doc
        all_words: set(vocab of all docs)
    returns:
        term frequency
        count of word in each doc
    """
    
    d = {"word" : all_words}
    df = pd.DataFrame(d)
    # append df with each doc and one hot encoding
    for i, word in enumerate(all_words):
        for doc in doc_dict.keys():
            df.loc[i, doc] = np.where(word in doc_dict[doc], doc_dict[doc].count(word), 0)
        
    return df


def weighted_tf(df_tf):
    """calculates weighted term freq per doc
    params:
        df_tf: raw term doc freq (actual word counts per doc)
    returns:
        new df with weighted tf
    """
    
    docs = df_tf.columns[1:]
    for doc in docs:
        
        df_tf[doc] = 1 + np.log10(df_tf[doc])
        df_tf[doc] = np.where( - df_tf[doc] == np.inf, 0, df_tf[doc])

    return df_tf
                         

def inv_df(df):
    """calculates inverse doc freq
    params:
         df: term document incidence
    returns:
        inverse document frequency
        log10(number of docs/doc freq)
    """

#     doc_freq = df.sum(axis = 1, numeric_only = True)
    docs = df.columns[1:]
    n = len(docs)
    for doc in docs:
        doc_freq = df[doc]
        df[doc] = np.log10(n/doc_freq)
        df[doc] = np.where(df[doc] == np. inf, 0, df[doc])
        
    return df


def tf_idf(weighted_tf, idf):
    """calculates tf-idf 
    (1 + log10(tf)) * log10(n/df)
    params:
        df_tf = dataframe of doc freq and term freq
        weighted_tf: weighed term freq
        idf: inverse doc freq
    returns:
        tf-idf
    """
    
    df = weighted_tf
    docs = df.columns[1:]
    for doc in docs:
        df[doc] = weighted_tf[doc] * idf[doc]

    return df


def cos_sim(q, d):
    """calculates cosine similarity between vectors"""

    euclid = lambda x: np.linalg.norm(x)
    # skip calc if ==
    if np.all(q == d):
        return 1

    dot_prod = np.dot(q,d)
    denom = euclid(q) * euclid(d)

    # avoid /0
    if np.isclose(denom, 0, atol = 1e-32):
        sim = 0
    else:
        sim = dot_prod / denom
        
    return sim

def cos_sim_compare(tfidf):
    """
    creates matrix of cosine similarities
    params:
        tfidf: term freq inverse doc freq
    returns:
        similarity matrix to compare docs using cosine similarity
    """
    
    docs = tfidf.columns[1:]
    sim_matrix = pd.DataFrame()
        
    for doc in docs:
        for row in docs:
            sim_matrix.loc[row, doc] = cos_sim(tfidf[row], tfidf[doc])
    
    return sim_matrix


def main(lst_of_docs = None):
    """runs nlp processing
    params:
        lst_of_docs: optional list of file names, if blank glob will choose all txt files in directory
    returns:
    
    """
    
    aw, fd, dd = df_tf(doc_lst = lst_of_docs)
    df = make_term_doc_incidence(dd, aw)
    tf = make_tf(dd, aw)
    wtf = weighted_tf(tf)
    idf = inv_df(df)
    tfidf = tf_idf(wtf, idf)
    cos_sim_matrix = cos_sim_compare(tfidf)
    return cos_sim_matrix


In [230]:
main()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,doc1,doc2,doc3
doc1,1.0,0.864301,0.310088
doc2,0.864301,1.0,0.260563
doc3,0.310088,0.260563,1.0
