In [29]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import pdb
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [30]:
import nltk;
from nltk.corpus import stopwords;

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/avichanales/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

import spacy
nlp  = spacy.load('en_core_web_md')

## 0.0 Load Data

In [32]:
# Location of the cleaned charity table.
# NOTE(review): absolute, machine-specific path -- the notebook will not run on another machine without editing it.
file_name = '/Users/avichanales/Dropbox/Insight/Project/insight_project/data/interim/charity_data_cleaned.csv'
# Load the table; later cells read its 'name', 'subcategory', 'score' and 'description' columns.
all_charity = pd.read_csv(file_name)

## 1.0 Pre-process raw text (lemmatize and remove stopwords)

In [33]:
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    The previous body read a module-level ``stemmer`` that no visible cell
    defines, so calling it on a fresh kernel raised ``NameError``. The stemmer
    is now built here from the already-imported ``SnowballStemmer``.
    NOTE(review): the English Snowball stemmer is an assumption -- confirm it
    is the stemmer that was originally intended.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The stemmed form of the verb-lemmatized token.
    """
    # pos='v' asks the WordNet lemmatizer to treat the token as a verb.
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return SnowballStemmer("english").stem(lemma)

# Tokenize and drop stopwords (the lemmatize_stemming step is not applied)
def preprocess(text):
    """Split ``text`` into gensim tokens and discard gensim stopwords.

    Tokens are returned exactly as ``gensim.utils.simple_preprocess`` yields
    them, in their original order.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Tokens that are not in ``gensim.parsing.preprocessing.STOPWORDS``.
    """
    stop_set = gensim.parsing.preprocessing.STOPWORDS
    return [tok for tok in gensim.utils.simple_preprocess(text) if tok not in stop_set]

def preprocess_docs(docs):
    """Apply ``preprocess`` to every document and collect the token lists.

    Note: a later cell in this notebook defines ``preprocess_docs`` again;
    whichever definition ran last is the one in effect.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.

    Returns
    -------
    list
        One ``preprocess`` result per document, in input order.
    """
    return [preprocess(single_doc) for single_doc in docs]

In [41]:
def preprocess_spacy(raw_text):
    """Turn raw text into a list of lemmas with selected named entities removed.

    Steps:
      1. Run the spaCy pipeline ``nlp`` and collect the text of every entity
         labelled DATE, PERSON or ORG.
      2. Delete every occurrence of those strings from the text.
      3. Re-run the pipeline on the stripped text and keep the lemma of each
         token that is not a stopword, punctuation or whitespace.

    Whitespace tokens are skipped explicitly: deleting an entity leaves
    leading or doubled spaces behind, and spaCy reports such runs as separate
    tokens that are neither stopwords nor punctuation, so they previously
    ended up in the output as blank "lemmas".

    Parameters
    ----------
    raw_text : str
        Document text to clean.

    Returns
    -------
    list of str
        Lemmas of the remaining content tokens, in document order.
    """
    doc = nlp(raw_text)

    # Remove dates, people and organizations from the document text
    # (entities are replaced in the order spaCy reports them).
    tokens_ner = [entity.text for entity in doc.ents if entity.label_ in {'DATE', 'PERSON', 'ORG'}]

    for term in tokens_ner:
        raw_text = raw_text.replace(term, "")

    # Re-parse so tokenization and lemmas reflect the stripped text
    doc = nlp(raw_text)

    # Remove stopwords, punctuation and whitespace tokens, then lemmatize
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return tokens

def preprocess_docs(docs):
    """Run ``preprocess_spacy`` over every document.

    Note: this reuses the name of an earlier ``preprocess_docs`` definition in
    this notebook and replaces it once this cell has been executed.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.

    Returns
    -------
    list
        One ``preprocess_spacy`` result per document, in input order.
    """
    return [preprocess_spacy(single_doc) for single_doc in docs]

In [42]:
def _description_without_name(row):
    """Return the row's description with the charity's own name removed.

    Non-string cells (e.g. NaN) are handled explicitly: a non-string
    description yields "", and a non-string name leaves the description
    untouched. The previous lambda called ``.replace`` directly on the cell
    value and raised on such rows.
    """
    description = row['description']
    if not isinstance(description, str):
        return ""
    name = row['name']
    return description.replace(name, "") if isinstance(name, str) else description


# Add a copy of each description in which the charity's own name is deleted.
all_charity['description_noname'] = all_charity.apply(_description_without_name, axis=1)

In [43]:
# Cast every name-stripped description to a string before tokenisation.
mission_text = all_charity['description_noname'].astype('str')

# Preprocess mission descriptions: preprocess_docs returns one result per description, in order.
mission_text_pre = preprocess_docs(mission_text)

## 2.0 Convert pre-processed text to vectors and embed

In [44]:
def word_embed_charity_tfidf(processed_docs,word_min=5, word_max_perc=.2):
    """Embed each tokenised document as a tf-idf weighted sum of word vectors.

    Assumes docs have already been pre-processed (each document is a list of
    tokens).

    Parameters
    ----------
    processed_docs : list of list of str
        Tokenised documents.
    word_min : int
        Passed to ``Dictionary.filter_extremes`` as ``no_below``.
    word_max_perc : float
        Passed to ``Dictionary.filter_extremes`` as ``no_above``.

    Returns
    -------
    tuple
        ``(docs_emb, docs_dict, model_tfidf, tfidf_emb_vecs)``: the document
        embeddings (one row per document), the filtered gensim dictionary, the
        fitted tf-idf model, and the matrix holding one ``nlp(word).vector``
        row per dictionary word.
    """
    # Vocabulary: build from the corpus, then prune by document frequency
    vocab = Dictionary(processed_docs)
    vocab.filter_extremes(no_below=word_min, no_above=word_max_perc)
    vocab.compactify()
    vocab_size = len(vocab)

    # Bag-of-words -> tf-idf -> dense (documents x vocabulary) matrix
    bows = [vocab.doc2bow(tokens) for tokens in processed_docs]
    tfidf = TfidfModel(bows, id2word=vocab)
    dense_rows = [sparse2full(weights, vocab_size) for weights in tfidf[bows]]
    doc_term = np.vstack(dense_rows)

    n_docs, n_words = np.shape(doc_term)[0], np.shape(doc_term)[1]
    print("Total # of docs: {}".format(n_docs))
    print("Total # of words in dict: {}".format(n_words))

    # One vector per dictionary word, taken from the `nlp` pipeline
    # (the notebook's original comment describes these as GloVe vectors)
    word_vectors = np.vstack([nlp(vocab[idx]).vector for idx in range(vocab_size)])

    # Each document embedding is the tf-idf weighted sum of its word vectors
    doc_embeddings = np.dot(doc_term, word_vectors)

    return doc_embeddings, vocab, tfidf, word_vectors

In [45]:
# Vocabulary pruning settings, forwarded to word_embed_charity_tfidf
# (they are also used later to name the saved model file).
word_min = 0
word_max_perc = .5

# Fit the dictionary / tf-idf model and embed every charity description.
(
    charity_docs_emb,
    charity_docs_dict,
    charity_model_tfidf,
    charity_tfidf_emb_vecs,
) = word_embed_charity_tfidf(
    mission_text_pre,
    word_min=word_min,
    word_max_perc=word_max_perc,
)




Total # of docs: 5637
Total # of words in dict: 12690


In [46]:
# Bundle the four outputs of word_embed_charity_tfidf in one dictionary and save it.
charity_model_dict = {'charity_docs_emb': charity_docs_emb, 
                      'charity_docs_dict': charity_docs_dict,
                      'charity_model_tfidf': charity_model_tfidf,
                     'charity_tfidf_emb_vecs': charity_tfidf_emb_vecs}

# The file name records the word_min / word_max_perc settings used for this model.
# NOTE(review): absolute, machine-specific output path.
with open('/Users/avichanales/Dropbox/Insight/Project/insight_project/data/processed/charity_model_min_{}_max_{}.pickle'.format(word_min,word_max_perc), 'wb') as handle:
    pickle.dump(charity_model_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## 4.0 Preprocess Headline

In [159]:
def process_embed_text(text,charity_docs_dict,charity_model_tfidf,charity_tfidf_emb_vecs):
    """Embed one piece of text using the dictionary, tf-idf model and word vectors fitted on the charities.

    Parameters
    ----------
    text : str
        Raw text (e.g. a headline) to embed.
    charity_docs_dict
        Dictionary providing ``doc2bow`` and ``len``.
    charity_model_tfidf
        tf-idf model, indexed with a bag-of-words to obtain weights.
    charity_tfidf_emb_vecs : array
        One word-vector row per dictionary entry.

    Returns
    -------
    array
        A single-row 2-D array: the tf-idf weighted sum of the word vectors of
        the tokens found in the dictionary.
    """
    tokens = preprocess_spacy(text)

    # Bag-of-words -> tf-idf weights -> dense vector over the whole vocabulary
    weights = charity_model_tfidf[charity_docs_dict.doc2bow(tokens)]
    dense = sparse2full(weights, len(charity_docs_dict))

    # Keep the vector 2-D (one row) so the product is a 1 x dim matrix;
    # the result is the sum of word vectors linearly weighted by tf-idf
    return np.dot(np.atleast_2d(dense), charity_tfidf_emb_vecs)

In [160]:
def compute_similarity_output_n(art_emb,charity_docs_emb,topn):
    """Return the ``topn`` highest cosine similarities and the matching row positions.

    Parameters
    ----------
    art_emb : array
        Query embedding(s), 2-D as required by ``cosine_similarity``.
    charity_docs_emb : array
        One embedding row per charity.
    topn : int
        Number of leading entries to keep.

    Returns
    -------
    tuple of arrays
        ``(topN_scores, topN_indices)``: similarity scores in descending order
        and the positions in ``charity_docs_emb`` they belong to.
    """
    # Cosine similarity from the article embedding to all charities
    similarities = cosine_similarity(art_emb, charity_docs_emb)

    # Descending order along each row; the same ordering yields both outputs
    ranking = np.argsort(-similarities)
    ranked_scores = np.take_along_axis(similarities, ranking, axis=-1)

    return ranked_scores.flatten()[:topn], ranking.flatten()[:topn]
    
    

In [161]:
def topN_ranked_charities(charity_df, topN_scores, topN_indices):
    """Build a table of the selected charities together with their similarity scores.

    Parameters
    ----------
    charity_df : DataFrame
        Must contain the columns 'name', 'subcategory', 'score' and 'description'.
    topN_scores : array-like
        Similarity score for each selected charity.
    topN_indices : array-like of int
        Row positions (as used by ``.iloc``) of the selected charities.

    Returns
    -------
    DataFrame
        The selected rows in the given order, with the previous index kept as
        an 'index' column and the scores added as 'sim_score'.
    """
    wanted_columns = ['name','subcategory','score','description']

    # Pick the rows by position and move the old index into a column
    selected = charity_df.loc[:, wanted_columns].iloc[topN_indices].reset_index()

    # Attach the similarity scores as the last column
    return selected.assign(sim_score=topN_scores)

In [174]:
# Example headline used as the query text in the cells below.
text = 'Greta Thunberg became a climate activist not in spite of her autism, but because of it'

In [175]:
# Embed the headline with the dictionary, tf-idf model and word vectors fitted on the charity descriptions.
art_emb = process_embed_text(text,charity_docs_dict,charity_model_tfidf,charity_tfidf_emb_vecs)

In [176]:
# Score every charity embedding against the headline embedding and keep the 20 most similar.
topN_scores, topN_indices = compute_similarity_output_n(art_emb,charity_docs_emb,20)
# Look up those charities in the original table and attach their similarity scores.
topN_charities = topN_ranked_charities(all_charity, topN_scores, topN_indices)

In [177]:
# Display the ranked charities for the example headline.
topN_charities

Unnamed: 0,index,name,subcategory,score,description,sim_score
0,2858,The Austin Community in Action,Patient and Family Support,85.09,The Autism Community in Action (TACA) formerly...,0.717931
1,4458,Doug Flutie Jr. Foundation For Autism,"Diseases, Disorders, and Disciplines",87.31,The Doug Flutie Jr. Foundation For Autism's go...,0.712984
2,3952,Climate Central,Environmental Protection and Conservation,84.53,Climate Central works to communicate the scien...,0.693686
3,2347,Turning Pointe Autism Foundation,Patient and Family Support,94.74,Turning Pointe Autism Foundation was founded i...,0.690892
4,4679,ecoAmerica,Environmental Protection and Conservation,88.77,"ecoAmerica builds institutional leadership, pu...",0.676833
5,492,Lewy Body Dementia Association,Patient and Family Support,89.94,"Through outreach, education and research, we s...",0.670822
6,4795,FRAXA Research Foundation,Medical Research,97.17,FRAXA Research Foundation was founded in 1994 ...,0.66861
7,1175,National Center for Transgender Equality,Advocacy and Education,83.37,The National Center for Transgender Equality (...,0.668078
8,1194,National Alliance for the Mentally Ill of New ...,"Diseases, Disorders, and Disciplines",87.19,The National Alliance on Mental Illness of New...,0.667901
9,4232,Center for International Policy,"International Peace, Security, and Affairs",88.46,The Center for International Policy is a nonpr...,0.666624
