In [4]:
from __future__ import division
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from matplotlib.colors import ListedColormap
%pylab inline
pylab.rcParams['figure.figsize'] = (12.0, 6.0) # set size of figures"
plt.rcParams.update({'font.size': 24})
pd.options.display.max_rows=50
import re
import json
import ast

from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim import corpora, models, similarities

Populating the interactive namespace from numpy and matplotlib


In [14]:
def find_sheet_name(target_text,sheet_names):
    '''Return a list of sheet names containing a specific string'''
    return [name for name in sheet_names if target_text.lower() in name.lower()]
def find_column_name(target,columns):
    return [name for name in columns if target.lower() in name.lower()]
def pq_lookup(date):
    '''retun period, quarter given a date (year,month,day)'''
    #3 month index to period quarter
    quarter_map={1:(1,1),2:(1,2),3:(2,3),4:(2,4),5:(3,1),6:(3,2),7:(4,3),8:(4,4),9:(5,1),10:(5,2),11:(6,3),12:(6,4)}
    qnum=1+int((date.month-pd.datetime(2016,4,1).month)/3)
    p,q=quarter_map[qnum]
    return p,q

def save_dict_as_json(filename, mapping):
    '''Save dict as json object'''
    with open(filename, 'w') as outfile:
        json.dump(mapping, outfile, indent=4, sort_keys=True, separators=(',', ':'))

def load_dict_from_json(filepath):
    '''given file path to json mapping return a dict'''
    with open(filepath) as data_file:    
        return json.load(data_file)

<h1> Build DF from all quarter 1 reports </h1>

In [15]:
Q1base='/Users/attiladobi/nacosa/New SW Finance Data (Oct 2016)/April to June/'
Q1_reports=[Q1base+fname for fname in os.listdir(Q1base) if 'xlsx' in fname]

year=2016
q1_months=[4,5,6]
df_list=[]

for q1_report in Q1_reports:
    #get SR name from file path
    SR=q1_report.split('/')[-1].split(' Finance')[0]
    #read in the reports for q1
    try:
        xlsx_obj=pd.ExcelFile(q1_report)
    except IOError:
        print 'no such file'
    #read in relavent sheet name
    sheet_names=xlsx_obj.sheet_names
    sheet_name=find_sheet_name('payment recommendation',sheet_names)[0]
    #find the column that contains "indicator"
    col_name=find_column_name('indicator',xlsx_obj.parse(sheetname=sheet_name,parse_cols=12,skiprows=7, nrows=1).columns)
    df=xlsx_obj.parse(sheetname=sheet_name,parse_cols=12,skiprows=7).dropna(subset=col_name) #drop from 1st nan entry
    df=df.rename(columns={col_name[0]:'Indicator'})
    to_drop=find_column_name('Unnamed',df.columns) #find all unnamed columns
    df=df.drop(to_drop,1) #drop all unnamed columns
    df=df[df['Indicator'].str.contains('Number',na=False)] #only keep rows where the indicator is a counts of something
    
    column_heads=['','.1','.2'] #.3 represents the quarterly total along with % of target hit
    
    for month, col_head in zip(q1_months,column_heads):
        date=pd.datetime(year,month,1)
        p,q= pq_lookup(date)

        sub_df=df[['Indicator','TARGET  '+col_head,'ACTUAL     '+col_head]][1:].replace(nan,0)
        sub_df.columns=['Indicator','Target','Actual'] #normalize naming convention
        if SR=='Oasis':
            SR='OASIS'
        if SR == 'Nqobile Women':
            SR='NQOBILE'
        if SR == 'THCA':
            SR ='TBHIV'
        sub_df['Subrecipient']=SR
        sub_df['Date']=date
        sub_df['Period']=p
        sub_df['Quarter']=q
        df_list.append(sub_df)
DF=pd.concat(df_list)

In [4]:
DF.loc[(DF['Actual']>=0) & (DF['Indicator'].str.contains('B4'))].groupby(['Subrecipient','Indicator']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Target,Actual,Period,Quarter
Subrecipient,Indicator,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CPC,B4a: Number of newly diagnosed HIV positive SWs referred for ART,0.0,0.0,3,3
CPC,B4b: Number of newly diagnosed HIV positive SWs initiated on ART,0.0,62.0,3,3
NQOBILE,B4a: Number of newly diagnosed HIV positive SWs referred for ART,0.0,0.0,3,3
NQOBILE,B4b: Number of newly diagnosed HIV positive SWs initiated on ART,0.0,0.0,3,3
OASIS,B4a: Number of newly diagnosed HIV positive SWs referred for ART,0.0,1.0,3,3
OASIS,B4b: Number of newly diagnosed HIV positive SWs initiated on ART,0.0,0.0,3,3
PHRU,B4a: Number of newly diagnosed HIV positive SWs referred for ART,0.0,14.0,3,3
PHRU,B4b: Number of newly diagnosed HIV positive SWs initiated on ART,0.0,8.0,3,3
QAC,B4a: Number of newly diagnosed HIV positive SWs referred for ART,0.0,0.0,3,3
QAC,B4b: Number of newly diagnosed HIV positive SWs initiated on ART,0.0,0.0,3,3


<h1> all M&E names </h1>

In [16]:
[val for val in DF.groupby('Indicator').count().index]

[u'A2: Number of sex workers who received one or more services through outreach',
 u'B1: Number of Sex Workers that have received an HIV test during the reporting period and know their results',
 u'B2: Number of SWs who tested HIV negative in the reporting period',
 u'B3: Number of newly diagnosed HIV positive SWs ',
 u'B4a: Number of newly diagnosed HIV positive SWs referred for ART',
 u'B4b: Number of newly diagnosed HIV positive SWs initiated on  ART',
 u'B4b: Number of newly diagnosed HIV positive SWs initiated on ART',
 u'B5: Number of SWs with known HIV positive status',
 u'B6: Number of SWs who refused HIV test ',
 u'B7: Number of HIV negative SWs receiving PrEP ',
 u'B8a: Number of HIV positive SWs referred for Adherence support',
 u'B8a: Number of HIV positive SWs succesfully referred for Adherence support',
 u'B8b: Number of HIV positive SWs succesfully referred for Adherence support ',
 u'C1: Number of SWs screened  for Sexually Transmitted Infections (STI) ',
 u'C2: Number 

<h3> Create dictorany of M&E names </h3>

In [17]:
def cleanUp(sentence):
    sentence=sentence.replace('succesfully','successfully')#ahh misspelling!
    sentence=sentence.replace('Centres','Centers')#ahh misspelling!
    sentence=sentence.replace('(','').replace(')','')
    sentence=re.sub("sex workers", "sex workers", sentence, flags=re.I)
    sentence=re.sub("SWs", "sex workers", sentence, flags=re.I)
    sentence=sentence.replace('&','and').replace('tested positive','newly diagnosed').replace('-','')
    sentence=' '.join(sentence.split()) #clean up extra whitespaces
    return sentence

stopwords=['number','of','-','by','the','who','for','(SR)','an']
def clean_line(sentence):
    return ' '.join([word for word in sentence.lower().split() if word not in stopwords])

In [18]:
S=dict()
A1=['A1:Number of sex workers reached with HIV prevention programs - individual and/or smaller group level interventions'.split(':')]
#adding A1 from targets sheet (A1=A2)
potential_indicators=[val.split(':') for val in DF.groupby('Indicator').count().index if len(val.split(':'))>1]
                    
for indicator in potential_indicators+A1:
    sub_id=''
    key=indicator[0][:2]
    description=cleanUp(indicator[1])
    if 'referred' in description:
        sub_id='a'
    if ('successfully' in description) or ('initiated' in description):
         sub_id='b'
    key+=sub_id
    #if sexual violence create a new catigory:
    if 'violence' in description:
        key='V'
    #print(key, description)
    S[key]=key+': '+description

#add sexual violence as a new catigory:
keynum=1
for indicator in [val for val in DF.groupby('Indicator').count().index if 'violence' in val]:
    description=cleanUp(indicator)
    if description not in S.values():
        S['V'+str(keynum)]='V' + str(keynum)+': '+description
        keynum+=1
S

{'A1': 'A1 Number of sex workers reached with HIV prevention programs individual and/or smaller group level interventions',
 u'A2': u'A2 Number of sex workers who received one or more services through outreach',
 u'B1': u'B1 Number of sex workers that have received an HIV test during the reporting period and know their results',
 u'B2': u'B2 Number of sex workers who tested HIV negative in the reporting period',
 u'B3': u'B3 Number of newly diagnosed HIV positive sex workers',
 u'B4a': u'B4a Number of newly diagnosed HIV positive sex workers referred for ART',
 u'B4b': u'B4b Number of newly diagnosed HIV positive sex workers initiated on ART',
 u'B5': u'B5 Number of sex workers with known HIV positive status',
 u'B6': u'B6 Number of sex workers who refused HIV test',
 u'B7': u'B7 Number of HIV negative sex workers receiving PrEP',
 u'B8a': u'B8a Number of HIV positive sex workers referred for Adherence support',
 u'B8b': u'B8b Number of HIV positive sex workers successfully referred fo

In [28]:
S['A2']

u'A2 Number of sex workers who received one or more services through outreach'

<h1> Save M&E dict </h1>

In [19]:
np.save('meMAP.npy',S)
save_dict_as_json('meMAP.json', S) #ME_CODE
#read_dictionary = np.load('ME_dict.npy').item()

<h1> loop through the dict and find the best match </h1>

In [20]:
for description_all in DF.groupby('Indicator').count().index:
    description=cleanUp(clean_line(description_all.split(':')[-1]))
    words=description.lower().split()
    best_key,best_overlap,best_extra='',0,10
    for key,val in S.iteritems():
        val=clean_line(val)
        l=[re.findall(word, val) for word in words]
        len_term=len(val.split())
        overlap=len([item for sublist in l for item in sublist])/(len_term+len(words))
        extra_terms=abs(len_term-overlap)
        #print(overlap, key)
        if (overlap>best_overlap):
            best_key,best_overlap,best_extra=key,overlap,extra_terms
            #check for sucessfully and initiated (b vs a flag)
            if ((('successfully' in words) or ('initiated' in words)) & (best_key[-1]=='a')):
                    best_key.replace('a','b')
            #if ('victims' in words):
            #        best_key='SV'
    print(best_key, description_all)

(u'A2', u'A2: Number of sex workers who received one or more services through outreach')
(u'B1', u'B1: Number of Sex Workers that have received an HIV test during the reporting period and know their results')
(u'B2', u'B2: Number of SWs who tested HIV negative in the reporting period')
(u'B3', u'B3: Number of newly diagnosed HIV positive SWs ')
(u'B4a', u'B4a: Number of newly diagnosed HIV positive SWs referred for ART')
(u'B4b', u'B4b: Number of newly diagnosed HIV positive SWs initiated on  ART')
(u'B4b', u'B4b: Number of newly diagnosed HIV positive SWs initiated on ART')
(u'B5', u'B5: Number of SWs with known HIV positive status')
(u'B6', u'B6: Number of SWs who refused HIV test ')
(u'B7', u'B7: Number of HIV negative SWs receiving PrEP ')
(u'B8a', u'B8a: Number of HIV positive SWs referred for Adherence support')
(u'B8b', u'B8a: Number of HIV positive SWs succesfully referred for Adherence support')
(u'B8b', u'B8b: Number of HIV positive SWs succesfully referred for Adherence supp

<h1> Loop through and build a dictionary lookup of the best match </h1>

In [21]:
ME_NAME_MAP=dict()

for description_all in DF.groupby('Indicator').count().index:
    description=cleanUp(clean_line(description_all.split(':')[-1]))
    words=description.lower().split()
    best_key,best_overlap,best_extra='',0,10
    for key,val in S.iteritems():
        val=clean_line(val)
        l=[re.findall(word, val) for word in words]
        len_term=len(val.split())
        overlap=len([item for sublist in l for item in sublist])/(len_term+len(words))
        extra_terms=abs(len_term-overlap)
        #print(overlap, key)
        if (overlap>best_overlap):
            best_key,best_overlap,best_extra=key,overlap,extra_terms
            #check for sucessfully and initiated (b vs a flag)
            if ((('successfully' in words) or ('initiated' in words)) & (best_key[-1]=='a')):
                    best_key.replace('a','b')
            #if ('victims' in words):
            #        best_key='SV'
    ME_NAME_MAP[description_all] = S[best_key]

In [23]:
save_dict_as_json('ME_NAME_MAP.json', ME_NAME_MAP)
#also save in Target_analysis.ipynb

In [24]:
ME_NAME_MAP

{u'A2: Number of sex workers who received one or more services through outreach': u'A2 Number of sex workers who received one or more services through outreach',
 u'B1: Number of Sex Workers that have received an HIV test during the reporting period and know their results': u'B1 Number of sex workers that have received an HIV test during the reporting period and know their results',
 u'B2: Number of SWs who tested HIV negative in the reporting period': u'B2 Number of sex workers who tested HIV negative in the reporting period',
 u'B3: Number of newly diagnosed HIV positive SWs ': u'B3 Number of newly diagnosed HIV positive sex workers',
 u'B4a: Number of newly diagnosed HIV positive SWs referred for ART': u'B4a Number of newly diagnosed HIV positive sex workers referred for ART',
 u'B4b: Number of newly diagnosed HIV positive SWs initiated on  ART': u'B4b Number of newly diagnosed HIV positive sex workers initiated on ART',
 u'B4b: Number of newly diagnosed HIV positive SWs initiated o

<h3> We will take all names with a code and train a Doc2vec model </h3>

In [298]:
labels=[val.split(':')[0] for val in DF.groupby('Indicator').count().index if len(val.split(':'))>1]
descriptions=[val.split(':')[1] for val in DF.groupby('Indicator').count().index if len(val.split(':'))>1]
#need to do a little cleaning. The 11th entry B8a is incorrect. Also the second to last G2a
labels.pop(11),descriptions.pop(11)
#labels.pop(-2),descriptions.pop(-2)

(u'B8a',
 u' Number of HIV positive SWs succesfully referred for Adherence support')

<p> Create a lookup for M&E identifier and bow associated with it </p>

In [301]:
#first clean up double white spaces in all descriptions
clean_descriptions=[' '.join(desc.split()) for desc in descriptions]

In [305]:
len(labels),len(set(clean_descriptions))

(31, 27)

In [304]:
set(labels)

{u'A2',
 u'B1',
 u'B2',
 u'B3',
 u'B4a',
 u'B4b',
 u'B5',
 u'B6',
 u'B7',
 u'B8a',
 u'B8b',
 u'C1',
 u'C2',
 u'C2a',
 u'C2b',
 u'D1',
 u'D2',
 u'D2a',
 u'D2b',
 u'E1',
 u'E2',
 u'E3',
 u'E4',
 u'F1',
 u'F2',
 u'F3',
 u'G1',
 u'G2a',
 u'G2b'}

<h1> Setup for training </h1>

In [213]:
def process_line(sentence):
    '''Splits sentence if punctuation is identified. Returns a list of list of words for each sentence'''
    sentences=re.split(r"(?<![0-9])[.?!;](?![0-9])", sentence)
    result= [re.findall("[a-z'.-0-9]+", sent.lower()) for sent in sentences if \
            re.findall("[a-z'.-0-9]+", sent.lower())!=[]]
    if result==[]:
        result=[['']]
    return result
stoplist = set('number for a an of or the and to in rt'.split())

In [215]:
desc_words=[[word for word in process_line(d)[0] if word not in stoplist] for d in desc]
#desc_words

In [216]:
docs=[]
#for i,words in enumerate(review_sentence):
#    docs.append(TaggedDocument(words,['R_'+str(i)]))
for label,words in zip(labels,desc_words):
    docs.append(TaggedDocument(words,[label]))

In [217]:
docs[15:30]

[TaggedDocument(words=[u'sws', u'with', u'positive', u'sti', u'symptom', u'successfully', u'referred', u'sti', u'treatment'], tags=[u'C2b']),
 TaggedDocument(words=[u'sws', u'screened', u'tb'], tags=[u'D1']),
 TaggedDocument(words=[u'sws', u'successfully', u'referred', u'tb', u'investigations'], tags=[u'D2']),
 TaggedDocument(words=[u'sws', u'referred', u'tb', u'investigations'], tags=[u'D2a']),
 TaggedDocument(words=[u'sws', u'successfully', u'referred', u'tb', u'investigations'], tags=[u'D2b']),
 TaggedDocument(words=[u'creative', u'space', u'workshops'], tags=[u'E1']),
 TaggedDocument(words=[u'sws', u'attending', u'creative', u'space', u'workshops'], tags=[u'E2']),
 TaggedDocument(words=[u'small', u'group', u'workshops'], tags=[u'E3']),
 TaggedDocument(words=[u'sws', u'attending', u'small', u'group', u'workshops'], tags=[u'E4']),
 TaggedDocument(words=[u'male', u'condoms', u'distributed'], tags=[u'F1']),
 TaggedDocument(words=[u'female', u'condoms', u'distributed'], tags=[u'F2']),
 

In [223]:
model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-3, negative=0, workers=2, hs=1,alpha=0.01,dm_mean=0)
model.build_vocab(docs)
model.train(docs)
v=model.infer_vector( 'male condoms distributed'.split(),alpha=0,steps=1)
model.docvecs.most_similar([v])
#model.docvecs.most_similar?

[(u'E3', 0.1120995581150055),
 (u'E2', 0.09986546635627747),
 (u'D2a', 0.06480303406715393),
 (u'E4', 0.055689867585897446),
 (u'B2', 0.05426891893148422),
 (u'G2a', 0.053596362471580505),
 (u'E1', 0.03409339487552643),
 (u'D2b', 0.029231755062937737),
 (u'C2a', 0.012065872550010681),
 (u'F1', 0.010497007519006729)]

In [226]:
model.docvecs['F1']

array([  1.20246725e-03,  -2.74422695e-03,  -4.58507426e-03,
         1.90203520e-03,   2.20909668e-03,   4.67715785e-03,
         1.45936885e-03,  -2.92332983e-03,   1.86492607e-03,
        -1.70439074e-03,   1.63799257e-03,   1.63047179e-03,
        -2.75255111e-03,   3.72715038e-03,   1.31533016e-04,
        -1.67188537e-03,   2.00551073e-03,  -3.94337764e-03,
         3.08526866e-03,  -3.15058371e-03,  -5.51420671e-04,
         3.77022196e-04,  -3.18640377e-03,   3.45052336e-03,
        -3.50580830e-03,  -3.83757614e-03,   7.38978386e-04,
         2.06110836e-03,   9.54967574e-04,  -4.11084015e-03,
        -4.49457264e-04,   4.71988274e-03,   1.48073595e-03,
        -3.45566310e-03,  -2.44019227e-03,  -4.90715308e-03,
        -4.72123874e-03,  -8.03103147e-04,  -4.40521585e-03,
        -3.72836599e-03,  -2.90851993e-03,  -1.64185534e-03,
        -2.43012421e-03,  -3.67528549e-03,  -3.68785835e-03,
        -3.49540589e-03,   2.15889723e-03,  -4.60592564e-03,
         4.64956556e-03,

In [245]:
v=model['condoms']
model.docvecs.most_similar([v])

[(u'C1', 0.18439769744873047),
 (u'B3', 0.15565700829029083),
 (u'D2a', 0.15408849716186523),
 (u'D2b', 0.15310299396514893),
 (u'D2', 0.15099598467350006),
 (u'G2b', 0.14870649576187134),
 (u'F1', 0.10721241682767868),
 (u'B8b', 0.09765207022428513),
 (u'B1', 0.08222004771232605),
 (u'B5', 0.0711875706911087)]

In [255]:
desc_arrays=[mean([model[word] for word in words],0) for words in desc_words]

In [287]:
doc_vector=mean([model[word] for word in 'sex workers referred for testing who are tested and know their result'.split() if word in model.vocab],0)

In [290]:
from scipy import spatial
[spatial.distance.cosine(doc_vector,test) for test in desc_arrays]

[0.59707956084329239,
 0.53726799691662963,
 0.70134343682233358,
 1.1218231088896424,
 0.97270868927827991,
 1.1289073889248638,
 1.1289073889248638,
 1.0977539365432369,
 0.85187168365246047,
 1.2164591332854133,
 0.90084780482709437,
 0.8703264177483907,
 0.92678032973064506,
 1.0464044991890233,
 1.0464044991890233,
 1.0480793620443678,
 1.1142790077306961,
 1.0085297462966636,
 1.0006379935242538,
 1.0085297462966636,
 0.91764618032367151,
 0.98319462278412029,
 0.99200908313476033,
 1.0420463905617701,
 1.0675259935934358,
 1.136192711729175,
 0.99487167417007882,
 0.87730433030585586,
 0.80572000351735451,
 0.80572001611316812,
 0.83974807703596843]

In [266]:
#for "condoms distributed" this will find F2 over F1, which will not work for the red umbrella case
#unless I label red umbrella as F1, also lube

#works for identifying A2 and B1 !

<h1> Conclusion... this will be best achived with a dictionary lookup </h1>