In [1]:
import pandas as pd
url = 'https://raw.githubusercontent.com/ICT4SD/Rapid_Assessment_Tools/Fordham-Brahms/Model_Part1_FeatureLexicon/bhutan_input.csv'
bhutan = pd.read_csv(url,index_col=0,parse_dates=[0])

In [2]:
bhutan.head()

Unnamed: 0,sector,text
0,﻿5.1,Education Sector\n\nThe single most important ...
1,5.2,Health Sector\n\nAll Bhutanese avail free heal...
2,5.3,Human Resource Development & Management\n\nDev...
3,5.4,Employment\n\nThe Labour and Employment Act 20...
4,5.5,Renewable Natural Resources (RNR) Sector\n\nRN...


In [3]:
for i,p in enumerate(bhutan.ix[0,'text'].split('\n\n')):
    for j,sentence in enumerate(p.split('. ')):
        print(j,sentence)

(0, 'Education Sector')
(0, 'The single most important factor towards achieving the socio-economic development goals will be the success of the education sector in enhancing the social, economic, cultural, environmental, and political capabilities of our most important asset, our people')
(1, 'Education virtually impacts all dimensions of development and is critical for ensuring sustainable poverty alleviation which can only come through the empowerment of our people, particularly the remote and most vulnerable sections of our population')
(2, 'An educated citizenry that is able to understand, translate and act upon information, laws, rights and responsibilities is critical for ensuring a "vibrant democracy" and effective governance')
(3, 'Similarly, the realization of our vision of a "knowledge based society" is absolutely dependent on a technically skilled workforce with competencies benchmarked to leading industry standards and capabilities to think globally but act locally')
(4, 'R

In [4]:
para_dict = {}
def sperate(row):
    text = row['text']
    lst = text.split('\n\n')
    sentences = []
    for p in lst:
        sentences = sentences+p.split('. ')
    
    for i,sentence in enumerate(sentences):
        key = str(str(row['sector']).split('.')[-1])+'_'+str(i)
        para_dict[key]=sentence

    return row

In [5]:
para = bhutan.apply(sperate,axis=1)

In [6]:
para = pd.DataFrame(para_dict.values(),index = para_dict.keys(),columns=['p'])

In [7]:
import re
def process(row):
    
    s = row['p']
    s= s.decode('ascii','ignore').encode('ascii','ignore')
    s = s.replace('\t','').replace('\n','')
    s = re.sub(r'^v?i+\)','',s)
    s = s.strip()
    row['p'] = s
    return row

In [8]:
para = para.apply(process, axis=1)

In [9]:
target = pd.read_excel('SDG_Features1and6-8.xlsx')
target.head()

Unnamed: 0,Number,SDG sub Goal content,"F1_Name,S/W",F1_Dictionary,"F2_Name,S/W",F2_Dictionary,"F3_Name,S/W",F3_Dictionary,"F4_Name,S/W",F4_Dictionary
0,1.1,"By 2030, eradicate extreme povert yfor all peo...","extreme poverty,S","extreme poverty, absolute poverty, destitution...","poverty,W","poverty,poor,famine,poornes,impoverishment,poo...","living cost, W","aliveness,bread and butter,dwell,endure,experi...",,
1,1.2,"By 2030, reduce at least by half the proportio...","proportion,W","portion, proportion, rate","poverty, S","poverty,poor,need,famine,poornes,impoverishmen...","reduce,S","reduce, lessen, make less, make smaller, lower...",,
2,1.3,Implement nationally appropriate social protec...,"social protection,S","social protection, labour markets, capacity, u...","poverty, S","poverty,poor,need,famine,poornes,impoverishmen...","coverage,W","coverage, analysis, insurance, report, broadca...",,
3,1.4,"By 2030, ensure that all men and women, in par...","vulnerables,S","vulnerable, unemployment, ill health, disabili...","equal right, W","equal rights, feminism, rights of women, siste...","economic resource, S","economic resources, goods, services, valuable ...","service,W","service, resource, utilization, distribution, ..."
4,1.5,"By 2030, build the resilience of the poor and ...","vulnerables,S","unemployment, ill health, disability, work-rel...","extreme event,S","extreme event\n climate-related, extreme event...","resilience, W","elasticity, flexibility, pliancy, recoil, snap...",,


In [10]:
target = target.fillna(0)

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

porter = nltk.PorterStemmer()

def get_ngrams(text, n ):
    n_grams = ngrams(word_tokenize(text), n)
    return [ ' '.join(grams) for grams in n_grams]

def pre(row):
    text = row['p']
    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]
    
    tokens = tokens + get_ngrams(text,2)
    tokens= tokens + get_ngrams(text,3)
    tokens= tokens + get_ngrams(text,4)
    
    tokens = [porter.stem(w) for w in tokens]
    row['tokens'] = tokens
    return row

In [13]:
para = para.apply(pre,axis=1)

In [14]:
para.head()

Unnamed: 0,p,tokens
1_44,Addressing the last mile challenge of enrollme...,"[address, the, last, mile, challeng, of, enrol..."
1_45,Relevance and Quality of Education: In spite o...,"[relev, and, qualiti, of, educ, :, in, spite, ..."
1_46,Judging solely by improvements in efficiency i...,"[judg, sole, by, improv, in, effici, indic, su..."
1_47,"However, what remains of great concern is that...","[howev, ,, what, remain, of, great, concern, i..."
1_40,"According to the Survey, the adjusted Net Prim...","[accord, to, the, survey, ,, the, adjust, net,..."


In [15]:
target.columns = ['target','content','F1','F1D','F2','F2D','F3','F3D','F4','F4D']

In [16]:
target['target']=target.target.astype('str')

In [17]:
target.head()

Unnamed: 0,target,content,F1,F1D,F2,F2D,F3,F3D,F4,F4D
0,1.1,"By 2030, eradicate extreme povert yfor all peo...","extreme poverty,S","extreme poverty, absolute poverty, destitution...","poverty,W","poverty,poor,famine,poornes,impoverishment,poo...","living cost, W","aliveness,bread and butter,dwell,endure,experi...",0,0
1,1.2,"By 2030, reduce at least by half the proportio...","proportion,W","portion, proportion, rate","poverty, S","poverty,poor,need,famine,poornes,impoverishmen...","reduce,S","reduce, lessen, make less, make smaller, lower...",0,0
2,1.3,Implement nationally appropriate social protec...,"social protection,S","social protection, labour markets, capacity, u...","poverty, S","poverty,poor,need,famine,poornes,impoverishmen...","coverage,W","coverage, analysis, insurance, report, broadca...",0,0
3,1.4,"By 2030, ensure that all men and women, in par...","vulnerables,S","vulnerable, unemployment, ill health, disabili...","equal right, W","equal rights, feminism, rights of women, siste...","economic resource, S","economic resources, goods, services, valuable ...","service,W","service, resource, utilization, distribution, ..."
4,1.5,"By 2030, build the resilience of the poor and ...","vulnerables,S","unemployment, ill health, disability, work-rel...","extreme event,S","extreme event\n climate-related, extreme event...","resilience, W","elasticity, flexibility, pliancy, recoil, snap...",0,0


In [18]:
target[target['target'] == '1.1']['content'].to_string(index=False)

u'By 2030, eradicate extreme povert yfor all peo...'

In [19]:
goal = target.loc[target['target'] == '1.1', 'content'].item()
goal

u'By 2030, eradicate extreme povert yfor all people everywhere, currently measured as people living on less than $1.25 a day'

In [20]:
import nltk
porter = nltk.PorterStemmer()
for t_i in range(0,107):
    def func(row):
        tokens = row['tokens']
        
        s1,s2,s3,s4 = 1,1,1,1
        w1,w2,w3,w4 = 0,0,0,0
        
        flag = 0 
        
        if target.ix[t_i,'F1']==0: #isnan 
            s1=0 #missing for the entire target
        else:
            for word in target.ix[t_i,'F1D'].split(','):
                if porter.stem(word.strip()) in tokens:
                    if target.ix[t_i,'F1'].split(',')[-1].strip()=='S':
                        flag = 1 
                    else: # 'W'
                        w1 = 0.1 
            if (flag==0) & (target.ix[t_i,'F1'].split(',')[-1].strip() =='S'): #is strong feature, and not match 
                s1 = 0 
                    
        flag = 0
        if target.ix[t_i,'F2']==0: #isnan 
            s2=1
        else:
            for word in target.ix[t_i,'F2D'].split(','):
                if porter.stem(word.strip()) in tokens:
                    if target.ix[t_i,'F2'].split(',')[-1].strip() =='S':
                        flag = 1 
                    else: # 'W'
                        w2 = 0.1 
            if (flag ==0) & (target.ix[t_i,'F2'].split(',')[-1].strip() =='S'): #is strong feature, and not match 
                s2 = 0
        
        
        flag = 0
        if target.ix[t_i,'F3']==0:
            s3=1
        else:
            for word in target.ix[t_i,'F3D'].split(','):
                if porter.stem(word.strip()) in tokens:
                    if target.ix[t_i,'F3'].split(',')[-1].strip() =='S':
                        flag = 1 
                    else: # 'W'
                        w3 = 0.1
                        
            if (flag ==0) & (target.ix[t_i,'F3'].split(',')[-1].strip() =='S'): #is strong feature, and not match 
                s3 = 0
                    
        flag = 0
        if target.ix[t_i,'F4']==0:
            s4=1
        else:
            for word in target.ix[t_i,'F4D'].split(','):
                if porter.stem(word.strip()) in tokens:
                    if target.ix[t_i,'F4'].split(',')[-1].strip() =='S':
                        flag = 1 
                    else: # 'W'
                        w4 = 0.1  
            if (flag ==0) & (target.ix[t_i,'F4'].split(',')[-1].strip() =='S'): #is strong feature, and not match 
                s4 = 0
            
            
        score = s1*s2*s3*s4*(1+w1+w2+w3+w4)
        
        
        row[str(target.ix[t_i,'target'])] = score 
        
        return row
    
    para = para.apply(func,axis=1)

KeyError: (23L, u'occurred at index 1_44')

In [21]:
para.head()

Unnamed: 0,p,tokens,1.1,1.2,1.3,1.4,1.5,6.1,6.2,6.3,...,7.3,8.1,8.2,8.3,8.4,8.5,8.6,8.7,8.8,8.9
1_44,Addressing the last mile challenge of enrollme...,"[address, the, last, mile, challeng, of, enrol...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,1.1,0.0,0.0,0,0.0,0,0.0
1_45,Relevance and Quality of Education: In spite o...,"[relev, and, qualiti, of, educ, :, in, spite, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1_46,Judging solely by improvements in efficiency i...,"[judg, sole, by, improv, in, effici, indic, su...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1_47,"However, what remains of great concern is that...","[howev, ,, what, remain, of, great, concern, i...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1_40,"According to the Survey, the adjusted Net Prim...","[accord, to, the, survey, ,, the, adjust, net,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0


In [22]:
para=para.rename(columns={'Unnamed: 0':'sector_sent'})

In [23]:
para = para.reset_index()
para=para.rename(columns={'index':'sector_sent'})

In [24]:
def splt(row):
    row['sector'] = str(row['sector_sent']).split('_')[0]
    row['sent_id'] = str(row['sector_sent']).split('_')[-1]
    return row

In [26]:
para=para.apply(splt,axis=1)

In [27]:
para['sector'] = para.sector.astype('int')
para['sent_id'] = para.sent_id.astype('int')
para = para.set_index(['sector','sent_id'])
para = para.sort_index()

In [28]:
para.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sector_sent,p,tokens,1.1,1.2,1.3,1.4,1.5,6.1,6.2,...,7.3,8.1,8.2,8.3,8.4,8.5,8.6,8.7,8.8,8.9
sector,sent_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,1_0,Education Sector,"[educ, sector, Education Sector]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1,1,1_1,The single most important factor towards achie...,"[the, singl, most, import, factor, toward, ach...",0.0,0.0,0.0,1.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1,2,1_2,Education virtually impacts all dimensions of ...,"[educ, virtual, impact, all, dimens, of, devel...",0.0,0.0,0.0,0.0,1.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1,3,1_3,An educated citizenry that is able to understa...,"[an, educ, citizenri, that, is, abl, to, under...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0
1,4,1_4,"Similarly, the realization of our vision of a ...","[similarli, ,, the, realiz, of, our, vision, o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0


In [29]:
from Tkinter import *
import tkMessageBox

In [30]:
class Application(Frame):
  def __init__(self, master=None):
    Frame.__init__(self, master)
    self.pack()
    self.createWidgets()
 
  def createWidgets(self):
    self.inputLabel = Label(self, text='input SDG number:')
    self.inputLabel.pack()
    self.nameInput = Entry(self)
    self.nameInput.pack()
    self.alertButton = Button(self, text='match', command=self.match)
    self.alertButton.pack()
      
  def match(self):
    name = self.nameInput.get()
    tkMessageBox.showinfo('Goal', 'Goal: %s' % target.loc[target['target'] == name, 'content'].item())
    df=pd.DataFrame(para.loc[para[name] >= 1, 'p'])
    pd.options.display.max_colwidth = 1000
    tkMessageBox.showinfo('Matched result', 'result: %s' % df)

In [31]:
class Application(Frame):
  def __init__(self, master=None):
    Frame.__init__(self, master)
    self.pack()
    self.createWidgets()
 
  def createWidgets(self):
    self.inputLabel = Label(self, text='input SDG number:')
    self.inputLabel.pack()
    self.nameInput = Entry(self)
    self.nameInput.pack()
    self.alertButton = Button(self, text='match', command=self.match)
    self.alertButton.pack()
      
  def match(self):
      try:
         name = self.nameInput.get()
         tkMessageBox.showinfo('Goal', 'Goal: %s' % target.loc[target['target'] == name, 'content'].item())
         
      except:
         tkMessageBox.showinfo('Error',  'No SDG is found!')

In [32]:
class Application(Frame):
  def __init__(self, master=None):
    Frame.__init__(self, master)
    self.pack()
    self.createWidgets()
 
  def createWidgets(self):
    self.inputLabel = Label(self, text='input SDG number:')
    self.inputLabel.pack()
    self.nameInput = Entry(self)
    self.nameInput.pack()
    self.alertButton = Button(self, text='match', command=self.create_window)
    self.alertButton.pack()
      
  def create_window(self):
      try:
         name = self.nameInput.get()
         window = Toplevel(self)
         Label(window, text="SDG").grid(row=0,sticky=W)
         Label(window, text=target.loc[target['target'] == name, 'content'].item()).grid(row=1,sticky=W)
         Label(window, text="Matched Result:").grid(row=2,sticky=W)
         df=pd.DataFrame(para.loc[para[name] >= 1, 'p'])
         for i in len(df):
             Label(window, text=df.iloc[i,:].item()).grid(row=i+3,sticky=W)
         
         
      except:
         tkMessageBox.showinfo('Error',  'No SDG is found!')

In [33]:
para.loc[para['1.1'] >= 1, 'p']

sector  sent_id
1       48         On the other hand Bhutan continues to face an ...
        70         However, our education system continues to be ...
2       32         Health Human Resource: The Bhutanese Health Sy...
3       23         For instance, in the education sector, where t...
        33         In terms of human resource development, the fo...
5       5          The sector's full potential is challenged by l...
13      45         The sector also faces acute shortage of human ...
Name: p, dtype: object

In [34]:
import pandas as pd
df=pd.DataFrame(para.loc[para['1.1'] >= 1, 'p'])
pd.options.display.max_colwidth = 1000

In [35]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,p
sector,sent_id,Unnamed: 2_level_1
1,48,"On the other hand Bhutan continues to face an acute shortage of skilled and highly specialized manpower such as doctors, engineers, ICT experts and teachers"
1,70,"However, our education system continues to be constrained by an acute shortage of teachers and limited capacities"
2,32,Health Human Resource: The Bhutanese Health System suffers from acute shortage of all categories of health personnel
3,23,"For instance, in the education sector, where there is already an acute shortage of teachers, the teacher turnover is high at about 4 percent annually"
3,33,"In terms of human resource development, the focus will be on creating skills and knowledge to achieve the objectives of EDP 2010, with an emphasis on addressing the acute shortage of skilled professionals such as doctors, educationists, engineers, architects and ICT specialists"
5,5,"The sector's full potential is challenged by low productivity due to issues such as acute shortage of farm labour driven by outbound rural-urban migration, loss of arable land to urbanization, difficult terrain, fragmented land holdings, loss of crops to wildlife, lack of access to market and agricultural credit, climate change and natural disasters, post harvest management issues and subsistence nature of farming"
13,45,The sector also faces acute shortage of human resources


In [36]:
df.iloc[6,:]

p    The sector also faces acute shortage of human resources
Name: (13, 45), dtype: object

In [37]:
app = Application()
# 设置窗口标题:
app.master.title('Rapid Assessment Tool')
# 主消息循环:
app.mainloop()