# Background of the work

The Institute of Materials Research and Engineering would like to extract the core skills of its staff based on their scientific publications in peer reviewed journals. A record of these publications are available in the "Publication Release Form" database. This work extracts keywords from each publication's title and abstract, and match (store) these keywords with the respective first authors, who are assumed to be the experts in the subject knowledge. I.e. the staff expertise is constituted by the contents of their first-author publications.

# Import libraries

In [2]:
import pandas as pd
import numpy as np
import re
import math
import itertools, nltk, string
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer


# Define functions

In [3]:
def cleanColNames(vector): # vector = PRF.columns
    newCol = []
    for i in vector:
        i = i.replace(' ', '_')
        i = i.replace('(', '')
        i = i.replace(')', '')
        i = i.replace('/', '_or_')
        newCol.append(i)
    return newCol

In [4]:
def makeIndvAuth(vector): # vector = PRF.First_Author
    authors = []
    for ind, i in enumerate(vector):
        if str(i).lower() == 'nan':
            authors.append('UNKNOWN')
        else:
            tmp = []
            for author in i.split(','):
                if '(' in author:
                    tmp.append(author.split('(')[0].strip())
                else:
                    tmp.append(author.strip())
            authors.append(tmp)
    return authors

In [5]:
def breakAbbreviations(vector): # vector = firstIMRE
    # vector is a list of lists.
    # Each list contains a single name.
    for ind, i in enumerate(vector):
        tmp = []
        for ind2, j in enumerate(i[0].split(' ')):
            if ((len(j) == 2)|(len(j) == 3)) & (j.upper() == j) & (j != 'NG'):
                consonants = [not letter in 'aeiouAEIOU' for letter in j]
                if (consonants == [True, True]) | (consonants == [True, True, True]):
                    tmp.append(' '.join(j))
                else:
                    tmp.append(j)
            else:
                tmp.append(j)
        vector[ind] = [' '.join(tmp)]
    return vector

def breakAbbreviations1(vector): # vector = corr[1]
    # vector is a list of names.
    for ind, i in enumerate(vector): # each name
        tmp = []
        for ind2, j in enumerate(i.split(' ')): # each character
            if ((len(j) == 2)|(len(j) == 3)) & (j.upper() == j) & (j != 'NG'):
                consonants = [not letter in 'aeiouAEIOU' for letter in j]
                if (consonants == [True, True]) | (consonants == [True, True, True]):
                    tmp.append(' '.join(j))
                else:
                    tmp.append(j)
            else:
                tmp.append(j)
        vector[ind] = ' '.join(tmp)
    return vector

In [51]:
def getIndInRecord(vector):
    indInRecord = [vector.index(i) for i in vector if i[0] in staff.Staff_Name.values]
    return indInRecord

In [6]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):

    ## exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    
    ## tokenize, POS-tag, and chunk using regular expressions

    # Chunking is to produce phrases (not just words) that make sense
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    
    # 1. Tokenize text into sentences.
    # 2. Tokenize each sentence into words.
    # 3. POS-tag the words.
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    
    # 1. Chunk each word-tagged sentence.
    # 2. Make the result into a list.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    ## join constituent chunk words into a single chunked phrase
    # 1. Using IOB-chunk-tags, group the chunks into "Outside" or "not Outside".
    # 2. Each item in all_chunks is a 3-part tuple, of word, POS and chunk-tag.
    #    Each item gets examined by a 3-input function.
    #    The last input (last element of the tuple) is put through a boolean test.
    #    itertools.groupby(list_of_tuples, lambda (x, y, z): z != 'Something')
    # 2. if key: ie, if it's NOT an Outside tag (Either B-KT or I-KT), group these groups.
    #    This is a running accumulation of chunks (out of the 160 all_chunks elements) that are either 'O' or not 'O'
    #    e.g. the first six elements are O, not O, not O, O, O, O, O
    #    Then form 3 groups: (F, 1 element), (T, 2 elements), (F, 4 elements)
    #    Each group is a list of tuples with the third elements != 'O'.
    # 3. Take the 1st element out of the tuples of the same group. They are the words. Join them with ' '.
    #    This forms a phrase.
    # 4. Convert the joint phrase to lower case.
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]

    ## Remove stop words and punctuations
    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

In [7]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
        
'''def tokenize_stem(stemmer, text = tritext):
    tokens = nltk.word_tokenize(text)
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
'''

'def tokenize_stem(stemmer, text = tritext):\n    tokens = nltk.word_tokenize(text)\n    stemmed = []\n    for item in tokens:\n        stemmed.append(stemmer.stem(item))\n    return stemmed\n'

In [8]:
def getAuthorPubKeywords(index):
    doc_keywords = pd.DataFrame(
    {'first_author': firstIMRE[index],
     'keywords': [tfidfFull.get_feature_names()[ind] for ind, freq in enumerate(mFull[index, :].tolist()[0]) if freq != 0],
     'tfidf': [freq for freq in mFull[index, :].tolist()[0] if freq != 0]
    })
    return doc_keywords.sort_values(by = 'tfidf', ascending = False).reset_index()


# Get, preliminarily clean and explore data

The "PRF" is the system that stores all publication info.

In [9]:
PRF = pd.read_csv('/Users/yingjiang/Dropbox/Learnings/Stats_data/Projects/IMRE_work/Manpower/PRF.csv')
staff = pd.read_csv('/Users/yingjiang/Dropbox/Learnings/Stats_data/Projects/IMRE_work/Manpower/Staff.csv')

Format the column names to programming-friendly strings.

In [10]:
PRF.columns = cleanColNames(PRF.columns)

In [18]:
## Fill null entries with a string for easier processing.
PRF.loc[:, ['First_Author', 'First_Author_Organisation', 'Corresponding_Authors', 'Corresponding_Authors_Organisations']] = PRF.loc[:, ['First_Author', 'First_Author_Organisation', 'Corresponding_Authors', 'Corresponding_Authors_Organisations']].fillna('UNKNOWN')
# 2312, 3158

In [78]:
PRF.head()

Unnamed: 0,Title_of_Paper,Journal_Title,Publication_Release_Number,Dept_or_Prog_,Significance_of_Paper,Publication_Date,First_Author,First_Author_Organisation,Other_Authors,Other_Authors_organisation,Corresponding_Authors,Corresponding_Authors_Organisations,Project_Finance_Code,Project_Title,Web_ID
0,Recent Progress in Chemical Vapor Deposition G...,Progress in Crystal Growth and Characterizatio...,MPC/16-128,-MPC,Reviews the latest progress in CVD growth of T...,Jul-16,WONG Swee Liang,IMRE,"LIU Hong Fei, CHI Dongzhi","IMRE, IMRE",CHI Dongzhi,IMRE,IMRE/15-2C0115,Large area growth of atomically thin 2D semico...,4696
1,A Polydopamine Coating Ultralight Graphene Mat...,Renewable Energy,AS/16-056,Advanced Energy Storage Lab,The soft polydopmine (PDA) layer with polar fu...,Aug-16,Zhou Lan,Fudan University,"LIU Zhao Lin (IMRE), ZONG Yun (IMRE), Yu Aishui","IMRE, IMRE, Fudan University","LIU Zhao Lin (IMRE), ZONG Yun (IMRE), Yu Aishui","IMRE, IMRE, Fudan University",IMRE/14-1C0243,High-performance sulfur nanostructured cathode...,4695
2,MOLECULAR DESIGN OF BIOINSPIRED NANOSTRUCTURES...,Journal of Molecular and Engineering Materials,BB/16-185,-Biomedical & Biomimetic,"In this review, we present the recent developm...",Aug-16,stu-victorxu (IMRE),IMRE,"ZHENG Xinting, Beverly MOK Yin Leng, stu-salwa...","IMRE, IMRE, IMRE, IMRE, IMRE",TAN Yen Nee,IMRE,IMRE/14-8C0439,Development of highly efficient siRNA-conjugat...,4694
3,Biodegradable Thermogelling Polymers for Biome...,MRS Bulletin,CT/16-229,-Consumer Care Technology,Thermogelling polymers belong to a class of st...,Aug-16,LIOW Sing Shy (IMRE),IMRE,"Anis ABDUL KARIM (IMRE), LOH Xian Jun (IMRE)","IMRE, IMRE",LOH Xian Jun (IMRE),IMRE,IMRE/13-2P0806,Polymer Bank for Personal Care Applications,4693
4,Dual-responsive Reversible Photo/Thermogelling...,"Journal of Polymer Science, Part A: Polymer Ch...",CT/16-227,-Consumer Care Technology,This work describes the synthesis of a novel p...,Jun-16,DOU Qing Qing (IMRE),IMRE,"LIOW Sing Shy (IMRE), LOH Xian Jun (IMRE), Wen...","IMRE, IMRE, Xiamen University",LOH Xian Jun (IMRE),IMRE,IMRE/13-2P0806,Polymer Bank for Personal Care Applications,4692


The "staff" dataframe contains IMRE staff's official names. The "Staff_Name" column will be used to normalize author names.

In [12]:
staff.columns = cleanColNames(vector = staff.columns)

Row 293 - 298 contain missing values. Just drop them.

In [13]:
staff.drop(range(293,298), axis = 0, inplace = True)

In [14]:
staff.head()

Unnamed: 0,Staff_Name,Title,Dpt,Functions,Function_Involvement,Project_Involvement,Total_involvement
0,Afriyanti SUMBOJA,Scientist I,-CER,#NAME?,0.0,1.0,1.0
1,Agata Maria BRZOZOWSKA,Scientist I,-MOL,#NAME?,0.0,0.5,0.5
2,Ajay Kumar KUSHWAHA,Scientist I,-CER,#NAME?,0.0,0.0,0.0
3,AN Tao,Scientist I,-CER,#NAME?,0.0,1.0,1.0
4,Anas Ibrahim SaedAldin AbuTaha,Scientist I,-CER,#NAME?,0.0,1.0,1.0


# Clean and normalize authors' names
## Extract names

In [118]:
first = makeIndvAuth(PRF.First_Author)

In [117]:
corr = makeIndvAuth(vector = PRF.Corresponding_Authors)

In [80]:
nonFirst = makeIndvAuth(PRF.Other_Authors)

for otherAuth, corrAuth in zip(nonFirst, corr):
#     print nonFirst.index(otherAuth)
    
    for author in otherAuth:
        if author in corrAuth:
#             print author
            otherAuth.remove(author)
            
'''
for i in nonFirst:
    for j in i:
        if 'Costa' in j:
            print nonFirst.index(i), i.index(j)
            j = 'Vijay Richard De Costa'

            print 'String is now changed to', j
'''

"\nfor i in nonFirst:\n    for j in i:\n        if 'Costa' in j:\n            print nonFirst.index(i), i.index(j)\n            j = 'Vijay Richard De Costa'\n\n            print 'String is now changed to', j\n"

At this point, we have three author lists:
1. First authors: 'first', list of single-element lists
2. Corresponding authors 'corr', list of multiple-element lists
3. Other authors 'nonFirst', list of multiple-element lists



## Extract and normalize relevant FIRST author names

Extract the names that are IMRE staff. Ignore collaborators and students.

### Clean up multiple-first-author cases

#### Clean up for rows 2034, 2051, 2164

These rows still consist of more than one first author upon first splitting. There send the respective authors to his / her corresponding columns (either corr or nonFirst).

In [119]:
corr[2034] = [first[2034][-1]]
corr[2051] = [first[2051][-1]]
corr[2164] = [first[2164][-1]]

first[2034].pop()
first[2051].pop()
first[2164].pop()

nonFirst[2034] = first[2034][1:]
nonFirst[2164] = first[2164][1:]
nonFirst[2164].append(corr[2164][0].split(' and ')[0])

first[2034] = [first[2034][0]]
first[2051] = [first[2051][0]]
first[2164] = [first[2164][0]]

corr[2164] = [corr[2164][0].split(' and ')[1]]

In [None]:
## Clean up the affiliation columns for these three special rows
for i in [2034, 2051, 2164]:
    print PRF.First_Author_Organisation.iloc[i]
    print PRF.Corresponding_Authors_Organisations.iloc[i]

# Since we want the affiliation columns, where first/corr authors is an IMRE staff, to reflect 'IMRE',
# we need to modify row 2164 (both are IMRE staff)

PRF.First_Author_Organisation.iloc[2164] = 'IMRE'
PRF.Corresponding_Authors_Organisations.iloc[2164] = 'IMRE'

#### Clean up rows where first and last names are split up

In [120]:
for ind, i in enumerate(first):
    if len(i) > 1:
        if i != 'UNKNOWN':
            first[ind] = [' '.join(i)]

#### Clean up special characters

In [121]:
# Removed all non-alphabets and non-white spaces
for ind, i in enumerate(first):
    first[ind] = [re.sub("[^a-zA-Z\s]", "", first[ind][0])]

# Removed special characters for 2 names.
for i in first:
    if 'Milo' in i[0]:
        i[0] = 'Milos Petrovic'
    if 'Mechthild' in i[0]:
        i[0] = 'Mechthild Lubke'

#### Break up abbreviations

In [122]:
first = breakAbbreviations(first)

'''
neitherIMRE = []
for ind, i in enumerate(PRF.First_Author_Organisation):
    if 'IMRE' not in i:
        if 'IMRE' not in PRF.Corresponding_Authors_Organisations.iloc[ind]:
            neitherIMRE.append(ind)

PRF_IMRE = PRF.drop(neitherIMRE, axis=0)
firstIMRE = [i for ind, i in enumerate(first) if ind not in neitherIMRE]
corrIMRE = [i for ind, i in enumerate(corr) if ind not in neitherIMRE]
'''

"\nneitherIMRE = []\nfor ind, i in enumerate(PRF.First_Author_Organisation):\n    if 'IMRE' not in i:\n        if 'IMRE' not in PRF.Corresponding_Authors_Organisations.iloc[ind]:\n            neitherIMRE.append(ind)\n\nPRF_IMRE = PRF.drop(neitherIMRE, axis=0)\nfirstIMRE = [i for ind, i in enumerate(first) if ind not in neitherIMRE]\ncorrIMRE = [i for ind, i in enumerate(corr) if ind not in neitherIMRE]\n"

### Compare with staff names dataset

In [None]:
'''
## First, compare directly
indInRecord = [firstIMRE_break.index(i) for i in firstIMRE_break if i[0] in staff.Staff_Name.values]
print len(indInRecord) #417

## Second, compare lower cases
staffNameLower = [val.lower() for ind, val in staff.Staff_Name.iteritems()]
indLower = [ind for ind, i in enumerate(firstIMRE_break) if i[0].lower() in staffNameLower]
print len(indLower)
# Note:
# 1. When this is run for the first time, it's 484. But it's 500 after the cell below is run. (order flipped)
# 2. indLower includes the direct matches.

## Third, compare cases where surname and given names are flipped
indOrder = []
for ind, i in enumerate(firstIMRE_break):
    nameSplit = i[0].split(' ')
    lastChar = nameSplit[-1]
    nameSplit.pop()
    firstChars = ' '.join(nameSplit)
    if ' '.join((lastChar, firstChars)).lower() in staffNameLower:
        indOrder.append(ind)
        firstIMRE_break[ind] = [lastChar + ' ' + firstChars]

print len(indOrder) # 17 more. But if run for the second time, 0, because all cases have been changed.
# Total 500

indUseful = indLower + indOrder
indUseful.sort_values()
'''

In [125]:
indLower = [ind for ind, i in enumerate(first) if i[0].lower() in staffNameLower]
print len(indLower) # First time the code is run: 495. Second time: 519.

519


In [126]:
## Third, compare cases where surname and given names are flipped
indOrder = []
for ind, i in enumerate(first):
    nameSplit = i[0].split(' ')
    lastChar = nameSplit[-1]
    nameSplit.pop()
    firstChars = ' '.join(nameSplit)
    if ' '.join((lastChar, firstChars)).lower() in staffNameLower:
        indOrder.append(ind)
        first[ind] = [lastChar + ' ' + firstChars]

print len(indOrder) # First time the code is run: 24. Second time: 0, because all cases have been changed.

# Total 519

0


In [111]:
# Print a list of the authors. Looks clean.
firstIMRE = [first[i][0].lower() for i in indLower]
print firstIMRE

['wong swee liang', 'liow sing shy', 'dou qing qing', 'yu yong', 'yu yong', 'zhang yu', 'goutam kumar dalapati', 'guo shifeng', 'he jiating', 'liu rong rong', 'wang shengqin', 'liu hongwei', 'david paramelle', 'yang ming', 'ye qun', 'tan mein jin', 'ren wei', 'kai dan', 'goutam kumar dalapati', 'jiang lu', 'goh xiao ming', 'liu hong fei', 'li zibiao', 'guo shifeng', 'afriyanti sumboja', 'liu hong fei', 'huang yuli', 'wong ten it', 'zhao meng', 'aung ko ko kyaw', 'liu hong fei', 'song xiaolu', 'li zibiao', 'zong yun', 'li zibiao', 'bai shiqiang', 'meysam sharifzadeh mirshekarloo', 'wong ten it', 'jiang lu', 'li zibiao', 'michelle dela cruz regulacio', 'huang kun', 'liu hong fei', 'tam teck lip dexter', 'steve wu qing yang', 'zhang lei', 'm s m saifullah', 'wang suxi', 'li bing', 'liew siao li', 'wang guan', 'liu hong fei', 'liu hong fei', 'liu hong fei', 'zhang zheng', 'kai dan', 'kai dan', 'chua chin sheng', 'anis abdul karim', 'goutam kumar dalapati', 'wong ten it', 'ye enyi', 'goh we

In [99]:
PRF_IMRE_first = PRF.iloc[indLower, :]
PRF_IMRE_first.reset_index(inplace = True)

Now we have a publication dataset that came from IMRE first authors.

In [100]:
PRF_IMRE_first.head()

Unnamed: 0,index,Title_of_Paper,Journal_Title,Publication_Release_Number,Dept_or_Prog_,Significance_of_Paper,Publication_Date,First_Author,First_Author_Organisation,Other_Authors,Other_Authors_organisation,Corresponding_Authors,Corresponding_Authors_Organisations,Project_Finance_Code,Project_Title,Web_ID
0,0,Recent Progress in Chemical Vapor Deposition G...,Progress in Crystal Growth and Characterizatio...,MPC/16-128,-MPC,Reviews the latest progress in CVD growth of T...,Jul-16,WONG Swee Liang,IMRE,"LIU Hong Fei, CHI Dongzhi","IMRE, IMRE",CHI Dongzhi,IMRE,IMRE/15-2C0115,Large area growth of atomically thin 2D semico...,4696
1,3,Biodegradable Thermogelling Polymers for Biome...,MRS Bulletin,CT/16-229,-Consumer Care Technology,Thermogelling polymers belong to a class of st...,Aug-16,LIOW Sing Shy (IMRE),IMRE,"Anis ABDUL KARIM (IMRE), LOH Xian Jun (IMRE)","IMRE, IMRE",LOH Xian Jun (IMRE),IMRE,IMRE/13-2P0806,Polymer Bank for Personal Care Applications,4693
2,4,Dual-responsive Reversible Photo/Thermogelling...,"Journal of Polymer Science, Part A: Polymer Ch...",CT/16-227,-Consumer Care Technology,This work describes the synthesis of a novel p...,Jun-16,DOU Qing Qing (IMRE),IMRE,"LIOW Sing Shy (IMRE), LOH Xian Jun (IMRE), Wen...","IMRE, IMRE, Xiamen University",LOH Xian Jun (IMRE),IMRE,IMRE/13-2P0806,Polymer Bank for Personal Care Applications,4692
3,8,Rational Design of Biomolecular Templates for ...,Advanced Healthcare Materials,BB/16-161,-Biomedical & Biomimetic,This progress report reviews the state-of-the-...,Jul-16,YU Yong,IMRE,"Beverly MOK Yin Leng, LOH Xian Jun, TAN Yen Nee","IMRE, IMRE, IMRE",TAN Yen Nee,IMRE,IMRE/14-8C0439,Development of highly efficient siRNA-conjugat...,4688
4,9,Bovine Serum Albulmin Protein-Templated Silver...,Advanced Healthcare Materials,BB/16-160,-Biomedical & Biomimetic,Bovine serum albumin-templated silver nanoclus...,Jul-16,YU Yong,IMRE,"TAN Yen Nee, CHELLAPPAN Vijila, Geng Junlong, ...","IMRE, IMRE, UIUC, Imperial College London",TAN Yen Nee,IMRE,IMRE/14-8C0439,Development of highly efficient siRNA-conjugat...,4687


Note: One can potentially capture more first authors by comparing abbreviations - this needs a better formatted (delimited) PRF!

## Extract and normalize relevant CORR author names

In [129]:
## Compare direct matches in lower cases
indLowerCorr = []
for ind, authorList in enumerate(corr):
    for author in authorList:
        if author.lower() in staffNameLower:
            indLowerCorr.append(ind)
            
len(indLowerCorr) # First time the code is run: 931. Second time: 957.

957

In [130]:
## Third, compare cases where surname and given names are flipped
indOrderCorr = []
for ind, authorList in enumerate(corr):
    for ind2, author in enumerate(authorList):
        nameSplit = author.split(' ')
        lastChar = nameSplit[-1]
        nameSplit.pop()
        firstChars = ' '.join(nameSplit)
        if ' '.join((lastChar, firstChars)).lower() in staffNameLower:
            indOrderCorr.append(ind)
            corr[ind][ind2] = lastChar + ' ' + firstChars

print len(indOrderCorr) # First time the code is run: 26. Second time: 0.

0


In [67]:
counter = 0
for i in indLower:
    if i in indLowerCorr:
        counter += 1
print counter
print len(indLower)

360
500


Out of 500 first-author papers, 360 have IMRE CORRESPONDING authors too.

The others have no IMRE first authors? Could be due to formatting issues. But for now, just ignore this.

Therefore we have:
- 140 papers that have only first authors from IMRE
- 360 papers 

# Extract keywords

This approach uses the NLTK and scikit-learn libraries for the tfidf method.

- A corpus is created from all titles and abstract texts within the PRF_IMRE_first dataframe itself. The corpus is prefiltered to remove non-ascii characters.

In [31]:
corpus = []
printable = set(string.printable)

for i, j in zip(PRF_IMRE_first.Title_of_Paper, PRF_IMRE_first.Significance_of_Paper):
    encoding_fixed = filter(lambda x: x in printable, i+j)
    corpus.append(encoding_fixed)

corpus # A list of documents

['Recent Progress in Chemical Vapor Deposition Growth of Transition Metal Dichalcogenides  Reviews the latest progress in CVD growth of Transition Metal Dichalcogenides.  ',
 'Biodegradable Thermogelling Polymers for Biomedical Applications  Thermogelling polymers belong to a class of stimuli-responsive hydrogels that undergo a macroscopic sol-to-gel transition in response to temperature. Much of the ongoing research in this fi eld is focused on hydrogels for biomedical applications as an injectable sustained drug-release matrix or scaffolds for tissue regeneration. Despite robust developments in biodegradable thermogelling polymers in recent decades, the fi eld still faces challenges in the optimization of materials properties. Thorough investigation must be performed to understand the effectiveness of drug delivery using hydrogel-forming polymer carriers. A highlighted case study on OncoGel, an experimental drug delivery depot formulation, sheds some light on the shortcomings of biod

- The text in the corpus is preprocessed to lower upper-case letters and remove punctuation.

In [32]:
stemmer = PorterStemmer()
token_dict_full = {}
for ind, doc in enumerate(corpus):
    lowers = doc.lower()
    no_punctuation = lowers.translate(None, string.punctuation)
    token_dict_full[ind] = no_punctuation # tokenized corpus

In [33]:
corpus[4]

'Bovine Serum Albulmin Protein-Templated Silver Nanocluster (BSA-Ag13): An Effective Singlet Oxygen Generator for Photodynamic Cancer Therapy  Bovine serum albumin-templated silver nanocluster containing 13 silver atoms per cluster (BSA-Ag13) has been synthesized by using NaBH4 dissolved in NaOH as a controlling reducing agent. The as-synthesized BSA-Ag13 NC exhibits very high quantum efficiency (1.26) in singlet oxygen generation and has been demonstrated as an effective photodynamic therapy agent to kill cancer cell.  '

- To each document in the corpus (which corresponds to the title and abstract texts of a publication), the texts are tokenized and stemmed.

In [34]:
i = 0
doc = corpus[0]

def tokenize(text = doc):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

- The preprocessing function (on a specific body of text) is then passed to the tfidf vectorizer to create the tfidf model. Note that scikit-learn has its own dictionary of stopwords, which are removed from the document in the model-building process.

- The model is automatically fitted for every document in the corpus, under the fit_transform() function

In [35]:
tfidfFull = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
fit0 = tfidfFull.fit_transform(token_dict_full.values())

In [36]:
fit0.shape # A vocabulary of 3776 words, across 500 documents

(500, 3776)

In [37]:
# Expand the sparse matrix
mFull = fit0.todense()

- Extract the keywords corresponding to each document, ordered by tfidf score. The relevant author is also included in the data.

In [38]:
getAuthorPubKeywords(100)

Unnamed: 0,index,first_author,keywords,tfidf
0,15,li jian,microspher,0.369182
1,7,li jian,embryon,0.369182
2,22,li jian,stem,0.351822
3,11,li jian,human,0.327355
4,25,li jian,uniforms,0.262433
5,4,li jian,cultur,0.262433
6,18,li jian,polycaprolacton,0.234548
7,17,li jian,pcl,0.225571
8,2,li jian,cell,0.198215
9,12,li jian,matrix,0.193852


In [39]:
# You can get the top 5 keywords in 2 ways.

print getAuthorPubKeywords(100).loc[:5, 'keywords']
print getAuthorPubKeywords(100).keywords.values[:5]

0    microspher
1       embryon
2          stem
3         human
4      uniforms
5        cultur
Name: keywords, dtype: object
[u'microspher' u'embryon' u'stem' u'human' u'uniforms']


# Append keywords to names

## Aggregate authors and juxtapose with each's keywords

In [112]:
len(set(firstIMRE))

168

There are 168 unique first authors. Now:
- Get author's name from the set
- Get this name's position(s) in firstIMRE
- Pass these positions to the getAuthorPubKeywords() function
- Get the first 5 keywords from each of the name's publication

In [43]:
kw_all = []
for author in set(firstIMRE):
    pubInds = [ind for ind, i in enumerate(firstIMRE) if i == author]
    kw_indv = []
    for pubInd in pubInds:
        kw_indv.extend(getAuthorPubKeywords(pubInd).keywords[:5].tolist())
    kw_all.append(kw_indv)

In [None]:
author_keywords = pd.DataFrame(
    {'first_author': list(set(firstIMRE)),
     'keywords': kw_all
    })
author_keywords

In [None]:
author_keywords.to_csv('/Users/yingjiang/Dropbox/Learnings/Stats_data/Projects/IMRE_work/Manpower/author_keywords.csv')

## "Unstem" the keywords to give back original

For each author, go through pubInds and get
- The keywords of each pub.
- Tokenized and stemmed title+abstract of each pub, or get corpus[pubInd]
- Tokenized but unstemmed title+abstract of each pub

- For this author's list of keywords of each pub
    - Compare each one with the stemmed word list. Get the index where this occurs.
    - Use the index to find the unstemmed word on the other list.

In [None]:
kw_all_unstem = []
for author in set(firstIMRE):
    pubInds = [ind for ind, i in enumerate(firstIMRE) if i == author]
    kw_indv = []
    for pubInd in pubInds:
        kw_tmp = getAuthorPubKeywords(pubInd).keywords[:5].tolist()
        tokens = nltk.word_tokenize(token_dict_full.values()[pubInd])
        tokens_stemmed = tokenize(token_dict_full.values()[pubInd])

        for kw in kw_tmp:
            kw_indv.append(tokens[tokens_stemmed.index(kw)])

    kw_all_unstem.append(kw_indv)

In [None]:
len(kw_all_unstem)

In [None]:
author_keywords_unstem = pd.DataFrame(
    {'first_author': list(set(firstIMRE)),
     'keywords': kw_all_unstem
    })
author_keywords_unstem.to_csv('/Users/yingjiang/Dropbox/Learnings/Stats_data/Projects/IMRE_work/Manpower/author_keywords_unstem.csv')

In [None]:
for i in author_keywords_unstem.keywords:
    print i

In [None]:
'develop' in 'development'

## Remove more meaningless words

There are lots of "scientific stopwords" that are not captured by scikit-learn's algo.