In [1]:
import json

In [22]:
infile = 'data/geophysics_hack_CLEAN.json'
with open(infile, 'r') as f:
    data = json.loads(f.read())

In [23]:
KeyWords = set()
def LoadKeywordsFromFile(infile):
    f=open(infile,'r')
    cnt = 0
    for line in f:
        if len(line)<2:
            continue
        curKeyword = line
        if line[-1]=='\n':
            curKeyword = line[:-1]
        KeyWords.add(curKeyword.strip().lower())
        cnt+=1
    print cnt,'lines read, and ', len(KeyWords),'keywords loaded.'
    
LoadKeywordsFromFile('data/SEG_Keywords.txt')
LoadKeywordsFromFile('data/seg2015.json_keywords.txt')

KeyWords=list(KeyWords)
print KeyWords

310 lines read, and  308 keywords loaded.
238 lines read, and  324 keywords loaded.
['traveltime', 'magnetization', 'interpretation', 'two-dimensional', 'datuming', 'minimum entropy', 'radiation', 'random', 'induced seismicity', 'multicomponent', 'integration', 'north america', 'diving wave', 'magnetic susceptibility', 'stoneley wave', 'oceanography', 'statistical', 'tectonics', 'scattering', 'dip moveout', 'distributed systems', 'geomatics', 'high-velocity layer', 'visualization', 'gulf of mexico', 'navigation', 'microseismic', 'density', 'vti', 'srme', 'frequency-domain', 'passive', 'production', 'maximum entropy', 'mwd', 'elimination microseismic', 'heat flow', 'resistivity log', 'sonic', 'least squares', 'australia', 'signal processing', 'risk', 'geology', 'unconsolidated', 'fluid', 'beam', 'finite element', 'sequestration', 'thermal conductivity', 'logging', 'time-domain', 'gravimeter', 'adaptive', 'avo/ava', 'inversion', 'vibroseis', 'finite difference', 'central america', '3-c',

In [24]:
import numpy as np
def GenerateKeywords(curAbstract, curTitle, curKeywords):
    N_Keywords = len(KeyWords)        
    kwv = np.zeros(N_Keywords)
    extractedKW=set()
    p=-1
    for kw in KeyWords:
        p+=1
        if kw in curKeywords: #if the existing keyword is a recognized one
            extractedKW.add(kw)
            kwv[p]=1.0
            continue
        if kw in curAbstract.lower() and kw in curTitle.lower():
            extractedKW.add(kw)
            kwv[p]=0.9
            continue
        if kw in curAbstract.lower():
            extractedKW.add(kw)
            kwv[p]=0.5
            continue
        if kw in curTitle.lower():
            extractedKW.add(kw)
            kwv[p]=0.8
            continue
    return extractedKW, kwv

In [25]:
import numpy as np
ln=0
KWV=list()
for article in data:
    curTitle = article['title']
    curAbstract = article['abstract']
    curKeywords = article['keywords']
    
    # removing the empty articles (without title nor abstract)
    if curAbstract==[] or curTitle==[]:
        continue
        
    kwl, kwv = GenerateKeywords(curAbstract[0], curTitle[0], curKeywords)
    KWV.append(kwv)
    #print kwl
    ln+=1
    #if ln>5:
    #    break
    
    # TODO: if there is any keywords, generate additional keywords if possible
    
    #print curTitle
    #print curAbstract
    #print curKeywords
D=np.array(KWV)    

In [26]:
D.shape

(2566L, 324L)

In [27]:
import matplotlib.pyplot as plt

In [28]:
plt.imshow(D, aspect='auto')
plt.show()

In [29]:
# histogram
Freq=sum(D)

plt.plot(Freq)
for idx, kw in enumerate(KeyWords):
    if Freq[idx]>100:
        plt.text(idx,Freq[idx],kw)
plt.show()

# Machine learning

## PCA transform

In [30]:
from sklearn.decomposition import PCA

In [31]:
pca = PCA(n_components=3)
X=pca.fit_transform(D)

In [32]:
X.shape

(2566L, 3L)

In [33]:
plt.plot(X[:,1],X[:,2],'.')

[<matplotlib.lines.Line2D at 0x156d8b70>]

In [34]:
plt.show()

In [35]:
import mayavi.mlab as mlab



In [36]:
mlab.points3d(X[:,0], X[:,1], X[:,2], colormap="copper", scale_factor=.02)
mlab.show()

# Article matching

In [39]:
AuthorQuery = {"Mauricio Sacchi": 1.0, "Xander Staal":0.6, "Eric Verschuur":0.5}
KeywordsQuery = {"P-wave":1.0, "interpolation":0.8, "aliasing":0.4}

In [49]:
# query preprocessing

for k in KeywordsQuery:
    KeywordsQuery[k.strip().lower()] = KeywordsQuery.pop(k)

print KeywordsQuery
    
N_Keywords = len(KeyWords)        
kqv = np.zeros(N_Keywords)
KQ = [k for k in KeywordsQuery.keys()]
p=0
for kw in KeyWords:
    if kw in KQ: #if the input keyword is a recognized one
        kqv[p]=KeywordsQuery[kw]
    p+=1
print kqv

{'aliasing': 0.4, 'p-wave': 1.0, 'interpolation': 0.8}
[ 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0. 

In [57]:
import math
def KeywordScore(kqv, kv):
    s = math.sqrt(np.sum(np.square(kqv-kv)))
    return s

kv = D[30,:]
print KeywordScore(kqv, D[30,:])

2.02731349327


In [None]:
def GetAuthorsFromCitation(citation):
    return []

In [58]:
def AuthorScore(AuthorQuery, authors):
    return 0

def CitationAuthorScore(AuthorQuery, citation):
    return AuthorScore(AuthorQuery, GetAuthorsFromCitation(citation))

In [59]:
cnt = 0
W0=2.0
W1=1.0
W2=0.5
scores=[]
for article in data:
    authors = article['authors']
    #kv = article['kv']
    kv = D[cnt,:] #get the keyword vector
    citation = article['citedby']
    cnt+=1
    score = W0 * AuthorScore(AuthorQuery, authors) + W1 * KeywordScore(kqv, kv) + W2 * CitationAuthorScore(AuthorQuery, citation)
    scores.append(score)

In [61]:
plt.plot(scores)
plt.show()