General:
----------------------------------------------------------------------------------------------------------------------------
This notebook extracts data from the database as sentences, each in the form of a list of tokens (i.e. a list of words).

Prerequisites:
 - Download sentences.db from __FIXME__: Add public link here

Sentence processing using SpaCy:
----------------------------------------------------------------------------------------------------------------------------
- Remove stop words
- Remove punctuation
- Mask numbers e.g.: 18 --> dd, 2018-->dddd, 34.54--> dd.dd
- Lookup word lemma and replace tokens with lemmas when they exist. 
  "Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of 
   words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word,  
   which is known as the lemma." (https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html)


In [1]:
# -*- coding: utf-8 -*-
import os
import re
import csv
import sys
import glob
# import spacy
import errno
import random
import sqlite3
import subprocess
import pandas as pd
import pymysql.cursors
from random import shuffle
from pandas import ExcelFile
from __future__ import division
from abbreviations import get_abbreviations

# Avoid ascii error
# NOTE(review): the builtin `reload` and `sys.setdefaultencoding` exist only
# on Python 2, so this cell presumably targets a Python 2 kernel -- confirm.
reload(sys)
sys.setdefaultencoding('utf8')

In [2]:
def prepare_directories():
    """Create the ``db`` and ``models`` working directories.

    A directory that already exists is left untouched; any other
    ``OSError`` raised by ``os.mkdir`` is propagated to the caller.
    """
    # FIXME: Add link to download db in this directory once it's created?
    for dir_name in ("db", "models"):
        try:
            os.mkdir(dir_name)
        except OSError as mkdir_error:
            # Only "already exists" is tolerated; everything else is re-raised.
            if mkdir_error.errno == errno.EEXIST:
                continue
            raise

In [3]:
# Create the db/ and models/ directories (used to store the database and
# models) if they do not already exist
prepare_directories()

In [4]:
# Connect to db
def connect_to_db():
    """Open a connection to the notebook's sentence database.

    Returns whatever ``create_connection`` returns for ``db/sentences.db``.
    """
    return create_connection("db/sentences.db")

In [5]:
# Connect to DB
def create_connection(db_file):
    """Open a SQLite connection to ``db_file``.

    Args:
        db_file: Path of the SQLite database file to open.

    Returns:
        The ``sqlite3.Connection`` on success, or ``None`` when
        ``sqlite3.connect`` raises a ``sqlite3.Error`` (the error is printed).
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The exception class must be reached through the sqlite3 module: a
        # bare ``Error`` name is not in scope and would raise NameError
        # instead of handling the failed connection.
        print(e)

    return None

In [6]:
def cleanup_token(tkn):
    """Trim a raw token and unwrap it when it is fully parenthesised.

    Surrounding whitespace is stripped first, then surrounding commas, then
    surrounding periods (in that order). If the remainder is at least two
    characters long, starts with '(' and ends with ')', all leading '(' and
    all trailing ')' characters are removed as well.
    """
    trimmed = tkn.strip()
    trimmed = trimmed.strip(',')
    trimmed = trimmed.strip('.')

    is_wrapped = (
        len(trimmed) >= 2
        and trimmed.startswith('(')
        and trimmed.endswith(')')
    )
    if not is_wrapped:
        return trimmed
    return trimmed.rstrip(')').lstrip('(')

In [7]:
# Get sentences
def get_sentences_from_db(conn):
    """Load the sentences flagged ``haspolys = 1`` and group them by docid.

    Args:
        conn: Open DB-API connection to a database that has a ``sentences``
            table with ``sentence``, ``docid`` and ``haspolys`` columns.

    Returns:
        dict mapping each docid to the list of its sentences, in the order
        the rows were returned by the query.
    """
    dictionary = {}
    cur = conn.cursor()
    cur.execute("select sentence, docid from sentences where haspolys = 1")
    rows = cur.fetchall()

    for sentence, docid in rows:
        # Combine sentences with identical docids. The sentence is appended
        # to the list owned by *its* docid, so rows of different documents
        # may arrive interleaved without being attributed to the wrong one.
        dictionary.setdefault(docid, []).append(sentence)

    return dictionary

In [8]:
# Connects to the SQLite database at db/sentences.db
connection = connect_to_db()

In [9]:
# Gets the sentences flagged haspolys = 1 from the database, grouped by docid
# (dict: docid -> list of sentences); no files are written here
sentences = get_sentences_from_db(connection)

In [10]:
# Collects the abbreviation-polymer pairs of every document, keyed by docid
polymer_abbrs_dictionary = {}

# get_abbreviations receives all sentences of one docid; the (abbreviation,
# polymer) items of the mapping it returns are stored under that docid
for docid, doc_sentences in sentences.items():
    polymer_abbrs_dictionary[docid] = get_abbreviations(doc_sentences).items()

In [11]:
# Merge the per-docid pairs into one dictionary; when the same key occurs
# for several docids, the value merged last wins

dictionary_values = polymer_abbrs_dictionary.values()
dictionary3 = {}
for doc_pairs in dictionary_values:
    dictionary3.update(dict(doc_pairs))

In [12]:
# Invert the abbreviation -> polymer mapping into
# polymer: [every acronym found for that polymer]

acronyms = {}
polymers = list()
for abbrev, polymer in dictionary3.items():
    if polymer not in acronyms:
        # Record each distinct polymer once, in first-seen order
        polymers.append(polymer)
    # Append to the list owned by *this* polymer, so an acronym is never
    # attached to whichever polymer happened to be seen most recently
    acronyms.setdefault(polymer, []).append(abbrev)

In [13]:
#read file of polymer candidates/create new output file

# The input file is only needed to load its lines, so it is closed as soon as
# they are read; `f` stays bound to the (closed) file object, which makes a
# later f.close() a harmless no-op.
with open('input/polymer_candidates.txt','r') as f:
    F = f.readlines()
# f2 is deliberately left open: the matches are written to it afterwards and
# it is closed once writing is done.
f2 = open('output/amber_polymer_candidates.txt','w+')

In [14]:
#format candidates: drop the trailing newline, unwrap a [u'...'] list repr
#when present, and decode the bytes as UTF-8

candidates_list = list(F)

formatted_strings = []
for raw_line in candidates_list:
    candidate_text = raw_line.rstrip('\n')
    # The closing "']" is only removed when the opening "[u'" is present
    if "[u'" in candidate_text:
        candidate_text = candidate_text.replace("[u'","").replace("']","")
    formatted_strings.append(candidate_text.decode('utf8'))

In [15]:
#find and record matching pairs: each candidate is written on its own line,
#followed by one line per match found in `acronyms`

for candidate in formatted_strings:
    f2.write('%s\n' % candidate)
    for polymer_name, polymer_acronyms in acronyms.items():
        # Candidate is one of the acronyms: record the polymer it stands for
        if candidate in polymer_acronyms:
            f2.write('%s\n' % polymer_name)
        # Candidate occurs inside the polymer key: record that polymer's acronyms
        # NOTE(review): the acronyms are joined without any separator --
        # confirm this is the intended output format
        if candidate in polymer_name:
            f2.write('%s\n' % ''.join(polymer_acronyms))

In [16]:
#close the candidates input file and the matches output file

for file_handle in (f, f2):
    file_handle.close()