In [None]:
#preprocessing and chunking

import re

def load_stop_words(path='stopwords-en.txt'):
    """Load the stop-word list used to mark phrase boundaries.

    Args:
        path: text file containing one stop word per line. Defaults to
            'stopwords-en.txt' in the current working directory.

    Returns:
        set of str: every line of the file with surrounding whitespace
        removed.
    """
    # A context manager closes the handle deterministically; the previous
    # bare open() left it to the garbage collector.
    with open(path) as stop_word_file:
        return set(line.strip() for line in stop_word_file)

def remove_special_characters(text):
    """Remove characters that are not indicators of phrase boundaries.

    Deletes every brace, at sign, double quote, dollar sign, percent sign,
    ampersand, backslash, forward slash, asterisk, single quote and digit.
    Punctuation that later acts as a phrase boundary (such as '.', ',',
    ';', '!', ':', '-' and parentheses) is left untouched.

    Args:
        text: the string to clean.

    Returns:
        str: ``text`` with the characters listed above removed.
    """
    # Raw string: the old non-raw literal relied on the invalid escape
    # sequences "\/" and "\d", which newer Python versions warn about.
    return re.sub(r"""[{}@"$%&\\/*'\d]""", "", text)
    

def generate_candidate_phrases(text, splitter=None, stop_words=None):
    """Generate candidate phrases using phrase boundary markers.

    The text is lower-cased and cut at three kinds of boundary:
    parentheses, the punctuation matched by ``splitter``, and stop words.
    Every run of consecutive kept words between two boundaries becomes one
    phrase. Words shorter than three characters that are not stop words are
    dropped without ending the phrase.

    Args:
        text: the text to split into phrases.
        splitter: compiled regular expression whose ``split`` cuts the text
            at punctuation. Defaults to the module-level ``char_splitter``.
        stop_words: container of stop words (anything supporting ``in``).
            Defaults to the module-level ``stopwords``.

    Returns:
        list of str: the phrases in order of appearance, words joined by a
        single space, with no surrounding whitespace and no empty entries.
    """
    # Fall back to the notebook-level globals so existing one-argument calls
    # keep working; passing them explicitly removes the dependency on cell
    # execution order.
    if splitter is None:
        splitter = char_splitter
    if stop_words is None:
        stop_words = stopwords

    phrases = []
    # Parentheses are hard boundaries. They used to be rewritten to " to ",
    # which only acted as a boundary when "to" was in the stop-word list.
    for segment in re.split("[()]", text.lower()):
        for coarse_phrase in splitter.split(segment):
            current = []
            for w in re.split("\\s+", coarse_phrase):
                if w in stop_words:
                    # A stop word closes the phrase collected so far.
                    if current:
                        phrases.append(' '.join(current))
                        current = []
                elif len(w) >= 3:
                    current.append(w)
            # Close the phrase at the end of every punctuation-delimited
            # segment; previously the words on either side of the
            # punctuation were merged into a single phrase.
            if current:
                phrases.append(' '.join(current))
    return phrases


In [None]:
# Strip special characters and digits from the raw text and store the result
# in 'preprocessed.txt'.
# NOTE(review): `abstracts` is not defined in the visible cells - confirm
# where it is loaded. The file is opened in append mode, so each run of this
# cell adds another copy of the cleaned text.
with open('preprocessed.txt','a') as preprocessed_file:
    preprocessed_file.write(remove_special_characters(abstracts))

In [None]:
# Turn every preprocessed line into candidate phrases and keep the
# multi-word ones in 'candidates.txt', one phrase per line.
char_splitter = re.compile("[.,;!:-]")  # punctuation treated as a phrase boundary
stopwords = load_stop_words()
with open('candidates.txt','w') as outfile, open('preprocessed.txt') as infile:
    for line in infile:
        for candidate in generate_candidate_phrases(line):
            # Single-word candidates are not phrases; skip them.
            if len(candidate.split()) <= 1:
                continue
            outfile.write(candidate)
            outfile.write('\n')

In [None]:
#count the number of occurrences for each phrase
# clean_phrases maps every candidate phrase to how many times it appears in
# 'candidates.txt'. 'unique.txt' receives each distinct phrase exactly once,
# in first-seen order.
clean_phrases={}
with open('candidates.txt','r') as infile:
    with open('unique.txt','w') as outfile:
        for line in infile:
            sline = line.strip()
            if sline not in clean_phrases:
                # First sighting: record the phrase in the de-duplicated file.
                clean_phrases[sline] = 0
                outfile.write(sline)
                outfile.write('\n')
            # Count every occurrence. The increment used to sit inside the
            # branch above, which left every count stuck at 1.
            clean_phrases[sline] += 1

#keep only the phrases that occur at least 100 times
# Each surviving phrase is written to 'top_phrases.txt' as "phrase<>count".
with open('unique.txt','r') as myfile, open('top_phrases.txt','w') as outfile:
    for line in myfile:
        sline = line.strip()
        occurrences = clean_phrases[sline]
        if occurrences < 100:
            continue
        outfile.write(sline + "<>" + str(occurrences) + '\n')
                
#frequency for the filtered phrases
# Read the "phrase<>count" records produced by the filtering step. The loop
# used to read 'unique.txt', which holds bare phrases without a "<>count"
# field, so indexing the second field failed on the first line.
phrase_frequency= {}
with open('top_phrases.txt','r') as infile:
    for line in infile:
        record = line.strip()
        if not record:
            continue
        # rpartition keeps a phrase intact even if it contains "<>" itself;
        # the count is always the last field.
        phrase, separator, count = record.rpartition('<>')
        if not separator:
            raise ValueError("expected 'phrase<>count', got: %r" % record)
        # Store integers throughout so repeated phrases add up and the
        # values carry no trailing newline.
        phrase_frequency[phrase] = phrase_frequency.get(phrase, 0) + int(count)

#write the phrases and their frequency to file
# One "phrase<>frequency" record per line. dict.items() is used instead of
# the Python-2-only iteritems() so the cell runs on Python 2 and Python 3.
with open('all_phrases.txt','w') as outfile:
    for phrase, frequency in phrase_frequency.items():
        outfile.write(phrase + '<>' + str(frequency))
        outfile.write('\n')

In [None]:
#count word frequencies for words in our phrases
from collections import Counter
import numpy as np

counter=Counter()

# Vocabulary: every word that occurs in a selected phrase. Each record in
# 'all_phrases.txt' is "phrase<>count"; the count must be cut off before the
# phrase is split into words. Splitting the raw file on whitespace fused the
# last word with the count (e.g. "learning<>150"), so the final word of every
# phrase was never counted.
lin = set()
with open('all_phrases.txt') as f1:
    for record in f1:
        record = record.strip()
        phrase, separator, _ = record.rpartition('<>')
        if not separator:
            # No count field: treat the whole line as the phrase.
            phrase = record
        lin.update(phrase.split())

# Count how often each vocabulary word appears in the preprocessed corpus.
# NOTE(review): tokens are whitespace-delimited, so a word with attached
# punctuation (e.g. "fox,") is not counted as "fox".
with open('preprocessed.txt') as f2:
    for line in f2:
        counter.update(word for word in line.lower().split() if word in lin)
            

#calculate phrase importance using information frequency
N=1886096272 # total number of words
info_freq={}

# info_freq maps each phrase to log(value / multiple * N) * log(value), where
# value is the phrase frequency and multiple is the product of the relative
# frequencies (count / N) of the words in the phrase.
# NOTE(review): the loop used to iterate over `phrases`, which is not defined
# before this cell and is a list where it is defined later; the
# phrase -> frequency mapping built above is used instead - confirm.
# NOTE(review): `value / multiple * N` evaluates as (value / multiple) * N.
# If the intent is the ratio of the phrase probability (value / N) to the
# product of the word probabilities, it should be value / (multiple * N).
for key, value in sorted(phrase_frequency.items(), key=lambda kv: (int(kv[1]), kv[0])):
    value = int(value)

    multiple = 1.0
    for word in key.split():
        # Counter returns 0 for unseen words, so no KeyError can occur here.
        multiple *= counter[word] / float(N)

    # A phrase containing a word that was never counted (multiple == 0) or a
    # non-positive frequency has no defined score. These cases used to be
    # hidden by a bare `except: pass`; skip them explicitly instead.
    if multiple == 0 or value <= 0:
        continue

    # The first factor is rounded to two decimals, as before.
    val = float("{0:.2f}".format(np.log(value / multiple * N)))
    info_freq[key] = val * np.log(value)
    
    

In [None]:
# tag selected phrases back to the preprocessed file

# Keep the first 18000 entries of all_phrases as the phrases to tag.
# NOTE(review): all_phrases is not defined anywhere in the visible cells -
# presumably a ranked list of phrases; confirm where it is built, otherwise
# this line raises NameError on a fresh kernel.
phrases=all_phrases[:18000]
def tagg_data(original_text, phrases):
    """Join the words of every given phrase with underscores inside the text.

    Each phrase is replaced wherever it occurs as a substring, in the order
    the phrases are given; later replacements see the result of earlier ones.

    Args:
        original_text: the text to tag.
        phrases: iterable of phrase strings.

    Returns:
        str: the text with each phrase occurrence replaced by its
        underscore-joined form.
    """
    tagged_text = original_text
    for phrase in phrases:
        tagged_text = tagged_text.replace(phrase, '_'.join(phrase.split()))
    return tagged_text

def which_phrases(item):
    """Return the entries of the module-level ``phrases`` found in ``item``.

    Matching is a plain substring test, and the result keeps the order of
    ``phrases``.
    """
    matches = []
    for candidate in phrases:
        if candidate in item:
            matches.append(candidate)
    return matches


from itertools import islice
# time.time() is called below, but `time` was only imported in a later cell,
# so this cell failed with NameError on a fresh kernel.
import time

start= time.time()

# Lower-case every preprocessed line, find the selected phrases it contains
# and write the line with those phrases underscore-joined to 'tagged.txt'.
# NOTE(review): 'tagged.txt' is opened in append mode, so re-running this
# cell appends a second copy of the tagged corpus - confirm this is intended.
with open('preprocessed.txt') as infile:
    with open('tagged.txt','a') as outfile:
        for item in infile:
            item = item.lower()
            select_phrases = which_phrases(item)
            # Write the tagged line in one call; it used to be written one
            # character at a time by iterating over the string.
            outfile.write(tagg_data(item, select_phrases))

            

In [None]:
#build a word embedding model using the tagged data

import nltk
from nltk.corpus import stopwords
import gensim
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

import codecs, time, re
import glob, gzip, os
# NOTE(review): stop_words was never defined in the visible cells, so the
# comprehension below raised NameError. The NLTK English list is assumed
# because `stopwords` is imported from nltk.corpus in this cell - confirm.
# It needs the NLTK 'stopwords' corpus to be installed.
stop_words = set(stopwords.words('english'))

# Only purely alphabetic / underscore tokens are kept as training words.
word_pattern = re.compile("^[a-zA-Z_]*$")

# codecs.open only honours `errors` when an encoding is given; without one
# it falls back to the builtin open() and errors='ignore' had no effect.
# NOTE(review): utf-8 is assumed for 'tagged.txt' - confirm.
with codecs.open('tagged.txt','r', encoding='utf-8', errors='ignore') as text_doc:
    # One training "sentence" per '.'-delimited chunk of the tagged corpus.
    texts = [
        [word for word in document.lower().split()
         if word not in stop_words and word_pattern.match(word)]
        for document in text_doc.read().split('.')
    ]

# NOTE(review): `size=` is the keyword used by gensim releases before 4.0 -
# confirm against the installed version.
model = gensim.models.Word2Vec(texts, size=200, window=15, min_count=1, workers=8, negative=10, sample=1e-5)