In [72]:
import csv
import gensim
import glob
import logging
import os
import re

import numpy as np
import pandas as pd
import scipy as sp

import ZODB, ZODB.FileStorage

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob, Word

%matplotlib inline


In [None]:
# Current design is a simple 2 DB design - will not scale
# for full multi doc indexing,
# would need to breakdown datastores to faciliate concurrency, 
# and different document types and meta data needs.

In [None]:
# Ngram Datastore (Core index)
storageNgram = ZODB.FileStorage.FileStorage('data/ngram.fs')
dbNgram = ZODB.DB(storageNgram)
connectionNgram = dbNgram.open()
rootNgram = connectionNgram.root

In [None]:
# Document MetaData Datastore (Target datastore)
storageDocMeta = ZODB.FileStorage.FileStorage('data/docmeta.fs')
dbDocMeta = ZODB.DB(storageDocMeta)
connectionDocMeta = dbDocMeta.open()
rootDocMeta = connectionDocMeta.root

In [73]:
corpusDir = './corpus' # where to scan files (currently only processing text)
ngramWidth = 3         # Width to produce ngrams -> wider = more data


In [74]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [75]:
# Normalize Text Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace html breaks with newline
    norm_text = re.sub(r'<br *\/*>', '\n', norm_text)
    # Replace non-AlphaNumeric|Newline with Space
    norm_text = re.sub(r'[^\w\n]+', ' ', norm_text)
    return norm_text




In [None]:
dictList = {}  # Key=ngram, data=[fileids]
filesDict = {} # Key=FileID, data={fileDict}
fileDict = {}  # file=fullpath, <lineid>=[ngrams]

path = os.path.join(corpusDir, '*.txt')
files = glob.glob(path)
# print(files)

#Generate Vectorization of ngrams and strip stop words 
vectorizer = CountVectorizer(ngram_range=(1, ngramWidth), stop_words='english')
ngramAnalyzer = bigram_vectorizer.build_analyzer()

# for each file, get a UID and parse
for fileID, fileName in enumerate(files):
    # Build a individual File Breakdown dictionary
    fileDict = {}
    fileDict['_file'] = fileName
    print fileID, fileName
    with open( fileName, mode = 'rU' ) as currFile:
        # for each line get a UID and parse line
        for lineID, line in enumerate(currFile):
            #print lineID, normalize_text(line)

            # store the lines vectorization for later analysis
            lineNgrams = ngramAnalyzer( normalize_text(line) )
            #print lineID, lineNgrams
                        
            # For each word/ngram add to master dictionary with FileID & In FileDict
            for item in lineNgrams:
                # First Record Ngram is in File, then record which lines have the Ngram
                
                # initialize item if not already in the master dictionory
                if item not in dictList:
                  dictList[item] = [fileID]
                elif fileID not in dictList[item]:
                    # if File isn't recorded as a viable match, then add to list
                    dictList[item].append(fileID)

                # initialize item if not already in fileDict
                if item not in fileDict:
                  fileDict[item] = [lineID]
                elif lineID not in fileDict[item]:
                    # if line isn't recorded as a viable match, then add to list
                    fileDict[item].append(lineID)
                

    # store file's analysis in master file list
    filesDict[fileID] = fileDict
print dictList
print '======'
print filesDict
# model = gensim.models.Word2Vec(sentences)


0 ./corpus/Missing.txt
1 ./corpus/contents.txt
2 ./corpus/Kindle-Book-List-20130830 - Unknown.txt
3 ./corpus/rq3.txt
4 ./corpus/CASTLE.txt
5 ./corpus/i11.txt
6 ./corpus/dd_half.txt
7 ./corpus/dd_elven.txt
8 ./corpus/text8.txt
9 ./corpus/LICENSE-README.txt
10 ./corpus/appendix.txt
11 ./corpus/dd_gnome.txt
12 ./corpus/dd_drow.txt
13 ./corpus/dd_dwarf.txt
