In [1]:
from pymongo import MongoClient
from pprint import pprint
import csv
import glob
import re
import os

# Credentials

In [2]:
pw_file = 'pw.txt'
if os.path.exists(pw_file): 
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to DB

In [3]:
# connect to ec2 mongo client
client = MongoClient('{0}:27017'.format(pub_ip))


In [4]:
# get reference to  resume_db
db = client.resume_db

In [5]:
# authenticate user for database
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Create Collection

In [6]:
# create a collection call 'originals'
if 'originals' not in db.collection_names():
    db.create_collection('originals')

# Get list of search terms that were saved

In [7]:
def search_terms():
    all_files = glob.glob('data/txt/*.txt')
    all_files = [x[9:] for x in all_files]
    terms = [re.search('(.*?)\_[0-9]+\.txt',x).group(1) for x in all_files]
    terms = set(terms)
    return list(terms)

In [8]:
search_terms()

['engineer', 'scientist', 'analytics', 'data', 'big_data', 'data_analysis']

# Get data that will be stored in DB

In [9]:
def get_linkid_list(term):
    # reference csv file based on term
    csv_file = 'data/{0}.csv'.format(term)
    
    # open file and extract links to list
    with open('{0}'.format(csv_file), 'rt') as f:
        return [row[0] for row in csv.reader(f)]

In [10]:
get_linkid_list('engineer')[:5]

['/r/2b5b06cff39ce808?',
 '/r/8fe4de80947b60f2?',
 '/r/9c6eeb04e6963d6b?',
 '/r/acbcc02f2044b655?',
 '/r/6d757143003b8a7b?']

# Upsert Document (Insert/Update)

In [11]:
def upsert_doc(collection, link_id, term, res_txt):
    '''
    collection: mongodb collection data will be upserted into
    link_id: unique id for users resume online
    term: term searched when resume was found
    res_text: text only from resume
    '''
    db[collection].update_one({
        'link_id': link_id,
    }, 
    {
        '$set':{
            'search_term': term,
            'resume_text': res_txt,
        }
    }, upsert=True
    )

    return None

# Get Files Associated to Search Term

In [12]:
def txt_files(term):
    '''
    term: search term that resume text files being with
    returns => list of tuples in form of (resume text, resume text digit)
    '''
    all_files = glob.glob('data/txt/*.txt')
    all_files = [x[9:] for x in all_files]
    
    files = [x for x in all_files if x[:len(term)] == term]
    digs = [re.search(r'\d+',x).group(0) for x in files]

    return list(zip(files, digs))

In [13]:
txt_files('engineer')[:5]

[('engineer_0.txt', '0'),
 ('engineer_1.txt', '1'),
 ('engineer_10.txt', '10'),
 ('engineer_100.txt', '100'),
 ('engineer_101.txt', '101')]

# Mass Upload Function

In [14]:
def insert_doc():
    # loop through search terms used 
    for term in search_terms():
        
        # get linked ids in search term list
        link_ids = get_linkid_list(term)
        
        # loop through list text resumes for given term
        for res_file, res_dig in txt_files(term):
            
            # open resume text file
            with open('data/txt/{0}'.format(res_file)) as infile:
                res_txt = infile.read()

                # upsert resume to database
                upsert_doc('originals', link_ids[int(res_dig)], term, res_txt)

In [15]:
##########################################
##########################################
##########################################
# insert_doc()
##########################################
##########################################
##########################################

# View a Few Entries

In [16]:
col = db['originals'].find().limit(3)

for doc in col:
    pprint(doc)

{'_id': ObjectId('57b217037e4a2ee6bc8639ca'),
 'link_id': '/r/2b5b06cff39ce808?',
 'resume_text': 'Petros Gazazyan North Hollywood, CA Werkervaring DESIGN '
                'ENGINEER, STRUCTURAL TTG Engineer Pasadena, CA december 2015 '
                'tot heden Designed nonstructural equipment anchorage for '
                'major southern California hospitals in accordance with ASCE, '
                'CBC and other local codes set forth by the Office of '
                'Statewide Planning and Development Gained extensive knowledge '
                'and experience in engineering programs for design including '
                'Enercalc, ETABS, and Hilti Profis for the design of remodel '
                'of buildings beams, columns, and foundations Surveyed area of '
                'work to be remodeled and inspected physical work after '
                'remodel to ensure work is done according to design CIVIL '
                'ENGINEERING STUDENT WORKER Los Angeles County De

# Notes Below

In [17]:
break here

SyntaxError: invalid syntax (<ipython-input-17-a8f2c0e30b80>, line 1)

In [None]:
# insert document (WITH prior existence checking, 
# if it did already exist it will be updated with the values under $set)

db['originals'].update_one({
        'link_id': link_ids[0],
    }, 
    {
        '$set':{
            'search_term': search_terms()[-1],
            'resume_text': res_txt,
        }
    }, upsert=True
)

In [None]:
# insert document (without prior existence checking)
res = db['originals'].insert_one(
    {
        'search_term': term,
        'link_id':link_ids[0],
        'resume_text': res_txt
    }
)

res.inserted_id

# Show Documents in Collection 'originals'

In [None]:
col = db['originals'].find()

for doc in col:
    pprint(doc)

# Drop Collection

In [None]:
db['originals'].drop()

In [None]:
for doc in cursor:
    pprint(document)