In [180]:
import string
from collections import defaultdict

In [181]:
# Toy corpus of short product-review snippets. A document's id is its
# index in this list (the indexing functions look documents up by position).
docs = [
    'The battery life of this ipod is terrible. I hate the battery. I feel that there are better products out there.',
    'I loved the screen resolution of the ipod. Very bright.',
    'Long lasting battery that lasted lasted me 7 months. I play with it everyday and listen to music on the ipod.',
    'Lots of games on the ipod. Very lasted enjoyable.',
]

In [182]:
def create_term_to_pos_for_doc(doc_id, corpus=None):
    """Map every term of one document to the positions where it occurs.

    The document is split on single spaces; each token is stripped of ASCII
    punctuation and lower-cased. A term's positions are the 0-based indices
    of its tokens in that split.

    Args:
        doc_id: Index of the document to tokenize.
        corpus: Optional sequence of document strings. When omitted, the
            module-level ``docs`` list is used.

    Returns:
        dict mapping each term to the list of its positions, in increasing
        order.

    Raises:
        IndexError: if ``doc_id`` is out of range for the corpus.
    """
    if corpus is None:
        corpus = docs
    term_to_pos = {}
    for idx, token in enumerate(corpus[doc_id].split(' ')):
        # Filter characters instead of calling str.translate(None, ...):
        # the two-argument form only exists on Python 2 byte strings and
        # raises TypeError on Python 3.
        term = ''.join(ch for ch in token if ch not in string.punctuation).lower()
        term_to_pos.setdefault(term, []).append(idx)
    return term_to_pos

In [183]:
def create_forward_index():
    """Build the forward index for the whole corpus.

    Returns:
        dict mapping each document id (its index in ``docs``) to that
        document's term -> positions mapping, as produced by
        ``create_term_to_pos_for_doc``.
    """
    return {
        doc_id: create_term_to_pos_for_doc(doc_id)
        for doc_id in range(len(docs))
    }

In [184]:
def create_inverted_index(fwd_index=None):
    """Build a positional inverted index: term -> {doc_id: [positions]}.

    Args:
        fwd_index: Optional forward index of the form
            ``{doc_id: {term: [positions]}}``. When omitted, it is built
            with ``create_forward_index()``.

    Returns:
        A ``defaultdict(dict)`` mapping each term to a dict of the documents
        containing it and the positions at which it occurs there. Because it
        is a defaultdict, subscripting with an unknown term yields (and
        stores) an empty dict.
    """
    if fwd_index is None:
        fwd_index = create_forward_index()
    inverted_index = defaultdict(dict)
    for doc_id, terms_to_pos in fwd_index.items():
        for term, pos in terms_to_pos.items():
            # Each (term, doc_id) pair appears exactly once in a forward
            # index, so a plain assignment suffices; the defaultdict creates
            # the per-term dict on first use. Copy the list so the inverted
            # index never shares position lists with the forward index.
            inverted_index[term][doc_id] = list(pos)
    return inverted_index

In [185]:
def _get_docs_simple_query(search_query):
    inv_idx = create_inverted_index()
    query_terms = [
        q.translate(None, string.punctuation).lower()
        for q in search_query.split(' ')
    ]
    retrieved_docs = []
    for q_term in query_terms:
        retrieved_docs.extend(
            inv_idx[q_term].keys()
        )
    return list(set(retrieved_docs))

def _get_docs_phrase_query(search_query):
    inv_idx = create_inverted_index()
    query_terms = [
        q.translate(None, string.punctuation).lower()
        for q in search_query.split(' ')
    ]
    
    docs_to_pos = defaultdict(list)
    for q_term in query_terms:
        docs_to_pos_q_term = inv_idx[q_term]
        for doc_id in docs_to_pos_q_term.keys():
            if doc_id in docs_to_pos.keys():
                docs_to_pos[doc_id].append(docs_to_pos_q_term[doc_id])
            else:
                docs_to_pos[doc_id] = [docs_to_pos_q_term[doc_id]]

    for doc_id in docs_to_pos.keys():
        pos = docs_to_pos[doc_id]
        for idx, pos_list in enumerate(pos):
            for i in range(len(pos_list)):
                pos_list[i] -= idx
    
    retrieved_docs = []
    
    for doc_id in docs_to_pos.keys():
        pos_list = docs_to_pos[doc_id]
        if len(pos_list) >= 2:
            intersection = set(pos_list[0])
            for p in pos_list[1:]:
                intersection = intersection & set(p)
            if len(intersection) >= 1:
                retrieved_docs.append(doc_id)
                
    return retrieved_docs

def get_docs(search_query, search_type='simple_query'):
    """Run a search over the corpus and return the matching document ids.

    Args:
        search_query: Query string, split on single spaces into terms.
        search_type: ``'simple_query'`` selects the simple (any-term)
            search. Any other value selects the phrase search, except that
            a query of fewer than two space-separated terms always uses
            the simple search.

    Returns:
        List of matching document ids.
    """
    wants_phrase = search_type != 'simple_query'
    if wants_phrase and len(search_query.split(' ')) >= 2:
        return _get_docs_phrase_query(search_query)
    return _get_docs_simple_query(search_query)

In [186]:
# Simple query: matches documents containing at least one of the query terms.
get_docs('the ipod', search_type='simple_query')

[0, 1, 2, 3]

In [188]:
# Phrase query: a multi-word query with a non-simple search_type is routed
# to the phrase search.
get_docs('lasted enjoyable', search_type='phrase_query')

[3]