In [1]:
import re
import pandas as pd
import csv

class Appearance:
    """
    Represents the appearance of a term in a given document, along with the
    frequency of appearances in the same one.
    """
    def __init__(self, docId, frequency):
        self.docId = docId
        self.frequency = frequency
        
    def __repr__(self):
        """
        String representation of the Appearance object
        """
        return str(self.__dict__)

In [2]:
class Database:
    """
    In memory database representing the already indexed documents.
    """
    def __init__(self):
        self.db = dict()

    def __repr__(self):
        """
        String representation of the Database object
        """
        return str(self.__dict__)
    
    def get(self, id):
        return self.db.get(id, None)
    
    def add(self, document):
        """
        Adds a document to the DB.
        """
        return self.db.update({document['id']: document})

    def remove(self, document):
        """
        Removes document from DB.
        """
        return self.db.pop(document['id'], None)

In [3]:
class InvertedIndex:
    """
    Inverted Index class.
    """
    def __init__(self, db):
        self.index = dict()
        self.db = db
    
    def returnIndex(self):
        return self.index
    
    def exportDictToCSV(self):
        with open('mycsvfile.csv','wb') as f:
            w = csv.writer(f)
            w.writerows(self.index.items())

    def __repr__(self):
        """
        String representation of the Database object
        """
        return str(self.index)
        
    def index_document(self, document):
        """
        Process a given document, save it to the DB and update the index.
        """
        
        # Remove punctuation from the text.
        clean_text = re.sub(r'[^\w\s]','', document['text'])
        terms = clean_text.split(' ')
        appearances_dict = dict()
        # Dictionary with each term and the frequency it appears in the text.
        for term in terms:
            term_frequency = appearances_dict[term].frequency if term in appearances_dict else 0
            appearances_dict[term] = Appearance(document['id'], term_frequency + 1)
            
        # Update the inverted index
        update_dict = { key: [appearance]
                       if key not in self.index
                       else self.index[key] + [appearance]
                       for (key, appearance) in appearances_dict.items() }
        self.index.update(update_dict)
        # Add the document into the database
        self.db.add(document)
        return document
    
    def lookup_query(self, query):
        """
        Returns the dictionary of terms with their correspondent Appearances. 
        This is a very naive search since it will just split the terms and show
        the documents where they appear.
        """
        return { term: self.index[term] for term in query.split(' ') if term in self.index }

In [15]:
import pandas as pd
data = pd.read_csv('ProcessedTweetsClusters.csv',encoding='UTF-8');
#data.columns = ['id','content']
#data = data[:]

In [16]:
def highlight_term(id, term, text):
    replaced_text = text.replace(term, "\033[1;32;40m {term} \033[0;0m".format(term=term))
    return "--- document {id}: {replaced}".format(id=id, replaced=replaced_text)

db = Database()
index = InvertedIndex(db)

for i in range(len(data)):
    text = data['content'][i]
    if(pd.isnull(text)):
        continue
    document = {
        'id': data['index'][i],
        'text': data['content'][i]
    }
    index.index_document(document)


In [17]:
import json
search_term = "colombia"
result = index.lookup_query(search_term)
print(result)
dic = index.returnIndex().__repr__
print(len(index.returnIndex().keys()))
with open('index.json', 'w') as json_file:  
    json.dump(dic(), json_file)
for term in result.keys():
    for appearance in result[term]:
        # Belgium: { docId: 1, frequency: 1}
        document = db.get(appearance.docId)
        print(highlight_term(appearance.docId, term, document['text']))
        print(data.iloc[appearance.docId]['id_str'])
        print()
    print("-----------------------------")    

{'colombia': [{'docId': 5064, 'frequency': 1}, {'docId': 14986, 'frequency': 1}, {'docId': 21230, 'frequency': 1}]}
34116
--- document 5064: great honor welcome president manuel santos [1;32;40m colombia [0;0m white house today!????????joint press 
8.61e+17

--- document 14986: stickmissosolog realdonaldtrump universe [1;32;40m colombia [0;0m 
5.57e+17

--- document 21230: believe model mysteriously disappear obamacare website citizen [1;32;40m colombia [0;0m 
3.98e+17

-----------------------------


In [56]:
dicti = index.returnIndex()
for i in dicti.keys():
    print(i)

great
republican
senator
ineffective
border
security
without
strong
country
voter
board
smart
democrat
crime

former
harry
throw
working
fail
career
deception
replace
another
beauty
cryin
chuck
schumer
things
never
change
meeting
breakfast
nation
governor
vietnam
important
summit
complete
denuclearization
north
korea
rapidly
become
economic
powerhouse
chairman
decision
realdonaldtrump
funny
watch
people
years
nothing
telling
negotiate
ivankatrump
admin
commit
ensure
american
skill
need
secure
better
economy
price
getting
please
relax
world
fragile
spike
could
note
racist
president
african
criminal
justice
reform
lowest
unemployment
numbers
history
almost
highly
respect
richard
senate
intelligence
interview
witness
state
emergency
southern
patrol
military
local
enforcement
major
construction
drug
gang
human
traffic
stop
report
substantial
progress
trade
talks
china
structural
issue
include
intellectual
property
protection
technology
transfer
agriculture
services
currency
result


In [47]:
from pymongo import MongoClient
client = MongoClient()
client = MongoClient('mongodb://admin:admin234@ds161074.mlab.com:61074/searchengine')
db = client['searchengine']
collection = db['index']

In [53]:
document = {
        'word': '2',
        'value': {'id':2,'fre':3}
    }
collection.insert_one(document)

<pymongo.results.InsertOneResult at 0x11bb56188>

In [55]:
lista = []
for i in index.returnIndex().keys():
    array = index.returnIndex()[i]
    document = {
        'word': i,
        'value': array
    }
    print(document)
    collection.insert_one(document)
    lista.append(document)
print(lista)
#collection.insert_many(lista)

{'word': 'great', 'value': [{'docId': 0, 'frequency': 1}, {'docId': 8, 'frequency': 1}]}


InvalidDocument: Cannot encode object: {'docId': 0, 'frequency': 1}