# Pre-processing the data before loading it into the database

In [1]:
import pandas as pd

# Read both JSON input files into DataFrames (profiles first, then the index).
df_profile, df_index = (
    pd.read_json(json_path)
    for json_path in ('company_profiles.json', 'company_index.json')
)

## Removing the duplicate company names

In [2]:
# Drop rows with a repeated company_name from both DataFrames, in place
# (pandas keeps the first occurrence by default).
for _frame in (df_index, df_profile):
    _frame.drop_duplicates(subset=['company_name'], inplace=True)

## Renaming the company_name column
MongoDB identifies every document by a unique `_id` field (it generates one automatically when none is supplied). <br>
So `company_name` is renamed to `_id`, making the de-duplicated company name the key of each document. <br>
The renamed data is then written to 2 new JSON files, `company_profiles_db.json` & `company_index_db.json`, which will become 2 collections in MongoDB.  

In [3]:
# Rename company_name to _id in both DataFrames, in place.
_id_mapping = {"company_name": "_id"}
df_index.rename(columns=_id_mapping, inplace=True)
df_profile.rename(columns=_id_mapping, inplace=True)

In [4]:
# Write each DataFrame to its own JSON file as a list of row records.
for _frame, _target in (
    (df_profile, 'company_profiles_db.json'),
    (df_index, 'company_index_db.json'),
):
    _frame.to_json(_target, orient='records')

# Loading the data into MongoDB

In [6]:
from pymongo import MongoClient
import json


def init_connection(uri='mongodb://localhost:27017/'):
    """Create a MongoDB client.

    Args:
        uri: MongoDB connection string. Defaults to the local server on
            port 27017, so existing zero-argument calls behave as before.

    Returns:
        A new ``MongoClient`` for ``uri``. The caller owns the client and
        should close it when done.
    """
    return MongoClient(uri)

def insert_json_to_mongodb_collection(database, collection, json_data):
    """Insert a batch of documents into a MongoDB collection.

    Args:
        database: Name of the target database.
        collection: Name of the target collection inside ``database``.
        json_data: Documents to insert, passed straight to ``insert_many``.

    Returns:
        The result object returned by ``insert_many``.
    """
    client = init_connection()
    try:
        # Use a separate name so the ``collection`` argument is not rebound.
        target = client[database][collection]
        return target.insert_many(json_data)
    finally:
        # Each call opens its own client; close it so connections are not
        # leaked, even when the insert raises.
        client.close()

    
def main():
    """Load both prepared JSON files into the ``company_database`` database.

    Each file is read with ``json.load`` and inserted into its own
    collection; the value returned by the insert helper is printed.
    """
    sources = (
        ('company_index_db.json', 'company_index'),
        ('company_profiles_db.json', 'company_profile'),
    )
    for json_path, collection_name in sources:
        with open(json_path, 'r') as file:
            data = json.load(file)
            report = insert_json_to_mongodb_collection(
                database='company_database',
                collection=collection_name,
                json_data=data,
            )
            print(report)
        
# Execute the load when this cell runs.
main()

<pymongo.results.InsertManyResult object at 0x7f143846be00>
<pymongo.results.InsertManyResult object at 0x7f14384eb980>


# MongoDB database running locally
<p display="flex">
  <img src="db1.png" width=40% />
  <img src="db2.png" width=40% /> 
</p>

