In [494]:
import ast
import json
import os

import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')
import numpy as np
import pandas as pd
import pymongo
from bson import ObjectId
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pymongo import MongoClient
from sklearn.metrics.pairwise import cosine_similarity

# Database Connection

In [495]:
# The MongoDB URI (which embeds the username and password) is read from the
# MONGODB_URI environment variable so that credentials are never stored in the
# notebook. Any credentials that were previously committed here should be rotated.
CONNECTION_STRING = os.environ.get("MONGODB_URI")
if not CONNECTION_STRING:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )
client = MongoClient(CONNECTION_STRING)
# Database and the two collections used by the rest of the notebook.
db = client['sihdb']
resources = db['resources']
institutes = db['institutes']

In [496]:
# Materialise every document of the `resources` collection and load it into a frame.
resInfo = list(resources.find())
resDF = pd.DataFrame(resInfo)
# Fields of interest: title, description, cost, reputation point
# resDF.head(2)

In [497]:
# Materialise every document of the `institutes` collection and load it into a frame.
insInfo = list(institutes.find())
insDF = pd.DataFrame(insInfo)
# Each row's `address` value is indexed by 'street', 'city' and 'state';
# lift those three entries into their own columns.
street = [address['street'] for address in insDF['address']]
city = [address['city'] for address in insDF['address']]
state = [address['state'] for address in insDF['address']]
insDF['street'] = street
insDF['city'] = city
insDF['state'] = state
# Keep only the institute columns used further down.
insDF = insDF[['_id','instituteName','aisheCode','naac','street','city','state']]
# insDF.tail(5)

# Data Selection

In [498]:
# Attach the lending institute's details to every resource. The join is an
# inner join, so resources whose `lendingInstitute` matches no institute `_id`
# are dropped.
resDF = resDF.merge(
    insDF,
    how="inner",
    left_on="lendingInstitute",
    right_on="_id",
)
# resDF.tail(3)

In [499]:
# Keep only the columns that feed the recommender. The explicit .copy() makes
# resDF an independent frame, so the column assignments in later cells do not
# operate on a slice of the merged frame (SettingWithCopyWarning).
resDF = resDF[['_id_x','title','description','cost','city','street','state','naac']].copy()
# resDF.head(5)

# Data Cleaning

In [500]:
# Drop every row that has a missing value in any column. Reassigning instead of
# using inplace=True avoids mutating a frame that was obtained by slicing.
resDF = resDF.dropna()
# Per-column count of remaining nulls (displayed as the cell output).
resDF.isnull().sum()

_id_x          0
title          0
description    0
cost           0
city           0
street         0
state          0
naac           0
dtype: int64

In [501]:
lematizer = WordNetLemmatizer()
# Built once: the original rebuilt this set from the corpus for every single token.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def getCleanText(text):
    """Tokenise, lemmatise and strip English stop words from ``text``.

    ``text`` is converted with ``str()`` first, so any value is accepted.
    Each token produced by ``word_tokenize`` is lemmatised, and the lemma is
    kept only if it is not in the NLTK English stop-word list. The comparison
    is done on the lemma exactly as produced (no case folding happens here).

    Returns the surviving lemmas joined by single spaces.
    """
    tokens = word_tokenize(str(text))
    lemmas = (lematizer.lemmatize(token) for token in tokens)
    return ' '.join(lemma for lemma in lemmas if lemma not in _ENGLISH_STOPWORDS)

def toLowerCase(text):
    """Return the lower-cased ``str()`` form of ``text``."""
    return str(text).lower()

In [502]:
# Replace every description with its tokenised, lemmatised, stop-word-free form.
resDF['description'] = resDF['description'].map(getCleanText)

# Data Processing

In [503]:
# Start with an empty `tags` column; generateTag() fills it in.
resDF = resDF.assign(tags='')
# resDF.head(2)

In [504]:
def generateTag():
    """Fill ``resDF['tags']`` with the lower-cased, space-joined text columns.

    The tag of a row is ``title description cost city street state naac``,
    separated by single spaces and lower-cased. Every column is converted to
    ``str`` first so that a non-string value (for example a numeric ``cost``)
    does not make the concatenation raise ``TypeError``.

    Works on the module-level ``resDF`` and modifies it in place; returns None.
    """
    tag_columns = ['title', 'description', 'cost', 'city', 'street', 'state', 'naac']
    as_text = resDF[tag_columns].astype(str)
    tags = as_text[tag_columns[0]]
    for column in tag_columns[1:]:
        tags = tags + " " + as_text[column]
    resDF['tags'] = tags.str.lower()
# resDF['tags'].head(2)
generateTag()

# Vectorization

In [505]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser used by vectorize(): at most 5000 features,
# with scikit-learn's built-in English stop-word list applied.
cv = CountVectorizer(stop_words='english', max_features=5000)

In [506]:
def vectorize(narr):
    """Return the pairwise cosine-similarity matrix of the texts in ``narr``.

    ``narr`` must support ``.astype('str')`` (the callers pass a NumPy array).
    The module-level ``cv`` is re-fitted on ``narr`` on every call, and the
    resulting count matrix is converted to a dense array before the
    similarities are computed.
    """
    counts = cv.fit_transform(narr.astype('str'))
    return cosine_similarity(counts.toarray())

In [518]:
def recommend_search(instituteId,title):
    """Rank all resources by similarity to a search title issued by an institute.

    The query text is ``title`` followed by the institute's city, street, state
    and naac value, joined by single spaces and lower-cased (the same layout
    generateTag() uses for the resource tags). The query is appended to the
    ``resDF['tags']`` corpus, the corpus is passed to ``vectorize`` and the
    resources are ordered by their cosine similarity to the query.

    Parameters
    ----------
    instituteId : value accepted by ``ObjectId(...)``
        ``_id`` of the institute issuing the search.
    title : str
        Search text.

    Returns
    -------
    list
        The ``_id_x`` value of every row of ``resDF``, most similar first.
        Rows with equal similarity keep their ``resDF`` order.

    Raises
    ------
    KeyError
        If no institute has this ``_id`` or the document lacks
        ``address``/``naac`` fields.
    """
    institute = institutes.find_one({"_id": ObjectId(instituteId)})
    if institute is None:
        raise KeyError(f"no institute found with _id {instituteId}")
    # Read the fields straight from the document; wrapping the document in a
    # DataFrame and indexing it positionally is fragile and unnecessary.
    address = institute['address']
    query_parts = [title, address['city'], address['street'], address['state'], institute['naac']]
    # Space-separated so each part stays a separate token for the vectoriser.
    query_tags = " ".join(str(part) for part in query_parts).lower()

    # Only the tags column takes part in the comparison; the query goes last.
    # (pd.concat replaces DataFrame.append, which pandas 2.0 removed.)
    corpus = pd.concat([resDF['tags'], pd.Series([query_tags])], ignore_index=True).values
    similarity = vectorize(corpus)

    # Last row = query versus everything; drop its final entry (query versus
    # itself) explicitly rather than assuming it sorts to the front.
    scores = similarity[-1][:-1]
    ranking = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    resource_ids = resDF['_id_x'].tolist()
    return [resource_ids[pos] for pos in ranking]

In [519]:
def getInstitute(instituteId):
    """Return ``resDF`` with one extra, final row that describes an institute.

    The extra row has blank (single-space) ``title``, ``description`` and
    ``cost``, the institute's ``city``, ``street``, ``state`` and ``naac``, and
    a ``tags`` value made of those four location/naac values joined by single
    spaces and lower-cased. Columns of ``resDF`` that the row does not provide
    (``_id_x``) are left missing in that row. ``resDF`` itself is not modified.

    Parameters
    ----------
    instituteId : value accepted by ``ObjectId(...)``
        ``_id`` of the institute.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n index; the institute row is the last one.

    Raises
    ------
    KeyError
        If no institute has this ``_id`` or the document lacks
        ``address``/``naac`` fields.
    """
    institute = institutes.find_one({"_id": ObjectId(instituteId)})
    if institute is None:
        raise KeyError(f"no institute found with _id {instituteId}")
    # Read the fields straight from the document instead of via a DataFrame.
    address = institute['address']
    city, street, state = address['city'], address['street'], address['state']
    naac = institute['naac']
    # Space-separated so each part stays a separate token for the vectoriser.
    tags = " ".join(str(part) for part in (city, street, state, naac)).lower()
    query_row = {
        "title": " ",
        "description": " ",
        "cost": " ",
        "city": city,
        "street": street,
        "state": state,
        "naac": naac,
        "tags": tags,
    }
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    return pd.concat([resDF, pd.DataFrame([query_row])], ignore_index=True)

In [520]:
def dashboard(id):
    """Rank all resources by similarity to an institute's location/naac profile.

    ``getInstitute(id)`` supplies the resource frame with the institute as its
    last row; the ``tags`` column of that frame is passed to ``vectorize`` and
    the resources are ordered by cosine similarity to the institute row.

    Parameters
    ----------
    id : value accepted by ``getInstitute``
        ``_id`` of the institute.

    Returns
    -------
    list
        The ``_id_x`` value of every resource row, most similar first. Rows
        with equal similarity keep their original order.
    """
    newDF = getInstitute(id)
    similarity = vectorize(newDF['tags'].values)
    # Last row = institute versus everything; drop its final entry (institute
    # versus itself) explicitly rather than assuming it sorts to the front,
    # which could skip a real resource and index past the end of the frame.
    scores = similarity[-1][:-1]
    ranking = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    # Every row except the appended institute row is a resource.
    resource_ids = newDF['_id_x'].tolist()[:-1]
    return [resource_ids[pos] for pos in ranking]

In [524]:
# dashboard('62f412426fc0348badda5ae9')

In [523]:
# recommend_search('62f412426fc0348badda5ae9',"Aries Gold G 729 S-BK Analog Watch  - For Men, Boys")