In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from transformers.data.processors.utils import InputExample
from torch import nn
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import glue_convert_examples_to_features,DistilBertTokenizer,DistilBertForSequenceClassification
from scipy.stats import ks_2samp
import pickle
import warnings
warnings.filterwarnings("ignore")


# Load the multilingual DistilBERT tokenizer and a sequence-classification
# model configured with 200 output labels.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", num_labels=200)

# Register two extra child modules on the classifier head before its weights
# are loaded below — presumably so the head's state_dict keys match the saved
# checkpoint; confirm against how 'Yelp Classifier.pt' was produced.
# NOTE(review): nn.Softmax(5) sets dim=5, and whether these added modules are
# ever invoked during the forward pass is not visible from this cell — verify.
model.classifier.add_module('activation', nn.Linear(768, 200))
model.classifier.add_module('prediction', nn.Softmax(5))

# Restore the classifier and pre-classifier weights from local checkpoint
# files, mapping all tensors onto the CPU.
model.classifier.load_state_dict(torch.load('Yelp Classifier.pt',map_location=torch.device('cpu')))
model.pre_classifier.load_state_dict(torch.load('Yelp Pre_Classifier.pt',map_location=torch.device('cpu')))

# Switch the model to evaluation mode before it is used for inference.
model.eval()
# print(model)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [2]:
# Reference sample that score() compares each listing's review scores against.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a metric_dist.pkl that comes from a trusted source.
with open(b'metric_dist.pkl','rb') as pickle_file:
    new_dataset = pickle.load(pickle_file)

In [3]:
def get_features(df, text_col, label_col, max_length=300):
    """Tokenise the texts in ``df`` and pack them into a ``TensorDataset``.

    Args:
        df: DataFrame with one example per row.
        text_col: name of the column holding the raw text.
        label_col: name of the column holding the numeric label.
        max_length: forwarded as ``max_length`` to
            ``glue_convert_examples_to_features``. Defaults to 300, the value
            that used to be hard-coded here.

    Returns:
        A ``TensorDataset`` of ``(input_ids, attention_mask, labels)`` long
        tensors, where each stored label is ``int(label - 1)``.
    """
    # Walk the two columns directly instead of calling df.loc[idx, col] inside
    # iterrows(): the .loc lookup returns a Series rather than a scalar when
    # the index contains duplicates, and it repeats an index lookup per row.
    examples = [
        InputExample(guid=idx, text_a=text, label=label)
        for idx, text, label in zip(df.index, df[text_col], df[label_col])
    ]
    features = glue_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_length=max_length,
        label_list=df[label_col].values,
        output_mode='regression',
    )

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    # Labels are shifted down by one before being stored as integers.
    all_labels = torch.tensor([int(f.label - 1) for f in features], dtype=torch.long)
    return TensorDataset(all_input_ids, all_attention_mask, all_labels)

def score(curr_dataset, ref_dataset):
    """Return 1e5 divided by the two-sample Kolmogorov-Smirnov statistic.

    The statistic is computed by ``ks_2samp(ref_dataset, curr_dataset)``, so a
    smaller distance between the two samples yields a larger score.
    """
    ks_result = ks_2samp(ref_dataset, curr_dataset)
    distance = ks_result.statistic
    return 1e5 / distance

In [4]:
from dotenv import load_dotenv,find_dotenv
import os
import pprint
from pymongo import MongoClient
from bson.objectid import ObjectId
import time

# Pull settings from the nearest .env file into the process environment.
load_dotenv(find_dotenv())

# NOTE(review): "PWD" is also the variable most Unix shells use for the current
# working directory, and load_dotenv() presumably leaves already-set variables
# untouched unless override=True is passed — confirm that the value read here
# is really the database password on the deployment machine.
PWD = os.environ.get("PWD")
if not PWD:
    # Without this guard the literal text "None" would be embedded in the URI
    # as the password and the failure would only surface later as an
    # authentication error.
    raise RuntimeError("Environment variable PWD (MongoDB password) is not set")

# NOTE(review): credentials embedded in a MongoDB URI are expected to be
# percent-encoded; the value is used as-is here in case it is already stored
# encoded in .env — verify.
connection_string = f"mongodb+srv://hirezen:{PWD}@cluster0.ktnki.mongodb.net/?retryWrites=true&w=majority"

# State read by the polling loop: time of the last (re)connect and a flag that
# forces a connect on the first pass.
counter = time.time()
start = True

In [5]:
# Seconds between reconnects to MongoDB.
RECONNECT_INTERVAL_SECONDS = 10 * 60
# Pause between polling passes so the loop does not query MongoDB back to back.
POLL_INTERVAL_SECONDS = 5

client = None
while True:
    # (Re)connect on the first pass and then every RECONNECT_INTERVAL_SECONDS.
    if start or time.time() - counter >= RECONNECT_INTERVAL_SECONDS:
        counter = time.time()
        start = False
        # Release the previous client's resources before replacing it.
        if client is not None:
            client.close()
        try:
            print("Connecting to the MongoDB server...")
            client = MongoClient(connection_string)

            print(client.list_database_names())
            hirezen = client.hirezen
            print(hirezen.list_collection_names())
            appliedservices = hirezen.appliedservices
            jobs = hirezen.jobs2
            printer = pprint.PrettyPrinter()
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C still interrupts,
            # and re-raise so the real cause is visible instead of exiting
            # with a success status.
            print("Network Error detected")
            raise

    # ---- Step 1: score every review that is flagged for (re)recommendation ----
    pending_reviews = [
        {'id': service['_id'], 'review': service['review'].lower(), 'stars': -1}
        for service in appliedservices.find({"reRecommend": True})
    ]

    if pending_reviews:
        df = pd.DataFrame(pending_reviews)

        # 'stars' is a placeholder label; only the model inputs are used below.
        test_dataset = get_features(df, 'review', 'stars')
        test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

        batch_predictions = []
        with torch.no_grad():
            for input_ids, attention_mask, _ in tqdm(test_dataloader, total=len(test_dataloader), desc='val batch'):
                logits = model(input_ids=input_ids, attention_mask=attention_mask)[0].cpu()
                batch_predictions.append(logits.numpy())

        # The predicted class index of each review becomes its review score.
        predictions = np.argmax(np.concatenate(batch_predictions), axis=1)
        df['score'] = predictions

        # Access the columns by name; positional entry[0] / entry[-1] on a
        # label-indexed Series is deprecated in pandas.
        for service_id, review_score in zip(df['id'], df['score']):
            appliedservices.update_one(
                {"_id": service_id},
                # int() guarantees a plain Python integer for BSON encoding.
                {"$set": {"reviewScore": int(review_score), "reRecommend": False}},
            )

    # ---- Step 2: refresh the aggregate rating and score of every job listing ----
    for listing in jobs.find():
        jobID = str(listing['_id'])

        servicesForEachListing = list(appliedservices.find({"jobid": jobID}, {"rating": 1, "reviewScore": 1}))
        if not servicesForEachListing:
            continue

        printer.pprint(jobID)
        ratings = []
        reviewScores = []
        for serviceForEachListing in servicesForEachListing:
            rating = serviceForEachListing.get('rating')
            reviewScore = serviceForEachListing.get('reviewScore')
            # Skip services that have not been rated or scored yet instead of
            # failing the whole pass with a KeyError.
            if rating is None or reviewScore is None:
                continue
            print(rating, reviewScore)
            ratings.append(rating)
            reviewScores.append(reviewScore)

        if not ratings:
            continue

        average = round(sum(ratings) / len(ratings))
        listingScore = round(score(reviewScores, new_dataset))
        print(average, listingScore)

        jobs.update_one(
            {"_id": ObjectId(jobID)},
            {"$set": {"avgRating": average, "score": listingScore}},
        )

    time.sleep(POLL_INTERVAL_SECONDS)
                
                

            
    
    

Connecting to the MongoDB server...
['hirezen', 'test', 'admin', 'local']
['jobs2', 'users', 'jobs', 'app', 'jobs3', 'appliedservices']
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
'62eb460de1888541f6b313f

KeyboardInterrupt: 

## Individual components as Jupyter Notebook blocks

In [None]:

try:
    client = MongoClient(connection_string)
    # Per the PyMongo documentation, constructing a MongoClient does not by
    # itself contact the server, so send a cheap "ping" to surface connection
    # problems in this cell rather than in a later one.
    client.admin.command("ping")
except Exception:
    # Catch Exception (not a bare except) so Ctrl-C still works, and re-raise
    # so the following cells do not run with `client` undefined.
    print("Network Error detected")
    raise

In [None]:
# Show which databases are reachable, then pick the application database.
print(client.list_database_names())
hirezen = client["hirezen"]

# Show its collections and keep handles to the two this notebook works with.
print(hirezen.list_collection_names())
appliedservices = hirezen["appliedservices"]
jobs = hirezen["jobs2"]

# Pretty-printer used by later cells when echoing job ids.
printer = pprint.PrettyPrinter()


['hirezen', 'test', 'admin', 'local']
['jobs2', 'users', 'jobs', 'app', 'jobs3', 'appliedservices']


In [None]:
# Collect every applied service whose review is flagged for (re)scoring.
pending_reviews = [
    {'id': service['_id'], 'review': service['review'].lower(), 'stars': -1}
    for service in appliedservices.find({"reRecommend": True})
]

if pending_reviews:
    df = pd.DataFrame(pending_reviews)

    # 'stars' is a placeholder label; only the model inputs are used below.
    test_dataset = get_features(df, 'review', 'stars')
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    batch_predictions = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in tqdm(test_dataloader, total=len(test_dataloader), desc='val batch'):
            logits = model(input_ids=input_ids, attention_mask=attention_mask)[0].cpu()
            batch_predictions.append(logits.numpy())

    # The predicted class index of each review becomes its review score.
    predictions = np.argmax(np.concatenate(batch_predictions), axis=1)
    df['score'] = predictions

    # Access the columns by name; positional entry[0] / entry[-1] on a
    # label-indexed Series is deprecated in pandas. The former per-row
    # find_one() call was dropped because its result was never used.
    for service_id, review_score in zip(df['id'], df['score']):
        appliedservices.update_one(
            {"_id": service_id},
            # int() guarantees a plain Python integer for BSON encoding.
            {"$set": {"reviewScore": int(review_score), "reRecommend": False}},
        )
    

# df

val batch:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Recompute the average rating and the distribution-based score of each job
# listing from the applied services that reference it.
for listing in jobs.find():
    jobID = str(listing['_id'])

    servicesForEachListing = list(appliedservices.find({"jobid": jobID}, {"rating": 1, "reviewScore": 1}))
    if not servicesForEachListing:
        continue

    printer.pprint(jobID)
    ratings = []
    reviewScores = []
    for serviceForEachListing in servicesForEachListing:
        rating = serviceForEachListing.get('rating')
        reviewScore = serviceForEachListing.get('reviewScore')
        # Skip services that have not been rated or scored yet instead of
        # aborting the whole cell with a KeyError.
        if rating is None or reviewScore is None:
            continue
        print(rating, reviewScore)
        ratings.append(rating)
        reviewScores.append(reviewScore)

    if not ratings:
        continue

    average = round(sum(ratings) / len(ratings))
    # score() compares this listing's review scores with the reference sample.
    listingScore = round(score(reviewScores, new_dataset))
    print(average, listingScore)

    jobs.update_one(
        {"_id": ObjectId(jobID)},
        {"$set": {"avgRating": average, "score": listingScore}},
    )
        
        

    

'62eb460de1888541f6b313ff'
3 0
2 1
2 100794
'6307a5386252fc536e7be8e0'
4 2
4 107924
