In [25]:
# Mount Google Drive: the fine-tuned checkpoint and the matching-table CSVs used
# further down are read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8
!pip install Levenshtein



In [27]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType
from transformers import pipeline
import Levenshtein

import requests
from requests.adapters import HTTPAdapter, Retry

In [28]:
# Start the Spark session through Spark NLP's helper; getNER uses `spark`
# to build the one-row DataFrame that is fed to the NLP pipeline.
spark = sparknlp.start()

In [29]:
# --- Spark NLP pipeline: text -> document -> sentences -> tokens -> NER tags -> chunks ---
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

sentence_detector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Pretrained token classifier, fetched by model name and language code.
tokenClassifier_loaded = (
    BertForTokenClassification.pretrained("bert_token_classifier_hi_en_ner", "hi")
    .setInputCols(["sentence", 'token'])
    .setOutputCol("ner")
)

# Groups the per-token NER tags into entity chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlp_pipeline = Pipeline(stages=[
    document_assembler,
    sentence_detector,
    tokenizer,
    tokenClassifier_loaded,
    ner_converter,
])

# --- Hugging Face token-classification pipeline loaded from the checkpoint on Drive ---
checkpoint = "/content/drive/MyDrive/Dead_line ISRO/distilbert-finetuned-ner-2/checkpoint-3135"
token_classifier = pipeline(
    "token-classification",
    model=checkpoint,
    aggregation_strategy="simple",
)

# Example of running the Spark NLP pipeline by hand (kept for reference):
# text_list =["""वॉरेन एडवर्ड बफेट (Warren Buffet) (अगस्त 30 (August 30), 1930 को ओमाहा (Omaha), नेब्रास्का (Nebraska) में पैदा हुए) एक अमेरिकी निवेशक (investor), व्यवसायी और परोपकारी (philanthropist) व्यक्तित्व हैं।"""]

# df = spark.createDataFrame(text_list, StringType()).toDF("text")
# result = nlp_pipeline.fit(df).transform(df)

bert_token_classifier_hi_en_ner download started this may take some time.
Approximate size to download 634.9 MB
[OK!]


In [30]:
# text_list =["Are Jaipur and Ajmer the same state?"]
# text_list = ["AMong jaipur, Ajmer and Baroda whih of these have a higher population than Surat"]
# text_list = ["Which has higher average temperature in June, AHmedabad or Gandhinagar"]
# text_list = ["Where is zebara? is it in amdavad?"]
# text_list = ["The new england journal of medicine is the best medical journal in the world"]
# text_list = ["Can I visit new york, mars, sun and delhi on the same day?"]
# text_list = ["Can I visit new york mars sun and delhi on the same day?"]
# text_list = ["where can i find lakes near Ahmedabad"]
# text_list = ["where can I find lakes near Amdavad"]
# text_list = ["is there water in delli?"]
# text_list = ["Is it a good time to visit Prince edward island?"]
# text_list = ["which has higher average temperature in june, ahmedabad or gandhinagar"]
# text_list = ["Which Has Higher Average Temperature In June, Ahmdvad Or Gandhingr"]
# text_list = ["temperature at amdavad is high"]
# text_list = ["The zoo is located in Abc"]
# text_list = ["The zoo is located in Abc"]
# text_list = ["अहमदाबाद का तापमान मध्य प्रदेश से भी ज्यादा है"]
# text_list = ["अमदाबद का तापमान मध्यदेश से भी ज्यादा है"]
# text_list = ["Name of my daughter is India"]
# text_list = ["I hate Gujarat but i love faafada"]
# text_list = ["Anjeer is my favourite"]
# text_list = ["Farah Went To Kushk"]
# text_list = ["Temperature At Abc Is Higher Than Temperature at xyz"]
# text_list = ["Venus has a travel planned to mars"]
# text_list = ["The dal lake is in sri nagar"]
# text_list = ["What happened in Tamil Nadu"]


In [31]:
import codecs,string
def is_hindi(character):
    """Return True when `character` contains at least one Devanagari code point.

    Args:
        character: A string (a single character or a whole word).

    Returns:
        bool: True if any code point lies in U+0900..U+097F, otherwise False.
        An empty string yields False.

    The earlier version inspected only the highest code point of the string.
    That raised ValueError on an empty string and rejected Devanagari words
    containing any higher character (for example a zero-width joiner, U+200D).
    """
    return any(u'\u0900' <= ch <= u'\u097f' for ch in character)

def findword(text, s, e):
  """Grow the span [s, e) outwards to the enclosing word and return that word.

  The start moves left and the end moves right until either hits the edge of
  `text` or sits next to one of the boundary characters: space, comma,
  full stop or question mark.
  """
  boundaries = (' ', ',', '.', '?')
  start, end = s, e
  while start > 0 and text[start - 1] not in boundaries:
    start -= 1
  limit = len(text)
  while end < limit and text[end] not in boundaries:
    end += 1
  return text[start:end]

def combinedOutput(extracted_list, output, text_list):
  """Merge the two NER outputs into a {word: verdict} dictionary.

  Args:
    extracted_list: Indexable (chunk, label) pairs from the Spark NLP pipeline.
    output: Dicts with 'start', 'end' and 'score' keys from the token classifier.
    text_list: List whose first element is the analysed sentence.

  Returns:
    dict: lower-cased word -> "Certain", "Looks Like", "Most Likely", or a
    sentence saying the word occurs in the context of another label.
  """
  # Spark NLP view: lower-cased chunk -> entity label.
  spark_labels = {row[0].lower(): row[1] for row in extracted_list}

  # Token-classifier view: each span widened to its full word -> score.
  model_scores = {
      findword(text_list[0].lower(), hit['start'], hit['end']): hit['score']
      for hit in output
  }

  result = {}
  for candidate in model_scores:
    if is_hindi(candidate):
      continue
    if candidate in spark_labels:
      label = spark_labels[candidate]
      if label == 'PLACE':
        verdict = "Certain"
      else:
        verdict = f'{candidate} occurs in the context of {label} but its name may resemble the name of a place'
    else:
      verdict = "Looks Like"
    result[candidate] = verdict

  # Places seen only by Spark NLP; a multi-word place replaces its single-word parts.
  for name, label in spark_labels.items():
    if label != 'PLACE' or name in result:
      continue
    result[name] = "Most Likely"
    parts = name.split(' ')
    if len(parts) > 1:
      for part in parts:
        result.pop(part, None)

  return result

In [32]:
def getNER(text):
  """Run both NER models on `text` and return the merged place verdicts.

  Args:
    text: The sentence to analyse.

  Returns:
    dict: The mapping produced by combinedOutput (lower-cased word -> verdict).

  The Spark NLP pipeline receives `text.capitalize()` (first character
  upper-cased, the rest lower-cased); the token classifier receives the fully
  lower-cased form of that same string.
  """
  text = text.capitalize()
  text_list = [text]
  df = spark.createDataFrame(text_list, StringType()).toDF("text")
  result = nlp_pipeline.fit(df).transform(df)

  # Flatten the ner_chunk annotations into one (chunk, ner_label) row per entity.
  chunk_and_meta = F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)
  extracted_result = result.select(F.explode(chunk_and_meta).alias("cols")) \
    .select(F.expr("cols['0']").alias("chunk"),
            F.expr("cols['1']['entity']").alias("ner_label"))
  extracted_list = extracted_result.collect()

  # Reuse the module-level `token_classifier` built in the pipeline-setup cell
  # from the same checkpoint; reloading it from Drive on every call only adds
  # latency.
  output = token_classifier(text_list[0].lower())
  return combinedOutput(extracted_list, output, text_list)


In [33]:
# Smoke test of getNER; "dujrat" is presumably a deliberate misspelling used to
# see how a near-miss place name gets labelled.
getNER("surat is a city in dujrat")

{'surat': 'Certain', 'dujrat': 'Certain'}

In [34]:
# The function to find similarity between correctAnswer and userAnswer, based on levenshtein distance
def validate_answer_levenshtein(correct_answer, user_answer, threshold=80):
    """Case-insensitive similarity of two strings on a 0-100 scale.

    Args:
        correct_answer: Reference string.
        user_answer: String to compare against the reference.
        threshold: Unused; kept so existing callers remain valid.

    Returns:
        float: 100 * (1 - distance / longer_length), rounded to 2 decimals,
        where distance is the Levenshtein distance between the lower-cased
        strings. Two empty strings are identical and score 100.0.
    """
    user_norm = user_answer.lower()
    correct_norm = correct_answer.lower()

    # Measure lengths on the same lower-cased strings the distance is computed on.
    longest = max(len(user_norm), len(correct_norm))
    if longest == 0:
        # Both inputs are empty: avoid dividing by zero.
        return 100.0

    distance = Levenshtein.distance(user_norm, correct_norm)
    similarity = 1 - (distance / longest)
    # normalising from [0,1] to [0,100]
    return round(100 * similarity, 2)

In [35]:
def fuzzyMatching(fuzzy_li, universe_of_names, top_k=3):
    """Find the closest known names for every word in `fuzzy_li`.

    Args:
        fuzzy_li: Iterable of words to look up.
        universe_of_names: Iterable of (name, type) pairs to match against.
        top_k: Number of best matches kept per word (default 3, the value
            that used to be hard-coded).

    Returns:
        list: One list per input word, holding up to `top_k` tuples
        (name, similarity, type) ordered from best to worst. Among equal
        similarities, the name seen later in `universe_of_names` comes first.
    """
    top_k = max(top_k, 0)
    global_nearest_match_li = []

    for word in fuzzy_li:
        nearest_match_li = []
        for checking_word, type_ in universe_of_names:
            score = validate_answer_levenshtein(word, checking_word)
            # Insert before the first kept match that is not strictly better;
            # if every kept match is better, go to the end.
            position = next(
                (i for i, match in enumerate(nearest_match_li) if score >= match[1]),
                len(nearest_match_li),
            )
            nearest_match_li.insert(position, (checking_word, score, type_))
            # Drop whatever fell off the end of the leaderboard.
            del nearest_match_li[top_k:]
        global_nearest_match_li.append(nearest_match_li)

    return global_nearest_match_li

In [36]:
def fuzzyMatchingPreprocessing():
    """Load the country, state and city tables into one list of known names.

    Reads countries.csv, states.csv and cities.csv from the "Matching Tables"
    folder on Drive and takes the "name" column of each.

    Returns:
        list: (lower-cased name, kind) tuples with kind being "country",
        "state" or "city", in that order: all countries, then all states,
        then all cities.

    Rows whose name is missing are skipped; previously a missing name failed
    on `.lower()`. The lower-casing is vectorised instead of a per-row
    `iloc` lookup.
    """
    tables_dir = "/content/drive/MyDrive/Dead_line ISRO/Matching Tables"
    sources = [
        ("countries.csv", "country"),
        ("states.csv", "state"),
        ("cities.csv", "city"),
    ]

    universe_of_names = []
    for filename, kind in sources:
        names = pd.read_csv(f"{tables_dir}/{filename}")["name"]
        lowered = names.dropna().astype(str).str.lower()
        universe_of_names.extend((name, kind) for name in lowered)
    return universe_of_names

In [37]:
def fuzzyMatchingComplete (fuzzy_li, universe_of_names):
    """Thin wrapper: return fuzzyMatching's result for the same two arguments."""
    return fuzzyMatching(fuzzy_li, universe_of_names)

In [38]:
def add(word,entity,universe_of_names):
  """Append the lower-cased (word, entity) pair unless it is already listed.

  The list is modified in place and also returned; a duplicate only prints a
  notice.
  """
  entry = (word.lower(), entity.lower())
  if entry not in universe_of_names:
    universe_of_names.append(entry)
    return universe_of_names
  print(entry, " is already present in the database")
  return universe_of_names

def delete(word,entity,universe_of_names):
  """Remove the lower-cased (word, entity) pair if it is listed.

  The list is modified in place and also returned; when the pair is absent a
  notice is printed showing the word as typed and the lower-cased entity.
  """
  lowered_word = word.lower()
  lowered_entity = entity.lower()
  entry = (lowered_word, lowered_entity)
  if entry in universe_of_names:
    universe_of_names.remove(entry)
  else:
    print((word, lowered_entity), " is not present in the database")
  return universe_of_names

### Integrating frontend with backend

In [39]:
# Build the list of (name, kind) pairs once; it is passed to main() and
# modified there by add()/delete().
universe_of_names = fuzzyMatchingPreprocessing()

In [148]:
def take_input():
  """Fetch the frontend's pending text and return its last line.

  Sends a GET request to the /fetchData endpoint and decodes the body as
  UTF-8 (undecodable bytes are replaced).

  Returns:
    str: The text after the final newline of the response body.

  Raises:
    requests.RequestException: On connection problems or when the server does
      not answer within the timeout.

  The body used to be read via `str(r.content)[2:-1]`, i.e. the repr of the
  bytes object, which turns every non-ASCII byte into a `\\x..` escape and so
  corrupted Hindi input. Decoding the bytes keeps the text intact.
  """
  # for request type
  url = 'https://7c12-2401-4900-53f7-cac1-a859-b2de-728a-2545.ngrok-free.app/fetchData'
  # A timeout keeps the polling loop from hanging forever on a dead tunnel.
  r = requests.get(url, timeout=30)
  payload = r.content.decode('utf-8', errors='replace')
  return payload.split('\n')[-1]

In [149]:
def give_output(type_, output, o1 = 0, o2 = 0, o3 = 0):
    """POST a result to the frontend's /sendQuery endpoint as JSON.

    Args:
        type_: 0 sends {"0": output}; 1 sends {"1": [output, o1, o2, o3]}.
        output: The message (type 0) or the first list element (type 1).
        o1, o2, o3: Remaining list elements, used only when type_ is 1.

    Raises:
        ValueError: If type_ is neither 0 nor 1. This used to surface as an
            UnboundLocalError because no payload had been built.
        requests.RequestException: On connection problems or timeout.
    """
    # 0 for string
    # 1 for dictionary
    if type_ == 0:
        payload = {'0': output}
    elif type_ == 1:
        payload = {'1': [output, o1, o2, o3]}
    else:
        raise ValueError(f"type_ must be 0 or 1, got {type_!r}")

    url = 'https://7c12-2401-4900-53f7-cac1-a859-b2de-728a-2545.ngrok-free.app/sendQuery'
    headers = {'Content-type': 'application/json'}
    json_obj = json.dumps(payload)
    # A timeout keeps the caller from blocking forever on a dead tunnel.
    requests.post(url, json_obj, headers = headers, timeout = 30)

In [159]:
# Manual check of the type-1 payload: sends a four-element list built from
# placeholder values to the frontend.
# give_output(0, "Aashray")
entity = {"w2242": ["Aashray"]}
matched_cities = ["Aryan"]
give_output(1, "matching places for ", entity, " are : ", matched_cities[0])

In [70]:
# Scratch request against /fetchData. Note: this tunnel hostname differs from
# the one hard-coded in take_input() and give_output().
url = 'https://bccc-2401-4900-53f7-cbda-a270-92ad-7964-5499.ngrok-free.app/fetchData'
r = requests.get(url)

In [71]:
# Show the raw body: str() of the bytes gives "b'...'", and [2:-1] strips that
# wrapper, so newlines appear as a literal backslash-n.
print(str(r.content)[2:-1])

jnfwjibgeijw\nwhat is the temp of jaipur


In [156]:
# Check what take_input() currently returns (the last line of the response).
print(take_input())

Where is New Delho?


In [150]:
def main(universe_of_names):
  """Frontend loop in sentence-only mode: answer sentences until '-1' arrives.

  Args:
    universe_of_names: List of (name, kind) pairs used for fuzzy matching.

  Each round reads a sentence with take_input(), runs getNER on it and sends
  one type-1 message per detected word through give_output(). The sentence
  '-1' ends the loop.

  The request type is fixed to the string '1'. It used to be the integer 1,
  which matched none of the string comparisons below, so every round fell
  through to "Please enter a valid input" and the loop never ended. The
  add/delete branches stay in place for when the request type is read from
  the frontend again.
  """
  while True:
    # request_type = input("Enter a request type: (Entering sentence: 1, add to database: 2, delete to database: 3), (exit: -1) : ")
    # request_type = take_input()
    request_type = '1'
    if request_type == '-1':
      give_output(0, "Exiting...")
      break
    elif request_type == '1':
      user_input = take_input()
      print(user_input)

      # Check if the user wants to exit
      if user_input == '-1':
        give_output(0, "Exiting...")
        break

      # Call the processing function and look up the nearest known names
      result = getNER(user_input)
      matched_cities = fuzzyMatchingComplete(result.keys(), universe_of_names)

      # give_output(result) # DO NOT GIVE IN FRONT END
      # fuzzyMatching returns one match list per word, in the same order.
      for entity, matches in zip(result.keys(), matched_cities):
        give_output(1, "matching places for ", entity, " are : ", matches)
    elif request_type == '2':
      # First the place name, then its type (city, state, country)
      user_input = take_input()
      user_category = take_input()
      if user_input == '-1' or user_category == '-1':
        break
      universe_of_names = add(user_input, user_category, universe_of_names)
    elif request_type == "3":
      # First the place name, then its type (city, state, country)
      user_input = take_input()
      user_category = take_input()
      if user_input == '-1' or user_category == '-1':
        break
      universe_of_names = delete(user_input, user_category, universe_of_names)
    else:
      give_output(0, "Please enter a valid input")

In [120]:
def main(universe_of_names):
  """Frontend loop: every value, including the request type, comes from take_input().

  Request types: '1' analyse a sentence, '2' add a place, '3' delete a place,
  '-1' exit. Anything else is answered with "Please enter a valid input".
  Results and notices go back through give_output().
  """
  while True:
    request_type = take_input()

    if request_type == '-1':
      give_output(0, "Exiting...")
      break

    if request_type == '1':
      user_input = take_input()
      # '-1' instead of a sentence also ends the session.
      if user_input == '-1':
        give_output(0, "Exiting...")
        break

      result = getNER(user_input)
      matched_cities = fuzzyMatchingComplete(result.keys(), universe_of_names)

      # The raw result dictionary is intentionally not sent to the frontend.
      for position, entity in enumerate(result.keys()):
        give_output(1, "matching places for ", entity, " are : ", matched_cities[position])
      continue

    if request_type in ('2', '3'):
      # First the place name, then its type (city, state, country).
      user_input = take_input()
      user_category = take_input()
      if user_input == '-1' or user_category == '-1':
        break
      if request_type == '2':
        universe_of_names = add(user_input, user_category, universe_of_names)
      else:
        universe_of_names = delete(user_input, user_category, universe_of_names)
      continue

    give_output(0, "Please enter a valid input")

In [43]:
def main(universe_of_names):
  """Console loop: the same menu as the frontend version, driven by input()/print().

  Request types: '1' analyse a sentence, '2' add a place, '3' delete a place,
  '-1' exit. Anything else prints "Please enter a valid input".
  """
  while True:
    # Take input from the user
    print("")
    request_type = input("Enter a request type: (Entering sentence: 1, add to database: 2, delete to database: 3), (exit: -1) : ")
    print("")

    if request_type == '-1':
      print("Exiting...")
      break

    if request_type == '1':
      user_input = input("Enter a sentence (type '-1' to exit): ")
      # '-1' instead of a sentence also ends the session.
      if user_input == '-1':
        print("Exiting...")
        break

      result = getNER(user_input)
      matched_cities = fuzzyMatchingComplete(result.keys(), universe_of_names)

      print("*****")
      print(result)
      print()
      for position, entity in enumerate(result.keys()):
        print("matching places for ", entity, " are : ", matched_cities[position])
        print()
      continue

    if request_type == '2':
      user_input = input("Enter the name of the place (type '-1' to exit): ")
      print("")
      user_category = input("Enter the type of place(city, state, country)  (type '-1' to exit): ")
      print("")
      if user_input == '-1' or user_category == '-1':
        break
      universe_of_names = add(user_input, user_category, universe_of_names)
      continue

    if request_type == "3":
      user_input = input("Enter the name of the place  (type '-1' to exit): ")
      print("")
      user_category = input("Enter the type of place(city, state, country)  (type '-1' to exit): ")
      print("")
      if user_input == '-1' or user_category == '-1':
        break
      universe_of_names = delete(user_input, user_category, universe_of_names)
      continue

    print("Please enter a valid input")

In [154]:
# Start the request loop; it runs until an exit value ('-1') is received or the
# cell is interrupted. `main` is whichever definition was executed last.
main(universe_of_names)

KeyboardInterrupt: ignored

In [None]:
# Serialise the current name list to a JSON string (tuples become JSON arrays).
x = json.dumps(universe_of_names)