In [None]:
# Install the third-party packages used by this notebook.
# pyngrok is pinned to 4.1.10; streamlit and unidecode are installed unpinned.
# NOTE(review): unidecode is not imported by any cell visible here - confirm it is still needed.
!pip install streamlit
!pip install pyngrok==4.1.10
!pip install unidecode

In [5]:
%%writefile app.py
import streamlit as st
import string, math
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
stoplist = stopwords.words("english")
from nltk.stem import PorterStemmer
ps = PorterStemmer()
import joblib
import operator

def make_docsList(df):
  """Return the ``text`` entry of every row of ``df`` as a plain list.

  Entries are fetched with ``df.text[i]`` for ``i`` running from 0 to
  ``df.shape[0] - 1``, so ``df`` must support that label lookup
  (e.g. a DataFrame with a default integer index).
  """
  row_count = df.shape[0]
  return [df.text[row] for row in range(row_count)]

def cleaning(docs_list):
  """Lower-case each document and delete punctuation, digits and newlines.

  Every document is split on single spaces. Inside each piece, all
  characters from ``string.punctuation``, the ten digits and the newline
  character are deleted (not replaced by a separator, so the text on both
  sides of such a character is fused). Pieces that end up empty are dropped
  and the remaining ones are joined back with single spaces.

  Args:
    docs_list: iterable of document strings.

  Returns:
    List of cleaned strings. Entries that cannot be split as text (for
    example ``None``) are printed and skipped, so the result can be shorter
    than ``docs_list``.
  """
  # Built once per call instead of once per character.
  strip_table = str.maketrans('', '', string.punctuation + '1234567890' + '\n')

  clean_docs = []
  for doc in docs_list:
    try:
      pieces = doc.split(' ')
    except (AttributeError, TypeError):
      # Best-effort: report the unusable entry and move on. Only the errors a
      # non-text entry produces are caught, so KeyboardInterrupt/SystemExit
      # still propagate.
      print(doc)
      continue
    words = [piece.lower().translate(strip_table) for piece in pieces]
    clean_docs.append(' '.join(w for w in words if w != ''))
  return clean_docs

def tokenize(docs_list):
  """Apply ``word_tokenize`` to every document.

  Returns a list with one token list per input document, in input order.
  """
  return [word_tokenize(text) for text in docs_list]

def remove_stopwords(tokens, stoplist):
  """Drop every word that occurs in ``stoplist`` from each token list.

  Args:
    tokens: list of token lists, one per document.
    stoplist: container of words to remove.

  Returns:
    New list of token lists with the same outer length and word order.
  """
  return [
    [word for word in doc_tokens if word not in stoplist]
    for doc_tokens in tokens
  ]

def stemming(tokens):
  """Stem every word of every token list with the module-level stemmer ``ps``.

  Returns a new list of token lists; lengths and order are unchanged.
  """
  return [[ps.stem(word) for word in doc_tokens] for doc_tokens in tokens]

def doc_stemming(docs):
  """Tokenize each document, stem every token and re-join with single spaces.

  Returns one stemmed string per input document, in input order.
  """
  stemmed_docs = []
  for text in docs:
    stems = (ps.stem(piece) for piece in word_tokenize(text))
    stemmed_docs.append(' '.join(stems))
  return stemmed_docs

def count_docs_appearance(docs_list, tokens_list):
  """Build the sorted vocabulary and, per term, the documents containing it.

  Args:
    docs_list: list of document strings; a term counts as present in a
      document when it equals one of the document's whitespace-separated
      words.
    tokens_list: list of token lists whose union defines the vocabulary.

  Returns:
    ``(terms, counting_dict)`` where ``terms`` is the sorted list of distinct
    tokens and ``counting_dict`` maps each term to the ascending list of
    1-based positions of the documents in ``docs_list`` that contain it
    (an empty list when no document does).
  """
  terms = sorted({word for doc_tokens in tokens_list for word in doc_tokens})

  # Split every document once (instead of once per term) and keep the words
  # in a set so each membership test is constant time. Skipped entirely when
  # there are no terms, in which case no document is inspected at all.
  doc_vocab = [set(doc.split()) for doc in docs_list] if terms else []

  counting_dict = {
    term: [doc_no for doc_no, vocab in enumerate(doc_vocab, start=1) if term in vocab]
    for term in terms
  }

  return terms, counting_dict

def count_freq(terms, docs_list):
  """Count, per term, how often it occurs in each document that contains it.

  Args:
    terms: list of terms to look up.
    docs_list: list of document strings, compared word by word after
      whitespace splitting.

  Returns:
    One list per term (same order as ``terms``) holding the positive
    occurrence counts in document order; documents without the term
    contribute no entry.
  """
  doc_freq = []
  for term in terms:
    per_doc_hits = (doc.split().count(term) for doc in docs_list)
    doc_freq.append([hits for hits in per_doc_hits if hits > 0])
  return doc_freq

def docRetrieval(query, posting_list, top_k=35):
  """Rank indexed documents against a query using tf-idf weights.

  The query is cleaned, stop words are removed and the remaining words are
  stemmed. Each query term found in the index gets the weight ``tf * idf``,
  the weights are divided by their (rounded) Euclidean norm, and every
  document's score is the sum over query terms of
  ``document weight * query weight``. Intermediate values are rounded to two
  decimals.

  Args:
    query: list holding the raw query string (``main`` passes a one-element
      list). If several strings are given, their term counts are summed,
      i.e. they are treated as one query.
    posting_list: table supporting ``posting_list[posting_list['terms'] == t]``
      with the columns ``terms``, ``idf``, ``doc_id`` and ``normed_w``. For a
      term, ``doc_id`` and ``normed_w`` hold parallel sequences of document
      ids and document weights (presumably length-normalised - confirm
      against the code that builds the index).
    top_k: maximum number of document ids to return (default 35). ``None``
      returns every scored document.

  Returns:
    Document ids ordered by descending score. The list is empty when no
    query term is in the index or when all matched terms have zero weight.
  """
  query = cleaning(query)
  stemmed_query = doc_stemming(query)
  query_token = stemming(remove_stopwords(tokenize(query), stoplist))
  query_terms, _ = count_docs_appearance(stemmed_query, query_token)
  query_tf = count_freq(query_terms, stemmed_query)

  # Gather weight and postings together, one entry per term that is actually
  # in the index. Keeping them in a single tuple guarantees that a term's
  # query weight is always applied to its own postings; terms missing from
  # the index would have weight 0 anyway and are simply left out.
  matched = []
  for term, freq in zip(query_terms, query_tf):
    rows = posting_list[posting_list['terms'] == term]
    if len(rows) == 0:
      continue
    idf = float(list(rows['idf'])[0])
    matched.append((
      sum(freq) * idf,
      list(rows['doc_id'])[0],
      list(rows['normed_w'])[0],
    ))

  query_norm = round(math.sqrt(sum(weight ** 2 for weight, _, _ in matched)), 2)
  if query_norm == 0:
    # Nothing matched, or every matched term has (near-)zero weight: there is
    # nothing to rank and dividing by the norm would fail.
    return []

  scores = {}
  for weight, doc_ids, doc_weights in matched:
    q_w = round(weight / query_norm, 2)
    contributions = dict(zip(doc_ids, (round(d_w * q_w, 2) for d_w in doc_weights)))
    for doc, value in contributions.items():
      if doc in scores:
        scores[doc] = round(scores[doc] + value, 2)
      else:
        scores[doc] = value

  # sorted() is stable, so documents with equal scores keep insertion order.
  ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
  return [doc for doc, _ in ranked[:top_k]]



def main(pl_path=None):
  """Render the Streamlit page: read a query, rank documents, show the ids.

  Args:
    pl_path: optional path of the joblib-serialised posting list. When
      omitted, the ``POSTING_LIST_PATH`` environment variable is used if set,
      otherwise the original Google Drive location.
  """
  import os  # local import: only needed to resolve the path override

  st.title("Vector space model on Cranfield")
  st.subheader("Query: ")

  user_input = st.text_input("", "airplane spot for example")

  # load normed posting list file
  default_pl_path = '/content/drive/MyDrive/Colab Notebooks/Truy xuất thông tin - CS419.L11/Report/norm_posting_list.plk'
  pl_path = pl_path or os.environ.get('POSTING_LIST_PATH', default_pl_path)
  # NOTE(security): joblib.load unpickles the file and can execute arbitrary
  # code, so only point this at a posting list you produced yourself.
  with open(pl_path, 'rb') as f:
    posting_list = joblib.load(f)

  query = [user_input]
  result = docRetrieval(query, posting_list)
  st.subheader(result)


if __name__ == '__main__':
  main()

Overwriting app.py


In [3]:
from pyngrok import ngrok
# Start the Streamlit app in the background; all of its output is discarded.
# NOTE(review): the recorded output below lists several PIDs, which suggests
# earlier launches are still running - each re-run of this cell starts another
# server. Consider stopping old ones first (e.g. `!pkill streamlit`).
!streamlit run app.py &>/dev/null&
# Print the PIDs of processes whose name matches "streamlit".
!pgrep streamlit

190
237
272
431


In [4]:
# Open an ngrok tunnel to local port 8501 and display the resulting public URL.
# NOTE(review): 8501 is presumably the port the Streamlit server started by this
# notebook listens on - confirm if the server is launched with a custom port.
public_url = ngrok.connect(port='8501')
public_url

'http://3ac4f4002f9a.ngrok.io'