In [1]:
import os
import re
import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def ngrams(string, n=2):
    """Break ``string`` into its overlapping character n-grams.

    Before splitting, commas, hyphens, periods and slashes are stripped,
    as is any whitespace character immediately followed by ``BD``.

    Parameters
    ----------
    string : str
        Text to tokenize.
    n : int, default 2
        Number of characters per gram.

    Returns
    -------
    list of str
        Every run of ``n`` consecutive characters of the cleaned text, in
        order of appearance; empty when the cleaned text is shorter than ``n``.
    """
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    # One copy of the text per offset 0..n-1; zipping them walks a sliding
    # window of width n and stops as soon as the shortest copy runs out.
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

def search(df,column, word, threshold = 0.5, top=10):
    """Return the values of ``df[column]`` most similar to ``word``.

    Similarity is the cosine similarity between TF-IDF vectors built from the
    character n-grams produced by ``ngrams``. The vectorizer is re-fitted on
    the column on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to search.
    column : hashable
        Label of the column of ``df`` to search. Null entries are skipped;
        the remaining values must be strings, because ``ngrams`` passes them
        to ``re.sub``.
    word : str
        Query text.
    threshold : float, default 0.5
        Only values whose similarity is strictly greater than this are kept.
    top : int, default 10
        Maximum number of suggestions returned.

    Returns
    -------
    list
        Matching values of ``df[column]``, highest similarity first. Empty
        when the column has no non-null values or nothing beats ``threshold``.
    """
    # Null entries would make the n-gram analyzer raise, so leave them out.
    # Resetting the index makes labels equal positions, which lets the
    # similarity scores below line up with the candidates.
    candidates = df[column].dropna().reset_index(drop=True)
    if candidates.empty:
        return []  # nothing to fit the vectorizer on

    vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)  # character n-gram TF-IDF
    tf_idf_matrix_clean = vectorizer.fit_transform(candidates)  # learn the vocabulary from the column
    tf_idf_matrix_dirty = vectorizer.transform([word])  # project the query into the same space

    # cosine_similarity yields one row per candidate and a single column for
    # the query; flatten it to one score per candidate.
    scores = cosine_similarity(tf_idf_matrix_clean, tf_idf_matrix_dirty).ravel()
    similarity = pd.Series(scores)

    # Sort once, with a stable algorithm, so tied scores come back in a
    # deterministic order.
    ranked = (
        similarity[similarity > threshold]
        .sort_values(ascending=False, kind='mergesort')
        .iloc[:top]
    )
    return candidates.loc[ranked.index].to_list()



In [2]:
# Location of the Raverty Pashto dictionary JSON. Set the PASHTO_RAVERTY_JSON
# environment variable to run this notebook on another machine; the default
# is the original author's local path.
DATA_PATH = os.environ.get(
    'PASHTO_RAVERTY_JSON',
    '/Users/emadsiddiq/Projects/personal_website/pashto/data/Pashto_Raverty_full.json',
)
# The loaded frame is transposed before use (presumably the JSON is keyed by
# entry, so the fields such as 'meaning' become columns - confirm against the file).
df = pd.read_json(DATA_PATH).transpose()


In [4]:
# Example query: up to 10 entries of the 'meaning' column whose similarity to
# 'love' is above 0.2.
search(df, column='meaning', word='love', threshold=0.2, top=10)

['adv. Besides, moreover, save, over and above.',
 'adj. (past part. of حب), Beloved, loved, lovely; (Fem.) محبوبه maḥ-būbaʿh. See مابوب',
 's.f. (3rd) A turtle dove, a ring dove. Pl. يْ ey.',
 's.m. (Pl. of حبيب) Lovers, beloved friends.',
 's.f. (3rd) A basket, a hamper, a kind of cover for covering food, a sort of dish cover. Pl. يْ ey.',
 's.f. (3rd) A ring-dove, the turtle-dove. Pl. يْ ey.',
 '(act. part. of عشق) (used substantively), A lover; (Fem.) عاشقه œā-s̱ẖiḳaʿh. عاشق کیدل œā-s̱ẖiḳ kedal, verb intrans. To fall in love, to become a lover. عاشقي œā-s̱ẖikī, s.f. (3rd) Making love, courtship, gallantry, the state of being a lover, love, amour. Pl. ئِي aʿī. عاشقي کول œā-s̱ẖiḳī kawul, verb trans. To make love, to court. See مین and مین توب',
 '(from حب) adj. Beloved, dear. Also used substantively, A beloved one, a sweetheart, a friend, a mistress.',
 'adj. Overlooking, commanding, overtopping (as a fort). 2. An overlooker, a superintendent, a foreman.',
 "adj. In love, loving, fon