In [2]:
from utils import *
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def tfidf_features(X_train, X_test, vectorizer_path):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    The fitted vectorizer is pickled to ``vectorizer_path`` so it can be
    reloaded later.

    Args:
        X_train: iterable of training documents; the vectorizer is fitted on
            these only.
        X_test: iterable of test documents, transformed with the vectorizer
            fitted on ``X_train``.
        vectorizer_path: path of the file the fitted vectorizer is pickled to
            (overwritten if it exists).

    Returns:
        Tuple ``(X_train, X_test)`` holding the transformed train and test
        feature matrices.
    """
    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),      # documented as a tuple (min_n, max_n), not a list
        min_df=5,
        max_df=0.9,
        token_pattern=r'(\S+)',  # raw string: same pattern, no invalid '\S' escape
    )
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_test = tfidf_vectorizer.transform(X_test)
    # Context manager guarantees the pickle file is flushed and closed.
    with open(vectorizer_path, 'wb') as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)
    return X_train, X_test

In [4]:
# Draw the same number of rows from each corpus; the fixed seed makes the
# sample repeatable.
sample_size = 310
dialogue_df = pd.read_csv('data/dialogues.tsv', sep='\t').sample(n=sample_size, random_state=0)
questions_df = pd.read_csv('data/questions.csv', sep=',').sample(n=sample_size, random_state=0)

# Run every text through text_prepare (not defined in this notebook;
# presumably provided by `from utils import *`).
dialogue_df['text'] = dialogue_df['text'].map(text_prepare)
questions_df['title'] = questions_df['title'].map(text_prepare)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Stack both corpora into one array of texts and build the matching labels:
# dialogue rows first, then question titles.
X = np.concatenate((dialogue_df['text'].values, questions_df['title'].values))
y = ['dialogue'] * len(dialogue_df) + ['questions'] * len(questions_df)

# Hold out 10% of the examples for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

# TF-IDF features; the fitted vectorizer is pickled to the configured path.
X_train_tfidf, X_test_tfidf = tfidf_features(
    X_train, X_test, RESOURCE_PATH['TFIDF_VECTORIZER']
)

Train size = 558, test size = 62


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [8]:
# Train the dialogue-vs-questions classifier on the TF-IDF features.
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=0)
intent_recognizer.fit(X_train_tfidf, y_train)

# Accuracy on the training split (displayed as the cell output).
train_predictions = intent_recognizer.predict(X_train_tfidf)
accuracy_score(y_train, train_predictions)

0.8924731182795699

In [9]:
# Score the trained intent classifier on the held-out test split and report
# the fraction of correctly predicted labels.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.8870967741935484


In [10]:
# Persist the trained intent classifier. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open(RESOURCE_PATH['INTENT_RECOGNIZER'], 'wb') as intent_recognizer_file:
    pickle.dump(intent_recognizer, intent_recognizer_file)

In [12]:
# Hindi word -> English gloss lookup for terms that occur in scheme names.
hindi = {
    'awas': 'home',
    'pradhan': 'prime',
    'mantri': 'minister',
    'yojana': 'scheme',
    'aam': 'normal',
    'admi': 'man',
    'bima': 'insurance',
    'varishtha': 'old',
    'vaya': 'age',
    'vandana': 'prayer',
    'jeevan': 'life',
    'jyoti': 'light',
    'fasal': 'crop',
    'suraksha': 'security',
    'jan': 'people',
    'dhan': 'money',
    'arogya': 'health',
    'mudra': 'currency',
    'rojgar': 'employment',
    'swasthya': 'health',
    'rastriya': 'national',
}

In [16]:
# Distinct scheme names and category labels found in the sampled questions.
X = questions_df['scheme'].unique()
Cat = questions_df['category'].unique()

# Scheme names first, then categories, each passed through text_prepare.
New = [*X, *Cat]
Q = list(map(text_prepare, New))
print(Q)

['varishtha pension bima yojana', 'pradhan mantri mudra yojana', 'scheme liberation rehabilitation scavenger', 'pradhan mantri jeevan jyoti bima yojana', 'aam admi bima yojana', 'pradhan mantri suraksh bima yojna', 'national social assistance programme insurance', 'atal pension yojana', 'employment state insurance scheme', 'national pension system', 'pradhan mantri fasal bima yojana', 'pradhan mantri jan arogya', 'general insurance', 'genaral home loan', 'pradhan mantri jandhan yojana', 'vida lakshmi yojana', 'stand india buissness', 'prime ministers rozgar yojana', 'pradhan mantri awas yojana', 'pradhan mantri vaya vandana yojana', 'rashtriya swasthya bima yojana', 'pension insurance', 'small business loan', 'house loan', 'life insurance', 'accident disablity death insurance', 'health insurance', 'insurance', 'pension scheme', 'health insurance', 'senior citizen insurance', 'agriculture crop insurance', 'general insurance faq', 'genaral home loan faq', 'accidental death insurance', 'e

In [29]:
# Vocabulary: every distinct whitespace-separated word in the prepared names Q.
# Sorted so each word gets the same index on every run; iterating a bare set
# of strings can yield a different order after a kernel restart, which would
# reshuffle `a`, `b` and everything indexed by them.
a = sorted({word for name in Q for word in name.split()})
from sklearn.preprocessing import OneHotEncoder  # NOTE(review): not used in this cell

# b: word -> one-hot vector (list of 0/1 ints, as long as the vocabulary).
b = {}
for idx, vocab_word in enumerate(a):
    b[vocab_word] = [int(idx == k) for k in range(len(a))]

# R: one vector per entry of Q, the mean of the one-hot vectors of its words.
# An entry with no words keeps the all-zero vector.
R = []
for sch in Q:
    arr = sch.split()
    vec = np.zeros(len(a))
    for word in arr:
        vec = vec + b[word]
    if arr:
        vec = vec / len(arr)
    R.append(vec)

In [None]:
# Hindi word -> English gloss lookup.
# NOTE(review): this literal repeats the `hindi` assignment made in an earlier
# cell of this notebook, and `hindi` is not read anywhere in the visible cells;
# consider keeping a single definition.
# NOTE(review): some keys differ from spellings printed in the Q output above
# ('suraksha' vs 'suraksh', 'rojgar' vs 'rozgar', 'rastriya' vs 'rashtriya');
# confirm which spellings are intended before using this as a lookup.
hindi={'awas':'home','pradhan':'prime','mantri':'minister','yojana':'scheme','aam':'normal','admi':'man','bima':'insurance','varishtha':'old','vaya':'age','vandana':'prayer','jeevan':'life','jyoti':'light','fasal':'crop','suraksha':'security','jan':'people','dhan':'money','arogya':'health','mudra':'currency','rojgar':'employment','swasthya':'health','rastriya':'national'}

In [24]:
import string

st = string.ascii_lowercase

# chemd: lowercase letter -> 26-long one-hot list marking that letter.
chemd = {letter: [int(other == letter) for other in st] for letter in st}

# d: for each vocabulary word in `a`, its letter counts divided by word length.
# Any character outside a-z raises KeyError, so this assumes the words in `a`
# contain only lowercase ASCII letters.
d = []
for vocab_word in a:
    letter_counts = np.zeros(26)
    for ch in vocab_word:
        letter_counts = letter_counts + chemd[ch]
    d.append(letter_counts / len(vocab_word))


In [None]:
def question_to_vec_char(question, embeddings, dim):
    """Embed a string as the mean of its word embeddings, with a fallback for unknown words.

    Each whitespace-separated word found in ``embeddings`` contributes its
    vector. A word that is missing is replaced by the vocabulary word in the
    module-level list ``a`` whose character vector (rows of module-level ``d``)
    has the highest cosine similarity to the word's own character vector; that
    replacement contributes only if it is itself present in ``embeddings``.

    Args:
        question: text to embed; split on whitespace.
        embeddings: mapping from word to a vector of length ``dim``.
        dim: dimensionality of the returned vector.

    Returns:
        numpy array of length ``dim``: the mean of the contributing vectors,
        or all zeros when no word contributed.
    """
    vec = np.zeros(dim)
    matched = 0
    for word in question.split():
        if word in embeddings:
            vec = vec + embeddings[word]
            matched += 1
            continue
        # Compare THIS word's character vector against the vocabulary. The
        # previous code read the module-level `s` here, so every unknown word
        # resolved to whatever word `s` was last computed for.
        # Assumes wrd2vec(word) returns a 2-D array compatible with `d`, as in
        # the module-level call `s=wrd2vec('primr')` — TODO confirm.
        char_vec = wrd2vec(word)
        nearest_word = a[np.argmax(cosine_similarity(char_vec, d)[0])]
        if nearest_word in embeddings:
            vec = vec + embeddings[nearest_word]
            matched += 1
    if matched != 0:
        return vec / matched
    return vec
# Ad-hoc check: look up the vocabulary word in `a` whose row of `d` has the
# highest cosine similarity to the vector wrd2vec returns for 'primr'.
# NOTE(review): wrd2vec is not defined in the visible cells — confirm where it comes from.
s=wrd2vec('primr')
best_scheme = np.argmax(cosine_similarity(s, d)[0])
# Bare expression: it is not the last statement of the cell, so the notebook
# does not display it.
a[best_scheme]
# (Re)build R: one vector per entry of Q, the mean of the embeddings of those
# words that are present in word_embeddings; entries with no known word stay
# all-zero.
# NOTE(review): word_embeddings is not defined in the visible cells, and the
# hard-coded 300 assumes 300-dimensional vectors — TODO confirm both.
R=[]
for sch in Q:
    arr=sch.split()
    vec = np.zeros(300)
    i=0
    for word in arr:
        if word in word_embeddings:
            vec=vec+word_embeddings[word]
            i+=1
            
    # Average only when at least one word contributed (avoids dividing by zero).
    if i!=0:
        vec= vec/i
    R.append(vec)
def predict(inp):
    """Return the entry of Q whose vector in R is most similar to ``inp``.

    ``inp`` is embedded with question_to_vec_char using word_embeddings
    (300 dimensions), compared against every row of R by cosine similarity,
    and the entry of Q at the best-scoring position is returned.
    """
    query_vec = question_to_vec_char(inp, word_embeddings, 300)
    query_row = query_vec.reshape(1, -1)  # cosine_similarity is given a 2-D query
    similarities = cosine_similarity(query_row, R)[0]
    winner = np.argmax(similarities)
    return Q[winner]

In [None]:
import pandas as pd
#finalembed_df=pd.read_csv("dictionary.csv",sep=',')

In [51]:
import csv

# Build one CSV row per entry of finalembed_df['word']: the lowercased word
# followed by the values stored under that lowercased key.
# NOTE(review): finalembed_df is not defined in the visible cells (its read_csv
# line is commented out). If it is a DataFrame, `in` tests column labels rather
# than the values of the 'word' column — confirm this is what was intended.
csvData = []
for i in finalembed_df['word']:
    key = i.lower()
    if key in finalembed_df:
        csvRow = [key]
        csvRow.extend(finalembed_df[key])
        csvData.append(csvRow)

# newline='' is what the csv module requires for files handed to csv.writer;
# without it, platforms that translate '\n' emit a blank line after each row.
# The with-block closes the file, so no explicit close() is needed.
with open('finalembed.csv', 'w', newline='') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(csvData)

In [31]:
# Load embeddings from 'word_embeddings.tsv'. load_embeddings is not defined in
# the visible cells (presumably from `utils`); its two return values are
# unpacked here as the embeddings object and the embedding dimension.
starspace_embeddings,embeddings_dim=load_embeddings('word_embeddings.tsv')

In [32]:
# Read the complete questions CSV (no sampling, unlike questions_df above).
posts_df = pd.read_csv('data/questions.csv', sep=',')

In [33]:
# Number of non-null titles per category and per scheme.
counts_by_category = posts_df.groupby('category')['title'].count()
counts_by_scheme = posts_df.groupby('scheme')['title'].count()

In [80]:
import os
os.makedirs(RESOURCE_PATH['SCHEME_EMBEDDINGS_FOLDER'], exist_ok=True)

# For every scheme, embed the titles of its posts and pickle
# (post ids, title vectors) to '<scheme lowercased>.pkl' in the scheme folder.
for scheme, count in counts_by_scheme.items():
    scheme_posts = posts_df[posts_df['scheme'] == scheme]

    scheme_post_ids = scheme_posts['post_id'].tolist()

    # One float32 row per title. `count` is the number of non-null titles for
    # this scheme, so this assumes no post of the scheme has a missing title.
    scheme_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(scheme_posts['title']):
        # question_to_vec is not defined in the visible cells (presumably from utils).
        scheme_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

    # Dump post ids and vectors to a file; the with-block closes the handle
    # after each scheme instead of leaking one open file per iteration.
    filename = os.path.join(RESOURCE_PATH['SCHEME_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % scheme.lower()))
    with open(filename, 'wb') as embeddings_file:
        pickle.dump((scheme_post_ids, scheme_vectors), embeddings_file)

In [34]:
import os
os.makedirs(RESOURCE_PATH['CATEGORY_EMBEDDINGS_FOLDER'], exist_ok=True)

# For every category, embed the titles of its posts and pickle
# (post ids, title vectors) to '<category lowercased>.pkl' in the category folder.
for category, count in counts_by_category.items():
    category_posts = posts_df[posts_df['category'] == category]

    category_post_ids = category_posts['post_id'].tolist()

    # One float32 row per title. `count` is the number of non-null titles for
    # this category, so this assumes no post of the category has a missing title.
    category_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(category_posts['title']):
        # question_to_vec is not defined in the visible cells (presumably from utils).
        category_vectors[i, :] = question_to_vec(title, starspace_embeddings, embeddings_dim)

    # Dump post ids and vectors to a file; the with-block closes the handle
    # after each category instead of leaking one open file per iteration.
    filename = os.path.join(RESOURCE_PATH['CATEGORY_EMBEDDINGS_FOLDER'], os.path.normpath('%s.pkl' % category.lower()))
    with open(filename, 'wb') as embeddings_file:
        pickle.dump((category_post_ids, category_vectors), embeddings_file)
