In [1]:
import pickle

import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the combined dataset: the union of the data generated by the "short"
# and "varied" generator files, so that both short and long sentences are
# represented.
df = pd.read_csv('varied_dataset.csv')

In [3]:
# Class-balance check: number of rows for each distinct value of the city column.
df['city'].value_counts()

city
current_loc    6000
Vijayawada     2000
Jhansi         2000
Agra           2000
Siliguri       2000
               ... 
Firozabad      1000
Kochi          1000
Nellore        1000
Bhavnagar      1000
Mangalore      1000
Name: count, Length: 182, dtype: int64

In [4]:
# Encode the city names as integer class ids for the classifier.
#
# This cell overwrites df['city'] in place, so running it a second time would
# re-fit the encoder on the integer ids instead of the city names, and the
# encoder could then no longer map predicted ids back to names. Fail loudly in
# that case, before the existing (correct) encoder is replaced.
if pd.api.types.is_numeric_dtype(df['city']):
    raise RuntimeError(
        "df['city'] is already label-encoded; re-run the data-loading cell "
        "before encoding again."
    )

label_encoder = LabelEncoder()
df['city'] = label_encoder.fit_transform(df['city'])

In [5]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Build the vocabulary from the training queries only, so the held-out rows
# do not influence the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['query'])

# Keras word indices start at 1 (0 is reserved for padding), hence the extra
# slot when sizing the embedding table.
total_words = 1 + len(tokenizer.word_index)

In [7]:
# Convert the queries of both splits into lists of vocabulary indices,
# training split first, then the held-out split.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['query'])
    for split in (train_data, test_data)
)

In [8]:
# Pad the training sequences to the length of the longest one, then force the
# held-out sequences to that same width so both splits share one input shape.
train_padded_sequences = pad_sequences(train_sequences)
test_padded_sequences = pad_sequences(
    test_sequences,
    maxlen=train_padded_sequences.shape[1],
)

In [22]:
# Width of the padded training matrix, i.e. the sequence length every input is padded to.
train_padded_sequences.shape[1]

26

In [9]:
# Distinct values of the (label-encoded) city column, in order of first appearance.
df['city'].unique()

array([123,  63,  12,  88,   1,  27, 107, 162, 139,  92, 114, 102, 127,
        90, 163,  21, 178, 137, 136, 174,  74, 115,   0, 130, 145,  70,
       121, 143, 101, 176, 175, 161,  11,  64,   9, 131,   6,  85,  80,
        91,  59, 177,  99, 116, 142, 109,  79,  26, 157,  86,  13, 122,
       125,  78,   5,  93, 167,  22, 152, 179, 165,  19, 151,  75,  23,
         8, 133,  97,  18,  60,  71, 105, 132,  17,  62,  68,  10, 149,
       128, 106,   2,   4,  76,  96, 172, 113, 156,  98, 173,  95, 153,
       119,  69,  14,   7, 168, 118,  73,  94, 171, 117, 170,  61, 110,
         3,  16,  20,  15,  24,  25,  28,  65,  66,  67,  72,  77,  81,
        82,  83,  84,  87,  89, 100, 103, 104, 108, 111, 112, 120, 124,
       126, 129, 134, 135, 138, 140, 141, 144, 146, 147, 148, 150, 155,
       154, 158, 159, 160, 164, 166, 169, 180,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
        47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  5

In [10]:
# One-hot encode the integer city ids of each split; the number of columns is
# the number of distinct ids in the full frame, so both matrices line up.
train_labels, test_labels = (
    tf.keras.utils.to_categorical(
        split['city'],
        num_classes=len(df['city'].unique()),
    )
    for split in (train_data, test_data)
)

In [11]:
# Classifier: token embedding -> LSTM -> dropout -> softmax over the city classes.
model = Sequential([
    # One 64-dimensional vector per vocabulary index; inputs are padded sequences.
    Embedding(total_words, 64, input_length=train_padded_sequences.shape[1]),
    LSTM(64),
    Dropout(0.2),
    # One output unit per distinct city id.
    Dense(len(set(df['city'])), activation='softmax'),
])

In [12]:
# Targets are one-hot vectors, so the loss is categorical cross-entropy;
# accuracy is tracked as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
# Train for two epochs, reporting loss/accuracy on the held-out split after
# each epoch. The call is the cell's last expression, so the returned History
# object is shown as the cell output.
model.fit(
    train_padded_sequences,
    train_labels,
    validation_data=(test_padded_sequences, test_labels),
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x201ad92e450>

In [14]:
# Two ad-hoc queries for a manual sanity check: one that names no city and
# one that mentions a city.
new_queries = [
    "find a medical shop nearby",
    "Looking for some around near in bangalore. need doctor for second opinion.",
]

In [15]:
# Run the ad-hoc queries through the same preprocessing as the training data:
# same tokenizer, same padded width.
new_sequences = tokenizer.texts_to_sequences(new_queries)
new_padded_sequences = pad_sequences(
    new_sequences,
    maxlen=train_padded_sequences.shape[1],
)

In [16]:
# Class scores for each query: one row per query, one column per city id.
predictions = model.predict(new_padded_sequences)

# Pick the highest-scoring class of every row with a single vectorised argmax
# (instead of one TensorFlow op per row) and let the encoder translate the ids
# back to the original city labels. `.tolist()` keeps the result a plain
# Python list, as before.
predicted_classes = label_encoder.inverse_transform(
    predictions.argmax(axis=1)
).tolist()



In [17]:
# Display the decoded labels, in the same order as new_queries.
predicted_classes

['current_loc', 'Bangalore']

In [18]:
# Loss and accuracy (the compiled metric) on the held-out split.
# NOTE(review): this same split was passed as validation_data during fit, so it
# is not an untouched test set — confirm the score on data the model never saw.
model.evaluate(test_padded_sequences, test_labels)



[4.649843685911037e-05, 1.0]

In [19]:
# Persist the trained model to disk; the tokenizer and label encoder are
# saved separately in the following cells.
model.save('location.keras')

In [20]:
# Persist the fitted tokenizer so the same word -> index mapping can be
# reloaded later. (`pickle` is imported in the notebook's import cell.)
# Only unpickle this file from a trusted location: loading a pickle can
# execute arbitrary code.
with open('location_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
# Persist the fitted label encoder so predicted class ids can be mapped back
# to city names after reloading.
with open('location_label_encoder.pickle', 'wb') as encoder_file:
    pickle.dump(
        label_encoder,
        encoder_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )