## 1. Importing Libraries

In [2]:
import nltk
import json
import pickle
import random
import numpy as np
from nltk.stem import WordNetLemmatizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD
import os

## 2. Ensuring NLTK Resources are Available

In [3]:
# Function to check and download NLTK resources if not already available
def check_nltk_resources():
    """Ensure the NLTK data packages used by this notebook are installed.

    Each required package is looked up with ``nltk.data.find``; when every
    candidate lookup path raises ``LookupError`` the package is fetched with
    ``nltk.download``. Nothing is downloaded for packages that are found.
    """
    # (package id for nltk.download, lookup paths that count as "installed").
    # WordNet is also probed as 'corpora/wordnet.zip': when the corpus is left
    # zipped, the plain 'corpora/wordnet' lookup can raise LookupError even
    # though the package is present, which causes a pointless download attempt
    # on every run.
    required = (
        ('punkt', ('tokenizers/punkt',)),
        ('wordnet', ('corpora/wordnet', 'corpora/wordnet.zip')),
    )
    for package, lookup_paths in required:
        for resource_path in lookup_paths:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                continue  # try the next candidate path
            break  # resource located; no download needed
        else:
            # None of the candidate paths resolved, so fetch the package
            nltk.download(package)

# Run the check now so the tokenizer and WordNet data are in place before the
# cells below call nltk.word_tokenize and the lemmatizer
check_nltk_resources()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ayank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 3. Load and Process Data

In [4]:
# Initialize the WordNet lemmatizer used to normalize tokens in later cells
lemmatizer = WordNetLemmatizer()

# Load the intents file. The context manager closes the handle as soon as the
# text is read, and the explicit UTF-8 encoding keeps parsing independent of
# the platform's default codec (JSON text is UTF-8 by specification).
with open('Data/admission_data.json', encoding='utf-8') as intents_file:
    data_file = intents_file.read()  # raw JSON text, kept under its original name
intents = json.loads(data_file)

## 4. Preparing Data for Training

In [5]:
# Three independent, initially empty accumulators: the token list, the intent
# tags, and the (tokens, tag) pairs
words, classes, documents = [], [], []
# Punctuation tokens that are left out of the vocabulary
ignore_words = ['?', '!']

In [6]:
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # Split the example sentence into tokens
        tokens = nltk.word_tokenize(pattern)
        # Every token feeds the (not yet de-duplicated) vocabulary list
        words.extend(tokens)
        tag = intent['tag']
        # Remember which tag this tokenized sentence belongs to
        documents.append((tokens, tag))
        # Register the tag the first time it is seen
        if tag not in classes:
            classes.append(tag)

In [7]:
# Build the vocabulary: skip ignored punctuation, lowercase and lemmatize each
# remaining token, de-duplicate via a set, then sort
words = sorted({
    lemmatizer.lemmatize(token.lower())
    for token in words
    if token not in ignore_words
})

# De-duplicate and sort the intent tags
classes = sorted(set(classes))

In [8]:
# Report the size of each collection built above
print(f"{len(documents)} documents")
print(f"{len(classes)} classes {classes}")
print(f"{len(words)} unique lemmatized words {words}")

91 documents
29 classes ['academic_support', 'admission_deadline', 'admission_interview', 'admission_requirements', 'admission_status', 'admission_tests', 'admission_website', 'application_fee', 'application_process', 'application_tips', 'campus_life', 'career_services', 'diversity', 'faculty', 'financial_aid', 'financial_aid_application', 'financial_planning', 'goodbye', 'greeting', 'health_services', 'housing', 'internships', 'online_resources', 'programs_offered', 'scholarship_deadline', 'student_activities', 'study_abroad', 'support_services', 'thanks']
145 unique lemmatized words ["'s", 'about', 'abroad', 'academic', 'access', 'act', 'activity', 'admission', 'advising', 'afford', 'aid', 'an', 'and', 'any', 'applicant', 'application', 'apply', 'are', 'attend', 'available', 'back', 'body', 'bye', 'campus', 'can', 'care', 'career', 'check', 'club', 'college', 'cost', 'counseling', 'course', 'day', 'deadline', 'decision', 'diversity', 'do', 'document', 'doe', 'dormitory', 'during', 'e

In [9]:
# Save words and classes to disk. The output directory is created first so a
# fresh checkout does not fail on a missing folder, and each file is written
# inside a context manager so it is flushed and closed right after the dump.
os.makedirs('Model', exist_ok=True)
with open('Model/words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('Model/classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

## 5. Creating Training Data

In [10]:
# Empty container for the training rows, plus an all-zero label template with
# one slot per class
training = list()
output_empty = [0 for _ in classes]

In [11]:
for doc in documents:
    # doc is a (tokens, tag) pair; lemmatize and lowercase its tokens.
    # A set gives constant-time membership tests for the vocabulary scan below.
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}

    # Bag of words: 1 where the vocabulary word occurs in this pattern, else 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # One-hot label: copy the zero template and mark this document's class
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    # These statements must stay inside the loop so that every document
    # contributes one [bag, label] row, not just the last one
    training.append([bag, output_row])

In [11]:
# Shuffle the data and convert to numpy arrays.
# The shuffle runs on a list copy: if this cell is re-run, `training` is
# already a 2-D ndarray, and random.shuffle swapping ndarray rows in place
# works on views and can duplicate rows. A list of rows is safe either way.
training_rows = list(training)
random.shuffle(training_rows)
training = np.array(training_rows, dtype=object)

In [None]:
# Split the rows into the feature matrix (column 0, the bags of words) and the
# label matrix (column 1, the one-hot rows). The labels are bound to `train_y`,
# the name the model-building and fitting cells use.
train_x = np.array(list(training[:, 0]))
train_y = np.array(list(training[:, 1]))

print("Training data created")

Training data created


In [13]:
# Feed-forward classifier: two ReLU hidden layers, each followed by dropout,
# and a softmax output with one unit per label column
n_features = len(train_x[0])
n_labels = len(train_y[0])
model = Sequential([
    Dense(128, input_shape=(n_features,), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(n_labels, activation='softmax'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# Compile model with SGD using Nesterov momentum.
# The former `decay=1e-6` argument is dropped: current Keras optimizers no
# longer accept it (it is ignored with a warning or rejected, depending on the
# Keras version), so passing it never changed training here.
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])



In [15]:
# Train the model; the value returned by fit is kept in `hist`
hist = model.fit(
    train_x,
    train_y,
    epochs=200,
    batch_size=5,
    verbose=1,
)

Epoch 1/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.0068 - loss: 3.3859  
Epoch 2/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0598 - loss: 3.3958   
Epoch 3/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1400 - loss: 3.3046
Epoch 4/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0500 - loss: 3.3412     
Epoch 5/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1388 - loss: 3.2024 
Epoch 6/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1973 - loss: 3.1350 
Epoch 7/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0961 - loss: 3.0845     
Epoch 8/200
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1301 - loss: 3.0647 
Epoch 9/200
[1m19/19[0m [32

In [16]:
# Write the trained model to disk. Only the path is passed: the second
# positional parameter of Model.save is `overwrite`, so passing `hist` there
# only worked by accident and never stored the training history in the file.
model.save('Model/chatbot_model.h5')
print("Model created")



Model created
