In [28]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import joblib
import nltk
from nltk.tokenize import word_tokenize
from hazm import Normalizer
from langdetect import detect
from fuzzywuzzy import fuzz

nltk.download('punkt')

def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    data = []
    labels = []
    existing_questions = {} 
    for line in lines:
        if "**" in line:  
            print(line)
            question, answer = line.strip().split("**")
            similar_question = None
            for existing_question in existing_questions:
                if fuzz.ratio(existing_question, question) > 80:  
                    similar_question = existing_question
                    break
            if similar_question:
                existing_questions[similar_question] += " " + answer
            else:
                existing_questions[question] = answer

    for question, answer in existing_questions.items():
        lang = detect(question)
        if lang == 'fa':
            tokenizer = word_tokenize
            normalizer = Normalizer().normalize
        else:
            tokenizer = word_tokenize
            normalizer = lambda x: x  
        question_tokens = tokenizer(normalizer(question))
        processed_question = ' '.join(question_tokens)
        data.append(processed_question)
        labels.append(answer)
    return data, labels


X, y = load_data('data.txt')

preprocessing_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
])

X_tfidf = preprocessing_pipeline.fit_transform(X)
svm_classifier = SVC(kernel='linear', probability=True)
svm_classifier.fit(X_tfidf, y)
joblib.dump((preprocessing_pipeline, svm_classifier), 'model.pkl')

def predict_answer(question):
    loaded_pipeline, loaded_classifier = joblib.load('model.pkl')
    lang = detect(question)
    if lang == 'fa':
        tokenizer = word_tokenize
        normalizer = Normalizer().normalize
    else:
        tokenizer = word_tokenize
        normalizer = lambda x: x 
    question_tokens = tokenizer(normalizer(question))
    processed_question = ' '.join(question_tokens)
    question_tfidf = loaded_pipeline.transform([processed_question])
    probabilities = loaded_classifier.predict_proba(question_tfidf)
    max_probability_index = np.argmax(probabilities)
    predicted_label = loaded_classifier.classes_[max_probability_index]
    return predicted_label


question = "لامپ حمام رو روشن کن"
predicted_answer = predict_answer(question)

print("Predicted answer:", predicted_answer)

[nltk_data] Downloading package punkt to /home/amin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


لامپ پذیرایی رو روشن کن ** 1

لامپ پذیرایی رو خاموش کن ** 2 

لامپ آشپزخانه رو روشن کن ** 3

لامپ آشپزخانه رو خاموش کن ** 4 

لامپ حمام رو روشن کن ** 5

لامپ حمام رو خاموش کن ** 6 

لامپ دستشویی رو روشن کن ** 7

لامپ دستشویی رو خاموش کن ** 8 

لامپ دفتر رو روشن کن ** 9

لامپ دفتر رو خاموش کن ** 0 

آبیاری رو روشن کن **  H

آبیاری رو خاموش کن ** L



In [49]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import joblib
import nltk
from nltk.tokenize import word_tokenize
from hazm import Normalizer
from langdetect import detect
from fuzzywuzzy import fuzz

nltk.download('punkt')

def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    data = []
    labels = []
    existing_questions = {} 
    for line in lines:
        if "**" in line:  
            question, answer = line.strip().split("**")
            similar_question = None
            for existing_question in existing_questions:
                if fuzz.ratio(existing_question, question) > 80:  
                    similar_question = existing_question
                    break
            if similar_question:
                existing_questions[similar_question] += " " + answer
            else:
                existing_questions[question] = answer

    for question, answer in existing_questions.items():
        lang = detect(question)
        if lang == 'fa':
            tokenizer = word_tokenize
            normalizer = Normalizer().normalize
        else:
            tokenizer = word_tokenize
            normalizer = lambda x: x  
        question_tokens = tokenizer(normalizer(question))
        processed_question = ' '.join(question_tokens)
        data.append(processed_question)
        labels.append(answer)
    return data, labels

X, y = load_data('data.txt')

preprocessing_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
])

X_tfidf = preprocessing_pipeline.fit_transform(X)
svm_classifier = SVC(kernel='linear', probability=True)
svm_classifier.fit(X_tfidf, y)
joblib.dump((preprocessing_pipeline, svm_classifier), 'model.pkl')

def predict_answer(question):
    loaded_pipeline, loaded_classifier = joblib.load('model.pkl')
    lang = detect(question)
    if lang == 'fa':
        tokenizer = word_tokenize
        normalizer = Normalizer().normalize
    else:
        tokenizer = word_tokenize
        normalizer = lambda x: x 
    question_tokens = tokenizer(normalizer(question))
    processed_question = ' '.join(question_tokens)
    question_tfidf = loaded_pipeline.transform([processed_question])
    probabilities = loaded_classifier.predict_proba(question_tfidf)
    max_probability_index = np.argmax(probabilities)
    predicted_label = loaded_classifier.classes_[max_probability_index]
    return predicted_label

question = "آبیاری رو روشن کن"
predicted_answer = predict_answer(question)

print("Predicted answer:", predicted_answer)


[nltk_data] Downloading package punkt to /home/amin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Predicted answer:  6  6


In [49]:
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data_file = "data.txt"
with open(data_file, 'r', encoding='utf-8') as file:
    data = file.readlines()

input_texts = []
output_labels = []
for line in data:
    if "**" in line:  
        text, label = line.strip().split("**")
        input_texts.append(text.strip())
        output_labels.append(label.strip())

# Data Augmentation: Adding reversed sentences
augmented_input_texts = input_texts + [text[::-1] for text in input_texts]
augmented_output_labels = output_labels + output_labels

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(augmented_input_texts)
num_chars = len(tokenizer.word_index) + 1

input_sequences = tokenizer.texts_to_sequences(augmented_input_texts)
max_sequence_length = max(len(seq) for seq in input_sequences)
padded_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')

label_dict = {label: idx for idx, label in enumerate(np.unique(augmented_output_labels))}
numerical_labels = [label_dict[label] for label in augmented_output_labels]

model = Sequential([
    Embedding(input_dim=num_chars, output_dim=128),
    Bidirectional(LSTM(500, return_sequences=True)),
    LSTM(500),
    Dense(len(label_dict), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(padded_sequences, np.array(numerical_labels), epochs=20, batch_size=128, validation_split=0.2)

model.save('model.h5')


Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2s/step - accuracy: 0.0941 - loss: 2.4546 - val_accuracy: 0.1882 - val_loss: 2.1559
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.1813 - loss: 2.1719 - val_accuracy: 0.1798 - val_loss: 2.1043
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.1804 - loss: 2.0706 - val_accuracy: 0.2079 - val_loss: 2.0180
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.2107 - loss: 1.9785 - val_accuracy: 0.2725 - val_loss: 1.7992
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.3343 - loss: 1.7155 - val_accuracy: 0.2978 - val_loss: 1.6973
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2s/step - accuracy: 0.3515 - loss: 1.5858 - val_accuracy: 0.4354 - val_loss: 1.3077
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━



In [50]:
neural_network_model = load_model('model.h5')

new_text = " لامپ حموم رو روشن کن"

new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded_sequence = pad_sequences(new_sequence, maxlen=max_sequence_length, padding='post')

predicted_label_index_nn = np.argmax(neural_network_model.predict(new_padded_sequence), axis=1)[0]
predicted_label_nn = list(label_dict.keys())[predicted_label_index_nn]

prediction_confidence = neural_network_model.predict(new_padded_sequence)[0][predicted_label_index_nn]
print(prediction_confidence)
confidence_threshold = 0.5

if prediction_confidence < confidence_threshold:
    print("Low confidence prediction. Please check the request.")
else:
    print("Predicted label (Neural Network):", predicted_label_nn)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
0.991939
Predicted label (Neural Network): lamp3_on


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from nltk.tokenize import word_tokenize
from hazm import Normalizer


data_file = "data.txt"
with open(data_file, 'r', encoding='utf-8') as file:
    data = file.readlines()

input_texts = []
output_labels = []
for line in data:
    if "**" in line:  
        text, label = line.strip().split("**")
        input_texts.append(text.strip())
        output_labels.append(label.strip())

augmented_input_texts = input_texts + [text[::-1] for text in input_texts]
augmented_output_labels = output_labels + output_labels

normalizer = Normalizer()
normalized_texts = [normalizer.normalize(text) for text in augmented_input_texts]

model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=word_tokenize)),
    ('svc', SVC(probability=True))
])

model_pipeline.fit(normalized_texts, augmented_output_labels)

new_text = " لامپ پذیرایی  روشن کن"

normalized_new_text = normalizer.normalize(new_text)

predicted_label_index_svm = model_pipeline.predict([normalized_new_text])[0]
prediction_confidence = max(model_pipeline.predict_proba([normalized_new_text])[0])

confidence_threshold = 0.5

if prediction_confidence < confidence_threshold:
    print("Low confidence prediction. Please check the request.")
else:
    print("Predicted label (SVM):", predicted_label_index_svm)




Predicted label (SVM): lamp1_on
