In [None]:
import nltk
import random
import json
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

In [None]:
intents = {
  "intents": [
    {
      "tag": "MachineConditionCheck",
      "patterns": ["How's the machine looking?", "Check machine condition", "Machine status?"],
      "responses": ["The machine is currently in good condition." "All performance metrics are normal."],
      "context_set": ""
    },
    {
      "tag": "ProblemDetected",
      "patterns": ["What should we do if there’s a problem?","What if there’s an issue?", "How to handle machine problems?"],
      "responses": ["If you see any issues, here's what to do:"
                   "Unusual Noises: Check for loose parts or wear. Tighten or replace parts as needed."
                   "Temperature Spikes: Ensure cooling systems are working. Clean or repair cooling components if necessary."
                   "Vibration Issues: Inspect for misalignment or imbalance. Adjust or replace parts to correct the problem."
                   "Error Messages: Follow the error code instructions. Consult the manual or contact support if needed.`"],
      "context_set": ""
    },
    {
      "tag": "UnclearProblem",
      "patterns": ["What if the problem isn’t clear?", "What if I can’t identify the issue?", "How to handle unclear problems?"],
      "responses": ["If the issue isn’t obvious, perform a diagnostic check." "Look at system logs for more details." "If needed, escalate the issue to a senior technician."],
      "context_set": ""
    },
    {
      "tag": "PreventFutureProblems",
      "patterns": ["How can we prevent future problems?", "What to do to avoid issues in the future?", "How to prevent machine failures?"],
      "responses": ["After fixing the issue, review the cause and update maintenance practices." "Adjust schedules or improve monitoring based on new data."],
      "context_set": ""
    },
    {
      "tag": "GeneralHelp",
      "patterns": ["Thanks", "Thank you", "I need more help"],
      "responses": ["You’re welcome! If you need more help, just ask."],
      "context_set": ""
    }

  ]
}

In [None]:
def synonym_replacement(tokens, limit):
    augmented_sentences = []
    for i in range(len(tokens)):
        synonyms = []
        for syn in wordnet.synsets(tokens[i]):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
        if len(synonyms) > 0:
            num_augmentations = min(limit, len(synonyms))
            sampled_synonyms = random.sample(synonyms, num_augmentations)
            for synonym in sampled_synonyms:
                augmented_tokens = tokens[:i] + [synonym] + tokens[i+1:]
                augmented_sentences.append(' '.join(augmented_tokens))
    return augmented_sentences

In [None]:
!pip install nltk numpy pandas




In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Now you can proceed with your script
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
text_data = []
labels = []
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


limit_per_tag = 40

for intent in intents['intents']:
    augmented_sentences_per_tag = 0
    for example in intent['patterns']:
        tokens = nltk.word_tokenize(example.lower())
        filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords and token.isalpha()]
        if filtered_tokens:
            text_data.append(' '.join(filtered_tokens))
            labels.append(intent['tag'])

            augmented_sentences = synonym_replacement(filtered_tokens, limit_per_tag - augmented_sentences_per_tag)
            for augmented_sentence in augmented_sentences:
                text_data.append(augmented_sentence)
                labels.append(intent['tag'])
                augmented_sentences_per_tag += 1
                if augmented_sentences_per_tag >= limit_per_tag:
                    break

print(len(text_data))
print(len(labels))

215
215


In [None]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text_data)
y = labels

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def find_best_model(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=100)


    models = [
        ('Logistic Regression', LogisticRegression(), {
            'penalty': ['l2'],
            'C': [0.1, 1.0, 10.0],
            'solver': ['liblinear'],
            'max_iter': [100, 1000, 10000]
        }),
        ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        ('Linear SVC', LinearSVC(), {
            'penalty': ['l2'],
            'loss': ['hinge', 'squared_hinge'],
            'C': [0.1, 1, 10],
            'max_iter': [100, 1000, 10000]
        }),
        ('Decision Tree', DecisionTreeClassifier(), {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }),
        ('Random Forest', RandomForestClassifier(), {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        })
    ]

    for name, model, param_grid in models:
        grid = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f'{name}: {score:.4f} (best parameters: {grid.best_params_})')

    best_model = max(models, key=lambda x: GridSearchCV(x[1], x[2], cv=3, n_jobs=-1).fit(X_train, y_train).score(X_test, y_test))
    print(f'\nBest model: {best_model[0]}')


    best_model[1].fit(X, y)

    return best_model[1]

In [None]:
best_model = find_best_model(X, y)

Logistic Regression: 0.8837 (best parameters: {'C': 1.0, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'})
Multinomial Naive Bayes: 0.8837 (best parameters: {'alpha': 0.1})
Linear SVC: 0.8837 (best parameters: {'C': 0.1, 'loss': 'squared_hinge', 'max_iter': 100, 'penalty': 'l2'})
Decision Tree: 0.8837 (best parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5})
Random Forest: 0.9070 (best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100})

Best model: Random Forest


In [None]:
def chatbot_response(user_input):
    input_text = vectorizer.transform([user_input])
    predicted_intent = best_model.predict(input_text)[0]

    for intent in intents['intents']:
        if intent['tag'] == predicted_intent:
            response = random.choice(intent['responses'])
            break

    return response

In [None]:
print('Hello! I am a chatbot. How can I help you today? Type "quit" to exit.')
while True:
    user_input = input('> ')
    if user_input.lower() == 'quit':
        break
    response = chatbot_response(user_input)
    print(response)

Hello! I am a chatbot. How can I help you today? Type "quit" to exit.
> quit


In [None]:
import os
print(os.getcwd())


/content


In [None]:
import pickle

# Your data
data = {"key": "value"}

# Save to a .pkl file
with open("data.pkl", "wb") as f:
    pickle.dump(data, f)

# Check if the file exists in the working directory
!ls


data.pkl  sample_data


In [None]:
from google.colab import files

# Download the first file
files.download('best_model.pkl')

# If you have two separate files, give the second one a unique name
# Download the second file (if it exists with a different name or location)
files.download('best_model_2.pkl')  # Change the name as per your file structure


FileNotFoundError: Cannot find file: best_model.pkl

In [17]:
import joblib

# Assuming 'model' is your trained model
joblib.dump(model, 'best_model.joblib')


NameError: name 'model' is not defined

In [21]:
import joblib

# Assuming 'model' is your trained model
joblib.dump( best_model,'best_model.joblib')


['best_model.joblib']

In [22]:
import joblib

# Assuming 'model' is your trained model
joblib.dump( vectorizer,'vectorizer.joblib')


['vectorizer.joblib']

In [23]:
import joblib

# Load the saved model
model = joblib.load('best_model.joblib')
print(model.get_params())


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


In [24]:
# Assuming you have input data X
predictions = model.predict(X)
print(predictions)


['MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'MachineConditionCheck'
 'MachineConditionCheck' 'MachineConditionCheck' 'M

In [25]:
print(type(model))


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
