### Importing the libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
# Fetch NLTK's 'stopwords' corpus so that stopwords.words(...) can be used
# when building the vectorizer's stop-word list.
nltk.download('stopwords')

In [65]:
# Path of the CSV file that is loaded into `dataset`.
INPUT_FILE = 'training_dataset.csv'
# Column holding the text that is vectorized into the features (X).
INFORMATION_COLUMN = 'Information'
# Column holding the category names that are label-encoded.
CATEGORY_COLUMN = 'Tag'
# Column created to store the encoded category; used as the target (y).
LABEL_COLUMN = 'Label'

### Import Dataset

In [None]:
# Load the training data from the configured CSV file.
dataset = pd.read_csv(INPUT_FILE)

# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() emits a stray "None" line.
dataset.info()
print(dataset.head())

# Preprocessing

In [67]:
# Distinct category names, in order of first appearance in the dataset.
# The column is addressed through CATEGORY_COLUMN so that renaming the
# column only requires changing the constant.
# NOTE: this order is NOT the order of the encoded labels; LabelEncoder
# sorts its classes, so use label_encoder.classes_ when an index-aligned
# list of names is needed.
classes = dataset[CATEGORY_COLUMN].unique()
print(classes)

['tower' 'hot' 'ice' 'lolly' 'sweet']


### Label encoding

In [68]:
# Encode each category name as an integer and store the result in a new
# column. fit_transform learns the set of classes and encodes the column in
# a single step (the original encoded and assigned the column twice).
# The fitted encoder is kept so predictions can later be mapped back to
# category names with inverse_transform.
label_encoder = preprocessing.LabelEncoder()
dataset[LABEL_COLUMN] = label_encoder.fit_transform(dataset[CATEGORY_COLUMN])

In [None]:
# Preview the first rows of the dataset, including the encoded label column.
dataset.head()

In [70]:
# Randomly reorder every row (frac=1 keeps them all; the fixed seed makes
# the order reproducible), then discard the old index in favour of a fresh
# sequential one.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

### Creating the feature and dependent variables

In [71]:
# Feature variable: the text column. Dependent variable: the encoded label.
X, y = dataset[INFORMATION_COLUMN], dataset[LABEL_COLUMN]

In [None]:
# Inspect the features and the target, with two blank lines between them.
print(X, y, sep='\n\n\n')

### Creating the TF-IDF model (a weighted Bag of Words)

In [None]:
# NLTK's English stop words, extended with the word 'like'.
all_stopwords = [*stopwords.words('english'), 'like']

# Turn the text into TF-IDF features, ignoring the stop words and keeping at
# most 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000, stop_words=all_stopwords)
vectorized_X = vectorizer.fit_transform(X)
vectorized_X

### Split training and test sets

In [74]:
from sklearn.model_selection import train_test_split

# Split the RAW text first, then fit the TF-IDF vectorizer on the training
# portion only. Fitting it on the whole corpus before splitting lets the
# test documents influence the vocabulary and IDF weights (data leakage),
# which makes the reported test metrics optimistic.
# The same test_size/random_state select the same rows as splitting the
# pre-vectorized matrix would, so y_train/y_test are unchanged.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Refit the existing vectorizer on training text; the test text (and any
# later single prediction) is only transformed with that fitted state.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

In [None]:
# Show the training matrix's shape and contents, a blank line, then the
# training labels.
print(X_train.shape, X_train, '', y_train, sep='\n')

### Training the model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the trained model (and therefore the reported metrics)
# is reproducible, consistent with the seeded shuffle and train/test split.
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

### Prediction and classification

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the encoded labels of the held-out test set.
y_pred = classifier.predict(X_test)

# Pin the label set to every encoded class (LabelEncoder assigns
# 0..n_classes-1). Without `labels`, confusion_matrix only includes labels
# that occur in y_test or y_pred, so a class missing from both would
# silently shrink the matrix and misalign it with the class names.
all_labels = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=all_labels)
print(cm)

# Fraction of test samples classified correctly (displayed as cell output).
accuracy_score(y_test, y_pred)

In [None]:
plt.figure(figsize=(8, 6))
# Adjust font size as needed
sns.set(font_scale=1.2)

# Rows/columns of the confusion matrix follow the sorted encoded labels,
# i.e. the order of label_encoder.classes_. The first-appearance order
# returned by Series.unique() does not match and would mislabel the axes.
# NOTE(review): assumes `cm` has one row/column per encoded class - confirm
# every class occurs in the test labels or predictions.
class_names = label_encoder.classes_
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
# To save the figure, uncomment the next line; it has to run before
# plt.show().
# plt.savefig("confusion.png")
plt.show()

### Plot Confusion Matrix

### Single prediction

In [None]:
# Dimensions of the vectorized test set (displayed as cell output).
X_test.shape

In [None]:
# The vectorizer expects an iterable of documents, so the single sentence is
# wrapped in a list. Other inputs to try:
value = ['Going to the beach']
# value = ['I want to eat something cold']
# value = ['Chilling by the fireplace']
# value = ['I like this cake']

# Transform the single value using the same vectorizer used for training
vectorized_value = vectorizer.transform(value)

# Use a dedicated name for this one-off prediction: reassigning `y_pred`
# would clobber the test-set predictions that pair with `y_test`.
value_pred = classifier.predict(vectorized_value)

# Map the encoded label back to its category name.
category = label_encoder.inverse_transform(value_pred)
print(f"The category is: {category}")