### Importing the libraries

In [168]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/zensei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [169]:
INPUT_FILE = 'training_dataset.csv'
INFORMATION_COLUMN = 'Information'
CATEGORY_COLUMN = 'Tag'
LABEL_COLUMN = 'Label'

### Import Dataset

In [170]:
dataset = pd.read_csv(INPUT_FILE)

print(dataset.info())
print(dataset.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Information  312 non-null    object
 1   Tag          312 non-null    object
dtypes: object(2)
memory usage: 5.0+ KB
None
                             Information    Tag
0                  Let's go to the beach  tower
1                   I want something hot    hot
2  The kids want to go out for ice cream    hot
3                         Is cold as ice    ice
4  The kids want to go out for ice cream  lolly


# Preprocessing

In [171]:
dataset['Tag'].unique()

array(['tower', 'hot', 'ice', 'lolly', 'sweet'], dtype=object)

### Label encoding

In [172]:
# Label encoding and create a new column with the encoded data
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(dataset[CATEGORY_COLUMN])

# create new column with the encoded values
dataset[LABEL_COLUMN] = label_encoder.transform(dataset[CATEGORY_COLUMN])

# create new column with the encoded values
dataset[LABEL_COLUMN] = label_encoder.transform(dataset[CATEGORY_COLUMN])

In [173]:
dataset.head()

Unnamed: 0,Information,Tag,Label
0,Let's go to the beach,tower,4
1,I want something hot,hot,0
2,The kids want to go out for ice cream,hot,0
3,Is cold as ice,ice,1
4,The kids want to go out for ice cream,lolly,2


### Creating the feature and dependent variables

In [174]:
X = dataset[INFORMATION_COLUMN]
y = dataset[LABEL_COLUMN]

In [175]:
# Checking the X and y values ...
print(X)
print('\n')
print(y)

0                             Let's go to the beach
1                              I want something hot
2             The kids want to go out for ice cream
3                                    Is cold as ice
4             The kids want to go out for ice cream
                           ...                     
307             Drizzling chocolate over my churros
308           Enjoying a piece of peanut butter pie
309    Snacking on chocolate-covered macadamia nuts
310               Drenched from the summer rainfall
311        Making a splash in the neighborhood pool
Name: Information, Length: 312, dtype: object


0      4
1      0
2      0
3      1
4      2
      ..
307    3
308    3
309    3
310    4
311    4
Name: Label, Length: 312, dtype: int64


### Creating the Bag of Words model

In [176]:
all_stopwords = stopwords.words('english')
all_stopwords.append('like')

vectorizer = TfidfVectorizer(stop_words=all_stopwords, max_features=1000)
vectorized_X = vectorizer.fit_transform(X)
vectorized_X

<312x345 sparse matrix of type '<class 'numpy.float64'>'
	with 1064 stored elements in Compressed Sparse Row format>

### Split training and test sets

In [177]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vectorized_X, y, test_size = 0.20, random_state = 0)

In [178]:
print(X_train.shape)
print(X_train)
print()
print(y_train)

(249, 345)
  (0, 128)	0.6708504825520719
  (0, 129)	0.6201021448304747
  (0, 175)	0.40673450804707656
  (1, 67)	0.4731344445213467
  (1, 267)	0.5153064407010403
  (1, 324)	0.6295719405287504
  (1, 175)	0.3379973983307649
  (2, 15)	0.6304351604608536
  (2, 162)	0.6304351604608536
  (2, 193)	0.4528830057635143
  (3, 10)	0.5647654615824802
  (3, 226)	0.5269345397210705
  (3, 123)	0.3832071145727604
  (3, 134)	0.40570814547513523
  (3, 175)	0.30320483552941097
  (4, 251)	0.6172658622285185
  (4, 179)	0.6786498659801156
  (4, 148)	0.39801660107645587
  (5, 61)	0.6629403287149748
  (5, 286)	0.7486722384082912
  (6, 144)	0.5587124198788183
  (6, 105)	0.5044458543049534
  (6, 277)	0.6583120931197457
  (7, 134)	0.5550919423178156
  (7, 227)	0.7209545097635699
  :	:
  (242, 175)	0.40099687319683935
  (243, 230)	0.543231437869796
  (243, 288)	0.47655887852663237
  (243, 93)	0.543231437869796
  (243, 300)	0.42742349621571624
  (244, 312)	0.5051171610399671
  (244, 206)	0.5051171610399671
  (244, 2

### Training the model

In [179]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

### Prediction and classification

In [180]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[18  0  0  2  0]
 [ 0  2  0  2  0]
 [ 0  0  4  0  0]
 [ 0  0  0 22  0]
 [ 0  0  0  2 11]]


0.9047619047619048

### Single prediction

In [181]:
X_test.shape

(63, 345)

In [182]:
# Wrap the single value in a list
# value = ['Going to the beach']
# value = ['I want to eat something cold']
# value = ['Chilling by the fireplace']
value = ['I like this cake']

# Transform the single value using the same vectorizer used for training
vectorized_value = vectorizer.transform(value)

y_pred = classifier.predict(vectorized_value)
category = label_encoder.inverse_transform(y_pred)
print(f"The category is: {category}")

The category is: ['sweet']
