In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Dropout 
from tensorflow.keras.optimizers import Adam 
from tensorflow.keras.utils import to_categorical
from imblearn.metrics import classification_report_imbalanced
import matplotlib.pyplot as plt

In [7]:
import tensorflow as tf
print("TensorFlow Keras version:", tf.keras.__version__)
from transformers import AutoTokenizer as at
print("TensorFlow Keras version:", at.__version__)


TensorFlow Keras version: 3.7.0


ModuleNotFoundError: No module named 'transformers'

In [65]:
data = pd.read_csv("Poem_classification_data.csv")

In [66]:
data

Unnamed: 0,Genre,Poem
0,Music,
1,Music,In the thick brushthey spend the...
2,Music,Storms are generous. ...
3,Music,—After Ana Mendieta Did you carry around the ...
4,Music,for Aja Sherrard at 20The portent may itself ...
...,...,...
836,Environment,Why make so much of fragmentary blue In here a...
837,Environment,"Woman, I wish I didn't know your name. What co..."
838,Environment,"Yonder to the kiosk, beside the creek, Paddle ..."
839,Environment,You come to fetch me from my work to-night Whe...


In [67]:
data.sample(5)

Unnamed: 0,Genre,Poem
165,Music,"Take it easy, Sadness. Settle down.You asked f..."
694,Environment,I have taken scales from off The cheeks of the...
466,Death,"Who goes there? hankering, gross, mystical, nu..."
520,Affection,Pleasure is black.I no longer imaginewhere my ...
507,Affection,"O my luve's like a red, red rose, That's newly..."


In [68]:
data.isnull().sum() 

Genre    0
Poem     4
dtype: int64

In [69]:
data.Genre.value_counts()

Genre
Music          239
Death          234
Environment    227
Affection      141
Name: count, dtype: int64

In [70]:
df = data.copy()
df = df.dropna()


In [71]:
X = df['Poem']
y = df['Genre']
y.unique()

array(['Music', 'Death', 'Affection', 'Environment'], dtype=object)

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [73]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb


model1 = make_pipeline_imb(TfidfVectorizer(), RandomUnderSampler(), MultinomialNB())
model1.fit(X_train, y_train)

y_pred1 = model1.predict(X_test)

In [74]:
from imblearn.metrics import classification_report_imbalanced


print(classification_report_imbalanced(y_test, y_pred1))

                   pre       rec       spe        f1       geo       iba       sup

  Affection       0.28      0.68      0.64      0.39      0.66      0.44        28
      Death       0.33      0.32      0.70      0.32      0.47      0.21        53
Environment       0.48      0.33      0.89      0.39      0.55      0.28        39
      Music       0.30      0.12      0.88      0.18      0.33      0.10        48

avg / total       0.35      0.33      0.79      0.31      0.48      0.24       168



In [75]:
## dopolnitelno

In [76]:
df = data.copy()
df = df.dropna()

X = df['Poem']
y = df['Genre']

In [77]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# print(f"Training class distributions summary: {Counter(y_train)}")
# print(f"Test class distributions summary: {Counter(y_test)}")

In [79]:
y_train_cat = to_categorical(y_train) 
y_test_cat = to_categorical(y_test)

In [80]:
model2 = Sequential()
model2.add(Dense(128, input_dim=X_train_tfidf.shape[1], activation='relu'))
model2.add(Dropout(0.3))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(len(label_encoder.classes_), activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [81]:
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [82]:
history = model2.fit(X_train_tfidf, y_train_cat, epochs=5, batch_size=16, validation_data=(X_test_tfidf, y_test_cat))

Epoch 1/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.3036 - loss: 1.3782 - val_accuracy: 0.3512 - val_loss: 1.3726
Epoch 2/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7275 - loss: 1.2677 - val_accuracy: 0.3452 - val_loss: 1.3407
Epoch 3/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7849 - loss: 0.8943 - val_accuracy: 0.4167 - val_loss: 1.3291
Epoch 4/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9206 - loss: 0.4126 - val_accuracy: 0.4286 - val_loss: 1.4581
Epoch 5/5
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9429 - loss: 0.1853 - val_accuracy: 0.4107 - val_loss: 1.6485


In [83]:
loss, accuracy = model2.evaluate(X_test_tfidf, y_test_cat)
print(f'Loss: {loss}, Accuracy: {accuracy}')

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.4363 - loss: 1.5895 
Loss: 1.6484843492507935, Accuracy: 0.4107142984867096


In [84]:
print(f"Training class distributions summary: {Counter(label_encoder.inverse_transform(y_train))}")
print(f"Test class distributions summary: {Counter(label_encoder.inverse_transform(y_test))}")

Training class distributions summary: Counter({'Environment': 190, 'Music': 187, 'Death': 186, 'Affection': 106})
Test class distributions summary: Counter({'Music': 51, 'Death': 45, 'Environment': 37, 'Affection': 35})


In [85]:
y_pred2 =model2.predict(X_test_tfidf) 
y_pred_classes = y_pred2.argmax(axis=1) 
print(classification_report_imbalanced(y_test, y_pred_classes))

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
                   pre       rec       spe        f1       geo       iba       sup

          0       0.57      0.23      0.95      0.33      0.47      0.20        35
          1       0.40      0.38      0.79      0.39      0.55      0.29        45
          2       0.41      0.38      0.85      0.39      0.57      0.31        37
          3       0.39      0.59      0.60      0.47      0.59      0.35        51

avg / total       0.43      0.41      0.78      0.40      0.55      0.29       168



In [86]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_classes))

0.4107142857142857
