In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

def classify_decade(year_str):
    """Map a year to one of three period classes.

    Args:
        year_str: The year, in any form accepted by ``int()``.

    Returns:
        0 for years before 1900, 1 for 1900 through 1910 inclusive,
        and 2 for years after 1910.

    Raises:
        ValueError: If ``year_str`` is a string that ``int()`` cannot parse.
    """
    parsed_year = int(year_str)
    # Guard clauses, checked from the earliest period to the latest.
    if parsed_year < 1900:
        return 0
    if parsed_year <= 1910:
        return 1
    return 2

# Load the poem corpus. The encoding is pinned to UTF-8 so the non-ASCII
# (presumably Hebrew) text decodes the same way on every platform instead of
# depending on the locale's default codec.
with open("Poems_Unparsed.JSON", "r", encoding="utf-8") as file:
    data = json.load(file)

# Keep only poems that carry a year; filtering once keeps texts and labels aligned.
dated_poems = [poem for poem in data if poem["year"] != ""]
poems_X = [poem["content"] for poem in dated_poems]
poems_Y = [classify_decade(poem["year"]) for poem in dated_poems]

# Hold out 20% of the dated poems for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(poems_X, poems_Y, test_size=0.2, random_state=42)

# Train a bag-of-words Naive Bayes model to predict a poem's period class from its content.
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Print the accuracy on the held-out split.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")



In [7]:
import json
# Reload the corpus. UTF-8 is pinned so decoding does not depend on the
# platform's default codec.
with open("Poems_Unparsed.JSON", "r", encoding="utf-8") as file:
    data = json.load(file)

# Parallel lists in file order: each poem's text and the value stored under its "poem" key
# (used later as the poem's display name).
poems = [poem["content"] for poem in data]
names = [poem["poem"] for poem in data]

In [None]:
# The following script shows how to use the model to predict the topics of a poem:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Candidate topic labels; a predicted class index i is reported as topics[i].
topics = ['מוות', 'צער', 'יהדות', 'ציונות', 'פטריוטיזם', 'בכי', 'אהבה','מלחמה', 'פוליטיקה', 'תקווה', 'נעורים']

# Load the tokenizer and model.
# num_labels is tied to the topic list so that every topic has its own output
# logit; with the library default (2 labels unless the checkpoint's config says
# otherwise) only the first entries of `topics` could ever be predicted.
# NOTE(review): the checkpoint name suggests a base (not fine-tuned) model, so
# the classification head is presumably freshly initialised -- fine-tune it on
# labelled poems before trusting the predictions. TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('onlplab/alephbert-base')
model = AutoModelForSequenceClassification.from_pretrained('onlplab/alephbert-base', num_labels=len(topics))

def extract_topics(poem):
    """Predict topic names for a poem using the notebook-level tokenizer and model.

    Args:
        poem: Text handed to ``tokenizer`` (a single string, or whatever batch
            form the tokenizer accepts).

    Returns:
        A list with one entry of ``topics`` per row of the model's logits:
        the entry at that row's arg-max index.

    Raises:
        IndexError: If a predicted class index is not a valid position in ``topics``.
    """
    # Imported locally: this helper is the only code here that needs torch directly.
    import torch

    # Tokenize the poem, truncating over-long input and returning PyTorch tensors.
    inputs = tokenizer(poem, truncation=True, padding=True, return_tensors='pt')

    # Inference only: disabling autograd avoids building a gradient graph on every call.
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class per row, converted to plain ints for list indexing.
    predicted_labels = outputs.logits.argmax(dim=1).tolist()

    # Translate class indices into their topic names.
    return [topics[label] for label in predicted_labels]



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence-classification model come from the same checkpoint,
# so the checkpoint name is spelled once and shared by both loads.
_checkpoint = 'onlplab/alephbert-base'
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(_checkpoint)

def extract_topics(poem):
    """Return the predicted class index for a poem.

    NOTE: this redefines ``extract_topics``; an earlier cell defines a function
    of the same name that returns topic names instead of an index.

    Args:
        poem: Text handed to the notebook-level ``tokenizer``.

    Returns:
        The first element of the arg-max over dim 1 of the model's logits,
        i.e. the predicted class index for the first input. It is returned as
        a tensor element, not as a topic name.
    """
    # Imported locally: this helper is the only code here that needs torch directly.
    import torch

    # Tokenize the poem, truncating over-long input and returning PyTorch tensors.
    inputs = tokenizer(poem, truncation=True, padding=True, return_tensors='pt')

    # Inference only: disabling autograd avoids building a gradient graph on every call.
    with torch.no_grad():
        outputs = model(**inputs)

    # Highest-scoring class for each row of the logits.
    predicted_labels = outputs.logits.argmax(dim=1)

    # Only the first row's prediction is returned.
    return predicted_labels[0]

In [None]:
# Report the prediction for every poem, labelled with its name; the texts and
# names are paired positionally.
for name, poem in zip(names, poems):
    report_line = f"poem: {name}, topics: {extract_topics(poem)}"
    print(report_line)

In [None]:
# Split the dated poems' texts into three periods: before 1900, 1900-1910
# inclusive, and after 1910.
# Years appear to be stored as strings (possibly empty), so entries without a
# year are skipped and the rest are parsed with int() before being compared.
_dated_poems = [(int(poem["year"]), poem["content"]) for poem in data if poem["year"] != ""]
poems_early = [content for year, content in _dated_poems if year < 1900]
poems_mid = [content for year, content in _dated_poems if 1900 <= year <= 1910]
poems_late = [content for year, content in _dated_poems if year > 1910]