In [10]:
import json

# Load the unparsed poem records from the JSON dump
with open('./../Poems_And_Outputs/Poems_Unparsed.JSON', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Parallel lists: poem text and its era label
# (0: before 1900, 1: 1900-1909, 2: 1910 onwards)
datasongs, datalabels = [], []

for song in data:
    # Records with an empty year cannot be labelled, so they are left out
    if song['year'] == "":
        continue
    datasongs.append(song['content'])
    # Every cutoff the year has reached bumps the label by one -> 0, 1 or 2
    datalabels.append(sum(int(song['year']) >= cutoff for cutoff in (1900, 1910)))


list

### **Decision Trees**

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Global hyperparameters
# Fraction of the data held out for testing (test_size of train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state to train_test_split so the split is repeatable
RANDOM_STATE = 50

def train_classifier(songs, labels):
    """Split the corpus, vectorize it as word counts and fit a decision tree.

    Args:
        songs: Sequence of raw text documents.
        labels: Sequence of class labels aligned with ``songs``.

    Returns:
        Tuple ``(classifier, vectorizer, X_test_vec, y_test)``: the fitted
        DecisionTreeClassifier, the fitted CountVectorizer, the vectorized
        held-out documents and their true labels.
    """
    # Hold out TEST_SIZE of the data; the fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        songs, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Learn the vocabulary from the training split only, then reuse it on the
    # held-out documents so no test-set tokens influence the features
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Seed the tree too: an unseeded DecisionTreeClassifier permutes features
    # randomly when choosing splits, so the reported accuracy could change
    # from one run to the next
    classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
    classifier.fit(X_train_vec, y_train)

    return classifier, vectorizer, X_test_vec, y_test

def evaluate_classifier(classifier, X_test_vec, y_test):
    """Print and return the classifier's accuracy on the held-out split.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_test_vec: Vectorized held-out documents.
        y_test: True labels for ``X_test_vec``.

    Returns:
        The value returned by ``accuracy_score`` for the predictions.
    """
    accuracy = accuracy_score(y_test, classifier.predict(X_test_vec))
    print("Accuracy:", accuracy)
    return accuracy

# Corpus and labels built in the data-loading cell
songs, labels = datasongs, datalabels

# Fit the model; keep the vectorized held-out split for scoring
classifier, vectorizer, X_test_vec, y_test = train_classifier(songs, labels)

# Score on the held-out split (evaluate_classifier also prints the accuracy)
accuracy = evaluate_classifier(classifier, X_test_vec, y_test)

# Report the final figure
print("Final Accuracy:", accuracy)

Accuracy: 0.7272727272727273
Final Accuracy: 0.7272727272727273


### **Random Forests**

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Global hyperparameters
# Fraction of the data held out for testing (test_size of train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state to train_test_split so the split is repeatable
RANDOM_STATE = 50

def train_classifier(songs, labels):
    """Split the corpus, vectorize it as word counts and fit a random forest.

    Args:
        songs: Sequence of raw text documents.
        labels: Sequence of class labels aligned with ``songs``.

    Returns:
        Tuple ``(classifier, vectorizer, X_test_vec, y_test)``: the fitted
        RandomForestClassifier, the fitted CountVectorizer, the vectorized
        held-out documents and their true labels.
    """
    # Hold out TEST_SIZE of the data; the fixed seed keeps the split repeatable
    X_train, X_test, y_train, y_test = train_test_split(
        songs, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Learn the vocabulary from the training split only, then reuse it on the
    # held-out documents so no test-set tokens influence the features
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Seed the forest too: bootstrapping and per-split feature sampling are
    # random, so an unseeded forest gives a different accuracy on every run
    classifier = RandomForestClassifier(random_state=RANDOM_STATE)
    classifier.fit(X_train_vec, y_train)

    return classifier, vectorizer, X_test_vec, y_test

def evaluate_classifier(classifier, X_test_vec, y_test):
    """Score a fitted classifier on the held-out split.

    Prints the accuracy and also returns it so the caller can reuse the value.
    """
    predicted_labels = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, predicted_labels)
    print("Accuracy:", accuracy)
    return accuracy

# Corpus and labels built in the data-loading cell
songs, labels = datasongs, datalabels

# Fit the model; keep the vectorized held-out split for scoring
classifier, vectorizer, X_test_vec, y_test = train_classifier(songs, labels)

# Score on the held-out split (evaluate_classifier also prints the accuracy)
accuracy = evaluate_classifier(classifier, X_test_vec, y_test)

# Report the final figure
print("Final Accuracy:", accuracy)

Accuracy: 0.7272727272727273
Final Accuracy: 0.7272727272727273


### **Gradient Boosting Machines (GBM)**

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Global hyperparameters
# Fraction of the data held out for testing (test_size of train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state to both train_test_split and the classifier
RANDOM_STATE = 50
# Value passed as n_estimators to GradientBoostingClassifier
N_ESTIMATORS = 100

def train_classifier(songs, labels):
    """Split the corpus, build word-count features and fit a gradient-boosting model.

    Args:
        songs: Sequence of raw text documents.
        labels: Sequence of class labels aligned with ``songs``.

    Returns:
        Tuple ``(classifier, vectorizer, X_test_vec, y_test)``: the fitted
        GradientBoostingClassifier, the fitted CountVectorizer, the vectorized
        held-out documents and their true labels.
    """
    # Seeded split so the same documents are held out on every run
    train_docs, test_docs, train_labels, y_test = train_test_split(
        songs,
        labels,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )

    # Vocabulary comes from the training documents only
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_docs)
    X_test_vec = vectorizer.transform(test_docs)

    # Boosted ensemble, seeded with the same RANDOM_STATE as the split
    classifier = GradientBoostingClassifier(
        n_estimators=N_ESTIMATORS,
        random_state=RANDOM_STATE,
    )
    classifier.fit(train_counts, train_labels)

    return classifier, vectorizer, X_test_vec, y_test

def evaluate_classifier(classifier, X_test_vec, y_test):
    """Predict on the held-out features, then print and return the accuracy."""
    accuracy = accuracy_score(y_test, classifier.predict(X_test_vec))
    print("Accuracy:", accuracy)
    return accuracy

# Corpus and labels built in the data-loading cell
songs, labels = datasongs, datalabels

# Fit the model; keep the vectorized held-out split for scoring
classifier, vectorizer, X_test_vec, y_test = train_classifier(songs, labels)

# Score on the held-out split (evaluate_classifier also prints the accuracy)
accuracy = evaluate_classifier(classifier, X_test_vec, y_test)

# Report the final figure
print("Final Accuracy:", accuracy)

Accuracy: 0.8636363636363636
Final Accuracy: 0.8636363636363636


### **Support Vector Machines (SVM)**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Global hyperparameters
# Fraction of the data held out for testing (test_size of train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state to both train_test_split and SVC
RANDOM_STATE = 50

def train_classifier(songs, labels):
    """Split the corpus, build word-count features and fit a support vector classifier.

    Args:
        songs: Sequence of raw text documents.
        labels: Sequence of class labels aligned with ``songs``.

    Returns:
        Tuple ``(classifier, vectorizer, X_test_vec, y_test)``: the fitted SVC,
        the fitted CountVectorizer, the vectorized held-out documents and
        their true labels.
    """
    # Seeded split so the same documents are held out on every run
    train_docs, test_docs, train_labels, y_test = train_test_split(
        songs,
        labels,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )

    # Vocabulary comes from the training documents only
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_docs)
    X_test_vec = vectorizer.transform(test_docs)

    # NOTE(review): scikit-learn documents SVC's random_state as only used
    # when probability=True - confirm whether the seed matters here
    classifier = SVC(random_state=RANDOM_STATE)
    classifier.fit(train_counts, train_labels)

    return classifier, vectorizer, X_test_vec, y_test

def evaluate_classifier(classifier, X_test_vec, y_test):
    """Score a fitted classifier on the held-out split.

    Prints the accuracy and also returns it so the caller can reuse the value.
    """
    predicted_labels = classifier.predict(X_test_vec)
    accuracy = accuracy_score(y_test, predicted_labels)
    print("Accuracy:", accuracy)
    return accuracy

# Corpus and labels built in the data-loading cell
songs, labels = datasongs, datalabels

# Fit the model; keep the vectorized held-out split for scoring
classifier, vectorizer, X_test_vec, y_test = train_classifier(songs, labels)

# Score on the held-out split (evaluate_classifier also prints the accuracy)
accuracy = evaluate_classifier(classifier, X_test_vec, y_test)

# Report the final figure
print("Final Accuracy:", accuracy)

Accuracy: 0.6363636363636364
Final Accuracy: 0.6363636363636364


### **Naive Bayes**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Global hyperparameters
# Fraction of the data held out for testing (test_size of train_test_split)
TEST_SIZE = 0.2
# Seed passed as random_state to train_test_split so the split is repeatable
RANDOM_STATE = 50

def train_classifier(songs, labels):
    """Split the corpus, build word-count features and fit a multinomial Naive Bayes model.

    Args:
        songs: Sequence of raw text documents.
        labels: Sequence of class labels aligned with ``songs``.

    Returns:
        Tuple ``(classifier, vectorizer, X_test_vec, y_test)``: the fitted
        MultinomialNB, the fitted CountVectorizer, the vectorized held-out
        documents and their true labels.
    """
    # Seeded split so the same documents are held out on every run
    train_docs, test_docs, train_labels, y_test = train_test_split(
        songs,
        labels,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )

    # Vocabulary comes from the training documents only
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_docs)
    X_test_vec = vectorizer.transform(test_docs)

    # Word counts feed straight into the multinomial model
    classifier = MultinomialNB()
    classifier.fit(train_counts, train_labels)

    return classifier, vectorizer, X_test_vec, y_test

def evaluate_classifier(classifier, X_test_vec, y_test):
    """Predict on the held-out features, then print and return the accuracy."""
    accuracy = accuracy_score(y_test, classifier.predict(X_test_vec))
    print("Accuracy:", accuracy)
    return accuracy

# Corpus and labels built in the data-loading cell
songs, labels = datasongs, datalabels

# Fit the model; keep the vectorized held-out split for scoring
classifier, vectorizer, X_test_vec, y_test = train_classifier(songs, labels)

# Score on the held-out split (evaluate_classifier also prints the accuracy)
accuracy = evaluate_classifier(classifier, X_test_vec, y_test)

# Report the final figure
print("Final Accuracy:", accuracy)

Accuracy: 0.5909090909090909
Final Accuracy: 0.5909090909090909
