In [1]:
import numpy as np
import pandas as pd

### _Data Understanding_ 

In [2]:
from datasets import load_dataset

data = load_dataset("merve/poetry")
data.head()

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore


In [3]:
data = data[data['poem name'].isnull()==False]                   # removing the null values 
data = data[data['content'].isnull()==False]
data.shape

(571, 5)

In [4]:
data=data[data["content"].str.contains("Copyright")==False]
data.shape

(501, 5)

In [5]:
data=data[data["content"].str.contains("from Dana, 1904")==False]               # Faulty data ,needed to be removed
data.shape
#data["content"]


(500, 5)

### _Data Preparation_

In [6]:
data.content = data["content"].str.lower()       
data['content'].head()

0    let the bird of loudest lay\r\non the sole ara...
1    sir charles into my chamber coming in,\r\nwhen...
2    our vice runs beyond all that old men saw,\r\n...
3    lo i the man, whose muse whilome did maske,\r\...
4    long have i longd to see my love againe,\r\nst...
Name: content, dtype: object

In [7]:
data['content'] = data['content'].str.replace("\n", " ")                #   Remove "#" to observe the updated data 
data['content'] = data['content'].str.replace("\r", " ")
data['content'] = data['content'].str.replace("\t", " ")
data['content'] = data['content'].str.replace(","," ").replace("."," ")
#data['content']

In [8]:
data['content'] = data['content'].str.replace("[^a-zA-Z]", " ")
data['content'] = data['content'].str.replace("ing( |$)", " ")
data['content'] = data['content'].str.replace("ies( |$)", "y ")
data['content'].head()

0    let the bird of loudest lay  on the sole arabi...
1    sir charles into my chamber com in   when i wa...
2    our vice runs beyond all that old men saw   an...
3    lo i the man  whose muse whilome did maske   a...
4    long have i longd to see my love againe   stil...
Name: content, dtype: object

In [9]:
# remove list in the document
remove_list=["A",
"An",
"The",
"Aboard",
"About",
"Above",
"Absent",
"Across",
"After",
"Against",
"Along",
"Alongside",
"Amid",
"Among",
"Amongst",
"Anti",
"Around",
"As",
"At",
"Before",
"Behind",
"Below",
"Beneath",
"Beside",
"Besides",
"Between",
"Beyond",
"But",
"By",
"Circa",
"Concerning",
"Considering",
"Despite",
"Down",
"During",
"Except",
"Excepting",
"Excluding",
"Failing",
"Following",
"For",
"From",
"Given",
"In",
"Inside",
"Into",
"Like",
"Minus",
"Near",
"Of",
"Off",
"On",
"Onto",
"Opposite",
"Outside",
"Over",
"Past",
"Per",
"Plus",
"Regarding",
"Round",
"Save",
"Since",
"Than",
"Through",
"To",
"Toward",
"Towards",
"Under",
"Underneath",
"Unlike",
"Until",
"Up",
"Upon",
"Versus",
"Via",
"With",
"Within",
"Without"]

In [10]:
for words in remove_list:
    data['content'] = data['content'].str.replace(words , " ")

In [11]:
X = data.iloc[:,0:3]            # Features
y = data.iloc[:,-1]             # Label

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,1), sublinear_tf=True, analyzer= 'word')

In [13]:
X_content = vectorizer.fit_transform(X["content"].values)           # X_content is the sparse matrix formed by X["content"]     
feature_names = vectorizer.get_feature_names()                      # Features extracted from X_content storede in feature_names   
len(feature_names)

10892

In [14]:
from sklearn import preprocessing                   # Preprocessing the other columns using label encoder
 
label_au = preprocessing.LabelEncoder()
X_author = label_au.fit_transform(X.author.values)

label_poe_name = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')  
X_poemname  = label_poe_name.fit_transform(X["poem name"].values)

X_author = np.reshape(X_author, (X_author.shape[0], 1))


In [15]:
import scipy.sparse as sp                        # Joining the three formed data to form the final input features

X_final = sp.hstack([X_content,X_author, X_poemname])

### _Modelling_
#### _Classification of poems on the basis of genres , Will use 2 different classifiers_

In [16]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X_final , y , test_size = 0.20 , random_state = 0)
print(X_train.shape)
print(X_test.shape)

(400, 13444)
(100, 13444)


### Classifier-1  (Random Forest)

In [17]:
from sklearn.ensemble import RandomForestClassifier             

Classifier = RandomForestClassifier(n_estimators = 100 , criterion = "entropy" , random_state = 0)
Classifier.fit(X_train, y_train)

  from numpy.core.umath_tests import inner1d


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
y_pred = Classifier.predict(X_test)
print(y_test.values)
print(y_pred)

['Love' 'Love' 'Love' 'Love' 'Love' 'Mythology & Folklore'
 'Mythology & Folklore' 'Love' 'Love' 'Love' 'Love' 'Love' 'Nature' 'Love'
 'Love' 'Mythology & Folklore' 'Love' 'Nature' 'Love' 'Nature' 'Love'
 'Nature' 'Love' 'Love' 'Nature' 'Love' 'Nature' 'Nature' 'Love' 'Love'
 'Nature' 'Love' 'Love' 'Love' 'Nature' 'Love' 'Love' 'Love' 'Nature'
 'Love' 'Mythology & Folklore' 'Nature' 'Nature' 'Love' 'Love'
 'Mythology & Folklore' 'Love' 'Mythology & Folklore' 'Nature' 'Love'
 'Nature' 'Love' 'Nature' 'Love' 'Love' 'Mythology & Folklore'
 'Mythology & Folklore' 'Love' 'Love' 'Mythology & Folklore' 'Love' 'Love'
 'Nature' 'Nature' 'Love' 'Love' 'Mythology & Folklore' 'Love' 'Love'
 'Love' 'Love' 'Nature' 'Mythology & Folklore' 'Love' 'Love' 'Nature'
 'Love' 'Love' 'Love' 'Love' 'Love' 'Love' 'Love' 'Love'
 'Mythology & Folklore' 'Mythology & Folklore' 'Nature' 'Love' 'Nature'
 'Nature' 'Love' 'Nature' 'Nature' 'Love' 'Love' 'Nature' 'Nature' 'Love'
 'Nature' 'Love']
['Love' 'Love' 'Love' 

In [19]:
pd.DataFrame({ 'correct_class' : y_test, 
               'Predicted_class' : y_pred,
                }).reset_index()


Unnamed: 0,index,correct_class,Predicted_class
0,90,Love,Love
1,254,Love,Love
2,283,Love,Love
3,508,Love,Love
4,532,Love,Nature
5,15,Mythology & Folklore,Nature
6,316,Mythology & Folklore,Love
7,561,Love,Love
8,159,Love,Love
9,153,Love,Love


### Result Evaluation

In [20]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test , y_pred)
print(cm)

[[53  1  6]
 [10  0  3]
 [16  1 10]]


### Classifier 2 : Naive Bayes

In [21]:
from sklearn.naive_bayes import GaussianNB                                  


In [22]:
Classifier = GaussianNB()
Classifier.fit(X_train.toarray() , y_train)

GaussianNB(priors=None)

In [23]:
y_pred = Classifier.predict(X_test.toarray())
print(y_test.values)
print(y_pred)

['Love' 'Love' 'Love' 'Love' 'Love' 'Mythology & Folklore'
 'Mythology & Folklore' 'Love' 'Love' 'Love' 'Love' 'Love' 'Nature' 'Love'
 'Love' 'Mythology & Folklore' 'Love' 'Nature' 'Love' 'Nature' 'Love'
 'Nature' 'Love' 'Love' 'Nature' 'Love' 'Nature' 'Nature' 'Love' 'Love'
 'Nature' 'Love' 'Love' 'Love' 'Nature' 'Love' 'Love' 'Love' 'Nature'
 'Love' 'Mythology & Folklore' 'Nature' 'Nature' 'Love' 'Love'
 'Mythology & Folklore' 'Love' 'Mythology & Folklore' 'Nature' 'Love'
 'Nature' 'Love' 'Nature' 'Love' 'Love' 'Mythology & Folklore'
 'Mythology & Folklore' 'Love' 'Love' 'Mythology & Folklore' 'Love' 'Love'
 'Nature' 'Nature' 'Love' 'Love' 'Mythology & Folklore' 'Love' 'Love'
 'Love' 'Love' 'Nature' 'Mythology & Folklore' 'Love' 'Love' 'Nature'
 'Love' 'Love' 'Love' 'Love' 'Love' 'Love' 'Love' 'Love'
 'Mythology & Folklore' 'Mythology & Folklore' 'Nature' 'Love' 'Nature'
 'Nature' 'Love' 'Nature' 'Nature' 'Love' 'Love' 'Nature' 'Nature' 'Love'
 'Nature' 'Love']
['Love' 'Nature' 'Love

In [24]:
pd.DataFrame({ 'correct_class' : y_test, 
               'Predicted_class' : y_pred,
                }).reset_index()

Unnamed: 0,index,correct_class,Predicted_class
0,90,Love,Love
1,254,Love,Nature
2,283,Love,Love
3,508,Love,Nature
4,532,Love,Mythology & Folklore
5,15,Mythology & Folklore,Nature
6,316,Mythology & Folklore,Love
7,561,Love,Nature
8,159,Love,Love
9,153,Love,Love


### Result Evaluation

In [25]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test , y_pred)
print(cm)

[[46  4 10]
 [ 5  2  6]
 [12  3 12]]


### Analysis Of the Classifiers Used

###### Using Random Forest classifier , we got an accuracy which is around 63% which is below par.
###### Using Naive-bayes classifier , we got an accuracy which is around 60% which is very low.
_We can infer that both classifiers didn't performed well on the test data , thus more training data is required to fit the classifiers well._