After loading csv file in pandas dataframe i've extracted content('poems') and type('genre') from it.
Poems will be used as input samples and genre will be the output.

Then I've shuffled the data and split it into training and testing using train_test_split.
Now I've created documents, where each row of the document represents input('poem') and it's corresponding output('genre').
clean_documents() function takes each poem and filter out the punctuations, stop_words and convert each word to a lowercase letter after splitting it into tokens. PosTag is used to marking up a particular word corresponding to a particular part of speech based on its context.

After training the document, I've joined the filtered tokens back together and CountVectorizer is used to convert a collection of text documents to a matrix of tokens count.
I've used RandomForestClassifier, SVC, MultiNomialNB to classify poems and all three have been compared based on accuracy.

SVC gives the best accuracy among all three classifiers with training accuracy of about 89%, so I've used GridSearchCV to find the best parameters and testing accuracy increased to about 70%. for SVC.

In [489]:
import pandas as pd
import numpy as np
import json
import gzip, json
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB

In [490]:
# change the path and Run All cells.
path = "C:\\Users\\Archit\\Desktop\\internship\\intellify\\all.csv"

In [491]:
# It's a library in nltk that provides pos_tag value (eg. whether a word is used as adjective or a verb etc in a sentence and then 
# using lemmatizer we can change the word accordingly).
from nltk.corpus import wordnet
def get_pos_tag_value(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [492]:
df = pd.read_csv(path)
df.head(3)

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore


In [493]:
# removing '\n', '\r' errors occurs while loading dataframe. 
df = df.replace('\n','', regex=True)
df = df.replace('\r','', regex=True)
df.head(3)

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest layOn the sole Arabian...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,When I w...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,And ...","Book 7, Epigram 42",Renaissance,Mythology & Folklore


In [494]:
stop = stopwords.words('english')
punct = list(string.punctuation)
stop_words = stop+punct
stop_words[:5], len(stop_words)

(['i', 'me', 'my', 'myself', 'we'], 211)

In [495]:
data = np.array(df)
df['type'].unique()

array(['Mythology & Folklore', 'Nature', 'Love'], dtype=object)

In [496]:
# shuffling numpy array for better predictions.
np.random.shuffle(data)

In [497]:
# extracting x_train and y_train values.
x = data[:, 1]
y = data[:, 4]
x.shape, y.shape

((573,), (573,))

In [498]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.15)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((487,), (487,), (86,), (86,))

In [499]:
# creating document by appending input and output data.
# here we've 3 types of classes i.e Mythology & Folklore, Nature and love.
train_documents = []
for i in range(len(x_train)):
    train_documents.append((x_train[i], y_train[i]))
# train_documents[:1]

In [500]:
# creating documents for test.
test_documents = []
for i in range(len(x_test)):
    test_documents.append((x_test[i], y_test[i]))
#test_documents[:1]

In [501]:
# I have used wordnet Lemmatizer.
lemmatizer = WordNetLemmatizer()

In [502]:
# a function to clean our documents i.e. removing stop_words, addig posTag in documents.
def clean_documents(poems):
    clean_poems = []
    peoms_list = text_to_word_sequence(poems) # text_word_seq split words, filter out punctuation and converts text to lowercase.
    #print(peoms_list,end=" ")
    for t in peoms_list:
        if t not in stop_words:
            postag = pos_tag([t]) # getting posTag of a word. 
            #print(type(t))
            clean_word = lemmatizer.lemmatize(t, pos = get_pos_tag_value(postag[0][1]))
            clean_poems.append(clean_word.lower())
    return clean_poems

In [503]:
new_documents_train = [(clean_documents(poems), genre) for poems, genre in train_documents]

In [504]:
new_documents_test = [(clean_documents(poems), genre) for poems, genre in test_documents]

In [505]:
#print(new_documents_test[0][0], new_documents_train[0][1], end="" )

In [506]:
#print(new_documents[0][0], new_documents_test[0][1], end = " ")

In [507]:
all_input_train = np.array([" ".join(poem) for poem, genre in new_documents_train])
all_output_train = np.array([genre for poem, genre in new_documents_train])
all_input_test = np.array([" ".join(poem) for poem, genre in new_documents_test])
all_output_test = np.array([genre for poem, genre in new_documents_test])
all_input_train.shape, all_output_train.shape, all_input_test.shape, all_output_test.shape

((487,), (487,), (86,), (86,))

In [508]:
# Now i have used CountVectorizer to get maximum frequency words and ngrams can also be used for better accuracy.
count_vec = CountVectorizer(max_features = 15000, ngram_range = (1, 4))
x_train_new = count_vec.fit_transform(all_input_train)
y_train_new = all_output_train
x_test_new = count_vec.transform(all_input_test)
y_test_new = all_output_test

In [509]:
x_train_new.shape, x_test_new.shape, y_train_new.shape, y_test_new.shape

((487, 15000), (86, 15000), (487,), (86,))

In [510]:
# label_encode = LabelEncoder()
# y_train_new = label_encode.fit_transform(y_train_new)
# y_test_new = label_encode.transform(y_test_new)
# y_train_new[:2], y_test_new[:2]

In [511]:
rf = RandomForestClassifier()
rf.fit(x_train_new, y_train_new)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [512]:
# accuracy on training set.
rf.score(x_train_new, y_train_new)

0.8891170431211499

In [513]:
rf.score(x_test_new, y_test_new)

0.7209302325581395

In [514]:
svc = SVC(750, kernel = 'rbf')
svc.fit(x_train_new, y_train_new)

SVC(C=750, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [515]:
# score on training data.
svc.score(x_train_new, y_train_new)

0.8891170431211499

In [516]:
# score on testing data.
svc.score(x_test_new, y_test_new)

0.6744186046511628

In [517]:
yp = svc.predict(x_test_new)

In [518]:
mnb = MultinomialNB()
mnb.fit(x_train_new, y_train_new)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [519]:
mnb.score(x_train_new, y_train_new)

0.7967145790554415

In [520]:
mnb.score(x_test_new, y_test_new)

0.6976744186046512

In [521]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold

In [522]:
x_tr = x_train_new
y_tr = y_train_new
x_te = x_test_new
y_te = y_test_new

In [523]:
skf = StratifiedKFold(n_splits=10, shuffle = True)
skf.get_n_splits(x_tr, y_tr)

10

In [524]:
print(skf)

StratifiedKFold(n_splits=10, random_state=None, shuffle=True)


In [525]:
score = []
for trainindex, testindex in skf.split(x_tr, y_tr):
    xtr, xte = x_tr[trainindex], x_tr[testindex]
    ytr, yte = y_tr[trainindex], y_tr[testindex]
    rfc = SVC(750, kernel = 'rbf')
    rfc.fit(xtr, ytr)
    output = rfc.score(xte, yte)
    score.append(output)

In [526]:
score

[0.58,
 0.52,
 0.68,
 0.5510204081632653,
 0.5306122448979592,
 0.7291666666666666,
 0.5416666666666666,
 0.625,
 0.7083333333333334,
 0.6595744680851063]

In [527]:
# using GridSearch to find best parameters for SVC.
tuned_para = [{'kernel' : ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4], 
               'C' : [1, 10, 100, 1000, 1500]}, {'kernel' : ['linear'], 
                'C' : [1, 10, 100, 1000]}]
scores= ['precision', 'recall']

In [528]:
clf = GridSearchCV(SVC(), tuned_para, cv = 10)

In [529]:
clf.fit(x_tr, y_tr)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.01, 0.001, 0.0001], 'C': [1, 10, 100, 1000, 1500]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [530]:
print(clf.best_params_)

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}


In [531]:
clfsvc = SVC(C=100, gamma=0.0001, kernel='rbf')
clfsvc.fit(x_tr, y_tr)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [532]:
clfsvc.score(x_tr, y_tr)

0.8562628336755647

In [533]:
clfsvc.score(x_te, y_te)

0.6976744186046512

In [534]:
from sklearn.metrics import classification_report
print(classification_report(y_te, clfsvc.predict(x_te)))

                      precision    recall  f1-score   support

                Love       0.74      0.87      0.80        53
Mythology & Folklore       0.00      0.00      0.00         4
              Nature       0.67      0.48      0.56        29

         avg / total       0.68      0.70      0.68        86

