In [202]:
import os
import json
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import nltk
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

## Required File Paths

In [126]:
BASE_PATH = "/home/richhiey/Desktop/workspace/academics/courses/semester_4/Advanced Topics in Machine Learning/course_project"
CORPUS_PATH = os.path.join(BASE_PATH, "Gutenberg_English_Fiction_1k")
DATA_PATH = os.path.join(CORPUS_PATH, "Gutenberg_19th_century_English_Fiction")
FEATURES_PATH = os.path.join(BASE_PATH, "ATiML-Project", "features")

## Quick look at the merged dataset

In [127]:
COLUMN_NAMES = [
    'Filename',
    'Positive Sentiment',
    'Negative Sentiment',
    'Objective Sentiment',
    'Female Orientation',
    'Male Orientation',
    'Number of Words',
    'Number of Paragraphs',
    'Relative Punctuation',
    'Average Words per Sentence',
    'Number of Sentences',
    'Number of Dialogs',
    'Number of Sentences with Dialogs',
    'Number of Named People',
    'Number of Named Places',
    'Number of Named Organizations'
]
all_features = pd.read_csv(os.path.join(FEATURES_PATH, "all_features_raw.csv"), usecols=COLUMN_NAMES)
meta_data = pd.read_csv(os.path.join(CORPUS_PATH, "master996.csv"), sep=";", header=0, encoding='latin1')
merged_data = pd.merge(all_features, meta_data, left_on="Filename", right_on="book_id")
merged_data

Unnamed: 0,Positive Sentiment,Negative Sentiment,Objective Sentiment,Female Orientation,Male Orientation,Filename,Number of Words,Number of Paragraphs,Relative Punctuation,Average Words per Sentence,Number of Sentences,Number of Dialogs,Number of Sentences with Dialogs,Number of Named People,Number of Named Places,Number of Named Organizations,Book_Name,book_id,guten_genre,Author_Name
0,0.069583,0.071145,0.859272,355.0,2719.0,pg10067.epub,68757,2282,0.175386,12.858986,2283,2476.5,1649,114,71,28,The Mystery of the Boule Cabinet: A Detective ...,pg10067.epub,Detective and Mystery,Stevenson| Burton Egbert
1,0.078844,0.073378,0.847779,206.0,893.0,pg1032.epub,17776,267,0.119768,17.444553,268,337.0,251,21,25,8,The Pupil,pg1032.epub,Literary,James| Henry
2,0.079451,0.083087,0.837462,4673.0,5753.0,pg10379.epub,151734,3509,0.152899,16.287462,3510,3265.0,2087,145,115,44,At Love's Cost,pg10379.epub,Literary,Garvice| Charles
3,0.057817,0.066159,0.876024,761.0,3572.0,pg10473.epub,89228,3331,0.162158,9.886759,3332,3644.5,2619,173,59,37,The Heart of the Range,pg10473.epub,Western Stories,White| William Patterson
4,0.086624,0.091678,0.821698,473.0,547.0,pg10812.epub,17883,471,0.139518,16.760075,472,341.0,264,14,13,7,The Worshipper of the Image,pg10812.epub,Literary,Gallienne| Richard Le
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,0.079953,0.079557,0.840490,6750.0,8450.0,pg766DickensDavidCopfld.epub,358420,7080,0.125389,15.768588,7081,203.5,122,274,187,121,David Copperfield,pg766DickensDavidCopfld.epub,Literary,Dickens| Charles
990,0.074366,0.078430,0.847204,2101.0,3250.0,pg786DickensHardTimes.epub,103930,2255,0.121745,14.693906,2256,27.0,16,111,55,64,Hard Times,pg786DickensHardTimes.epub,Literary,Dickens| Charles
991,0.070811,0.071905,0.857284,490.0,2866.0,pg834DoyleMemoirsSherlk.epub,87323,2055,0.124286,15.178689,2056,1913.0,1820,184,142,77,Memoirs of Shelock Holmes,pg834DoyleMemoirsSherlk.epub,Detective and Mystery,Connan| Doyle
992,0.080267,0.090601,0.829131,1015.0,1476.0,pg863Agatha1.epub,57390,2532,0.170169,9.825372,2533,2211.5,1818,80,36,24,The Mysterious Affair at Styles,pg863Agatha1.epub,Detective and Mystery,Christie| Agatha


## Prepare dataset and labels to be used for classification

In [193]:
COLUMNS_TO_DROP = ["Filename", "Book_Name", "book_id",  
                   "Author_Name", "guten_genre"]

train_dataset = pd.DataFrame() 
test_dataset = pd.DataFrame()

# 792 Instances
literary_data = merged_data[merged_data.guten_genre == "Literary"]
test_dataset = test_dataset.append(literary_data[0:100])
train_dataset = train_dataset.append(literary_data[101:])

# 111 Instances
detective = merged_data[merged_data.guten_genre == "Detective and Mystery"]
test_dataset = test_dataset.append(detective[0:30])
train_dataset = train_dataset.append(detective[31:])

western = merged_data[merged_data.guten_genre == "Western Stories"]
test_dataset = test_dataset.append(western[0:4])
train_dataset = train_dataset.append(western[5:])

sea_and_adventure = merged_data[merged_data.guten_genre == "Sea and Adventure"]
test_dataset = test_dataset.append(sea_and_adventure[0:10])
train_dataset = train_dataset.append(western[5:])

romance = merged_data[merged_data.guten_genre == "Love and Romance"]
test_dataset = test_dataset.append(romance[0:4])
train_dataset = train_dataset.append(western[4:])

allegories = merged_data[merged_data.guten_genre == "Allegories"]
test_dataset = test_dataset.append([allegories[0:1]])
train_dataset = train_dataset.append(allegories)

humor = merged_data[merged_data.guten_genre == "Humorous and Wit and Satire"]
test_dataset = test_dataset.append(humor[0:1])
train_dataset = train_dataset.append(humor[2:])

horror = merged_data[merged_data.guten_genre == "Ghost and Horror"]
test_dataset = test_dataset.append(horror[0:1])
train_dataset = train_dataset.append(horror[2:])

christmas = merged_data[merged_data.guten_genre == "Christmas Stories"]
test_dataset = test_dataset.append(christmas[0:1])
train_dataset = train_dataset.append(christmas[2:])

test_labels = test_dataset[["guten_genre"]]
test_data = test_dataset.drop(COLUMNS_TO_DROP, axis=1)

## Genre Classification with Logisitic Regression

In [198]:
labels = train_dataset[['guten_genre']]
data = train_dataset.drop(COLUMNS_TO_DROP, axis=1)
lreg = LogisticRegression(max_iter=100000, class_weight='balanced').fit(data, labels)
print(lreg.score(test_data, test_labels))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.618421052631579


## Genre Classification with Naive Bayes

In [200]:
labels = train_dataset[['guten_genre']]
data = train_dataset.drop(COLUMNS_TO_DROP, axis=1)
nb = GaussianNB()
nb = nb.fit(data, labels)
print(nb.score(test_data, test_labels))

0.618421052631579
0.506578947368421


  y = column_or_1d(y, warn=True)


## Genre Classification with SVM

In [205]:
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(data, labels)
print(clf.score(test_data, test_labels))

0.6578947368421053


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  y = column_or_1d(y, warn=True)
  Xt = transform.transform(Xt)
