In [9]:
import os
import json
import sklearn as sk
import numpy as np
import pandas as pd
from sklearn import preprocessing
from matplotlib import pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import nltk
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample

## Required File Paths

In [2]:
BASE_PATH = "/home/richhiey/Desktop/workspace/academics/courses/semester_4/Advanced Topics in Machine Learning/course_project"
CORPUS_PATH = os.path.join(BASE_PATH, "Gutenberg_English_Fiction_1k")
DATA_PATH = os.path.join(CORPUS_PATH, "Gutenberg_19th_century_English_Fiction")
FEATURES_PATH = os.path.join(BASE_PATH, "ATiML-Project", "features")

## Quick look at the merged dataset

In [3]:
COLUMN_NAMES = [
    'Filename',
    'Number of Named People',
    'Number of Named Places',
    'Number of Named Organizations',
    'Number of Sentences',
    'Female Orientation',
    'Male Orientation',
    'Positive Sentiment',
    'Negative Sentiment',
    'Objective Sentiment',
    'Number of Words',
    'Number of Paragraphs',
    'Relative Punctuation',
    'Number of Dialogs',
    'Number of Sentences with Dialogs'  
]
all_features = pd.read_csv(os.path.join(FEATURES_PATH, "all_features_raw.csv"), usecols=COLUMN_NAMES)
meta_data = pd.read_csv(os.path.join(CORPUS_PATH, "master996.csv"), sep=";", header=0, encoding='latin1')
merged_data = pd.merge(all_features, meta_data, left_on="Filename", right_on="book_id")
merged_data["guten_genre"]

0      Detective and Mystery
1                   Literary
2                   Literary
3            Western Stories
4                   Literary
5                   Literary
6      Detective and Mystery
7                   Literary
8                   Literary
9                   Literary
10                  Literary
11                  Literary
12                  Literary
13                  Literary
14                  Literary
15          Ghost and Horror
16                  Literary
17                  Literary
18                  Literary
19                  Literary
20     Detective and Mystery
21                  Literary
22                  Literary
23                  Literary
24                  Literary
25                  Literary
26                  Literary
27                  Literary
28                  Literary
29                  Literary
               ...          
964                 Literary
965                 Literary
966                 Literary
967           

## Prepare dataset and labels to be used for classification

The main problem that we face with the dataset, even after extracting explainable features, is that the output classes are imbalanced. The number of labeled instances for each of the classes is shown below:

Literary:**792**

Detective and Mystery:**111**

Sea and Adventure:**36**

Western Stories:**18**

Love and Romance:**18**

Ghost and Horror:**6**

Humorous and Wit and Satire:**6**

Christmas Stories:**5**

Allegories:**2**

As we can see, the last 4 entries seem outnumbered compared to the first. The label "Allegories" specially has only 2 labeled instances corresponding to it. I don't know how useful it could be predict such a class. In my opinion, it would be better to skip this class and focus on the others.

In the case of "Christmas Stories, "Humor..", and "Ghost and Horror", we have counts of 6, 6 and 5 respectively. Hence, I allowed for the possibility of duplicates and upsampled to get 20 instances in each class. This is a simple hack which increases the signal strength for any learning algorithm for a particular class.

Similarly, we round off "Love and Romance" and "Western Stories" to 40 instances and "Sea and Adventure" to 60 instances by upsampling.

For the other two majority classes "Literary" and "Detective and Mystery", we do not upsample.

Overall, by upsampling, the dataset now looks something like this- 

Literary:**792**

Detective and Mystery:**111**

Sea and Adventure:**60**

Western Stories:**40**

Love and Romance:**40**

Ghost and Horror:**20**

Humorous and Wit and Satire:**20**

Christmas Stories:**20**

I don't know whether this is the right way to solve a class imbalance, but it seemed like a reasonable approach to me.

In [4]:
COLUMNS_TO_DROP = ["Filename", "Book_Name", "book_id", "Author_Name", "guten_genre"]

# THIS METHOD IS A PROBLEM - Our classes are super imbalanced, so doing a random split might 
# lead to some classes not even being in the test or train set
labels = merged_data[['guten_genre']]
data = merged_data.drop(COLUMNS_TO_DROP, axis=1)
min_max_scaler = preprocessing.MinMaxScaler()
x_train, x_test, y_train, y_test = train_test_split(min_max_scaler.fit_transform(data), labels, stratify=labels, test_size=0.2)

# Custom splitting of classes is needed!
def create_training_and_test_sets(labeled_instances, test_size, resample_num):
    resampled = resample(
        labeled_instances, 
        replace=True,
        n_samples=resample_num,
        random_state=123
    )
    return train_test_split(resampled, test_size=test_size)

train_dataset = pd.DataFrame() 
test_dataset = pd.DataFrame()

# Skipping label=Allegories since there are only two instances
labels = [
    "Detective and Mystery",
    "Literary",
    "Christmas Stories",
    "Western Stories",
    "Sea and Adventure",
    "Love and Romance",
    "Humorous and Wit and Satire",
    "Ghost and Horror"
]

for label in labels:
    sliced_dataset = merged_data[merged_data.guten_genre == label]
    if (len(sliced_dataset) < 10):
        # If count is very small, then upsample
        train_set, test_set = create_training_and_test_sets(sliced_dataset, 0.3, 20)
    elif (len(sliced_dataset) < 20):
        train_set, test_set = create_training_and_test_sets(sliced_dataset, 0.3, 40)
    elif (len(sliced_dataset) < 40):
        # If count is less than 100, then resample to 50 
        train_set, test_set = create_training_and_test_sets(sliced_dataset, 0.3, 60)
    else:
        # If count is very large, then split directly
        train_set, test_set = train_test_split(sliced_dataset, test_size=0.3)
    train_dataset = pd.concat([train_dataset, train_set])
    test_dataset = pd.concat([test_dataset, test_set])


x_train = train_dataset.drop(COLUMNS_TO_DROP, axis=1)
x_test = test_dataset.drop(COLUMNS_TO_DROP, axis=1)
x_train = pd.DataFrame(min_max_scaler.fit_transform(x_train))
x_test = pd.DataFrame(min_max_scaler.fit_transform(x_test))
y_train = train_dataset[["guten_genre"]]
y_test = test_dataset[["guten_genre"]]

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [5]:
def sensitivity(X, Y, clf, majority="Literary"):
    X=np.take(X, np.where(Y!="Literary")[0], axis=0)
    Y=np.take(Y, np.where(Y!="Literary")[0], axis=0)
    return(np.mean(clf.predict(X) == Y))

## Genre Classification with Logisitic Regression

In [7]:
lreg = LogisticRegression(max_iter=100000).fit(x_train, y_train)
print(lreg.score(x_test, y_test))
print(sensitivity(x_test, y_test['guten_genre'], lreg))

0.7228915662650602
0.02127659574468085


  y = column_or_1d(y, warn=True)


## Genre Classification with Naive Bayes

In [12]:
nb = GaussianNB()
nb = nb.fit(x_train, y_train)
print(nb.score(x_test, y_test))
print(sensitivity(x_test, y_test['guten_genre'], nb))

0.4126506024096386
0.3617021276595745


  y = column_or_1d(y, warn=True)


## Genre Classification with SVM

In [13]:
clf = make_pipeline(StandardScaler(), SVC(kernel='linear',gamma='auto'))
clf.fit(x_train, y_train)
print(clf.score(x_test, y_test))
print(sensitivity(x_test, y_test['guten_genre'], clf))

0.7168674698795181
0.0


  y = column_or_1d(y, warn=True)


## Neural Network

In [None]:
nn = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10000).fit(x_train, y_train)
print(nn.score(x_test, y_test))
print(sensitivity(x_test, y_test['guten_genre'], nn))

  y = column_or_1d(y, warn=True)
