**Article from Analytics Vidhya** <br/>
Link: https://www.analyticsvidhya.com/blog/2019/04/predicting-movie-genres-nlp-multi-label-classification/

In [1]:
import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/cmu-book-summary-dataset/booksummaries.txt


In [2]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import f1_score, accuracy_score

In [4]:
# Read the tab-separated book summaries file into a list of rows
# (each row is a list of string fields).
data = []

# encoding is given explicitly so decoding does not depend on the platform's
# locale default (assumes the dump is UTF-8 -- TODO confirm against the dataset).
# newline='' is what the csv module documents for files handed to csv.reader,
# so that the reader does its own newline handling.
with open("/kaggle/input/cmu-book-summary-dataset/booksummaries.txt", 'r',
          encoding='utf-8', newline='') as f:
    reader = csv.reader(f, dialect='excel-tab')
    # tqdm only wraps the iteration to show a running row counter.
    data.extend(tqdm(reader))

16559it [00:01, 10603.70it/s]


In [5]:
# Pull four fields out of every raw row into parallel lists, then assemble them
# into the `books` DataFrame. Only row positions 0, 2, 5 and 6 are used.
book_id, book_name, genre, summary = [], [], [], []

for row in tqdm(data):
    # (target list, position in the raw row) -- filled in the same order as the columns below.
    for bucket, position in ((book_id, 0), (book_name, 2), (genre, 5), (summary, 6)):
        bucket.append(row[position])

books = pd.DataFrame(dict(book_id=book_id, book_name=book_name,
                          genre=genre, summary=summary))
books.head(2)

100%|██████████| 16559/16559 [00:00<00:00, 488242.56it/s]


Unnamed: 0,book_id,book_name,genre,summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"": ""Satire"", ""/m/0dwly"": ""Children's literature"", ""/m/014dfn"": ""Speculative fiction"", ""/m/02xlf"": ""Fiction""}","Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, 'Beasts of England'. When Major dies, two young pigs, Snowball and Napoleon, assume command and turn his dream into a p..."
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""Novella"", ""/m/014dfn"": ""Speculative fiction"", ""/m/0c082"": ""Utopian and dystopian fiction"", ""/m/06nbt"": ""Satire"", ""/m/02xlf"": ""Fiction""}","Alex, a teenager living in near-future England, leads his gang on nightly orgies of opportunistic, random ""ultra-violence."" Alex's friends (""droogs"" in the novel's Anglo-Russian slang, Nadsat) are: Dim, a slow-witted bruiser who is the gang's muscle; Georgie, an ambitious second-in-command; and..."


In [6]:
# (rows, columns) of the assembled frame, before any rows are dropped.
books.shape

(16559, 4)

In [7]:
# Remove, in place, every row whose genre field is the empty string.
# The surviving rows keep their original index labels.
no_genre = books['genre'].eq('')
books.drop(index=books.index[no_genre], inplace=True)

# Sanity check: this should now display an empty frame.
books.loc[books['genre'].eq('')]

Unnamed: 0,book_id,book_name,genre,summary


In [8]:
# The genre field is a JSON object; its values are the genre names.
# Look at the row labelled 0 as an example.
json.loads(books.loc[0, 'genre']).values()

dict_values(['Roman à clef', 'Satire', "Children's literature", 'Speculative fiction', 'Fiction'])

In [9]:
# Decode each row's genre JSON object and keep only its values (the genre names),
# giving one list of names per book, stored alongside the raw field.
genres = [list(json.loads(raw_genre).values()) for raw_genre in books['genre']]
books['genre_new'] = genres

In [10]:
# Flatten the per-book genre lists into one list, preserving order.
# A nested comprehension is linear in the total number of labels, whereas
# sum(list_of_lists, []) re-copies the growing accumulator on every addition.
all_genres = [name for book_genres in genres for name in book_genres]

# Number of distinct genre names across all books.
len(set(all_genres))

227

In [11]:
def clean_summary(text):
    """Normalise a summary to lowercase ASCII letters separated by single spaces.

    Apostrophes are deleted outright (so "Alex's" becomes "alexs"); every other
    character that is not a-z/A-Z is turned into a space, and runs of whitespace
    are collapsed.

    Args:
        text: the raw summary string.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    without_apostrophes = text.replace("'", "")
    letters_only = re.sub(r"[^a-zA-Z]", " ", without_apostrophes)
    # split() with no argument drops leading/trailing whitespace and collapses runs.
    return " ".join(letters_only.lower().split())

In [12]:
# Clean every summary; the raw text stays available in the 'summary' column.
books['clean_summary'] = books['summary'].apply(clean_summary)
books.head(2)

Unnamed: 0,book_id,book_name,genre,summary,genre_new,clean_summary
0,620,Animal Farm,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"": ""Satire"", ""/m/0dwly"": ""Children's literature"", ""/m/014dfn"": ""Speculative fiction"", ""/m/02xlf"": ""Fiction""}","Old Major, the old boar on the Manor Farm, calls the animals on the farm for a meeting, where he compares the humans to parasites and teaches the animals a revolutionary song, 'Beasts of England'. When Major dies, two young pigs, Snowball and Napoleon, assume command and turn his dream into a p...","[Roman à clef, Satire, Children's literature, Speculative fiction, Fiction]",old major the old boar on the manor farm calls the animals on the farm for a meeting where he compares the humans to parasites and teaches the animals a revolutionary song beasts of england when major dies two young pigs snowball and napoleon assume command and turn his dream into a philosophy t...
1,843,A Clockwork Orange,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""Novella"", ""/m/014dfn"": ""Speculative fiction"", ""/m/0c082"": ""Utopian and dystopian fiction"", ""/m/06nbt"": ""Satire"", ""/m/02xlf"": ""Fiction""}","Alex, a teenager living in near-future England, leads his gang on nightly orgies of opportunistic, random ""ultra-violence."" Alex's friends (""droogs"" in the novel's Anglo-Russian slang, Nadsat) are: Dim, a slow-witted bruiser who is the gang's muscle; Georgie, an ambitious second-in-command; and...","[Science Fiction, Novella, Speculative fiction, Utopian and dystopian fiction, Satire, Fiction]",alex a teenager living in near future england leads his gang on nightly orgies of opportunistic random ultra violence alexs friends droogs in the novels anglo russian slang nadsat are dim a slow witted bruiser who is the gangs muscle georgie an ambitious second in command and pete who mostly pla...


In [13]:
from nltk.corpus import stopwords

# English stopword list as a set, for fast membership tests in remove_stopwords.
# The stopwords corpus is a separate NLTK download: if it is not installed the
# lookup raises LookupError, so fetch it once and retry instead of failing on a
# fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in `stop_words`.

    Tokens are compared exactly as they appear (no case folding is done here);
    the remaining tokens are joined back together with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

# Strip stopwords from the already-cleaned summaries, overwriting the column.
books['clean_summary'] = books['clean_summary'].apply(remove_stopwords)

In [14]:
# Learn the set of genre labels and encode each book's genre list as one row of
# a binary indicator matrix in a single step. The fitted binarizer is kept so
# predictions can be mapped back to genre names later.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(books['genre_new'])

In [15]:
# Hold out 20% of the books for validation.
# random_state is fixed so that the split -- and therefore every score reported
# below -- is the same on each Restart & Run All instead of varying per run.
x_train, x_val, ytrain, yval = train_test_split(
    books['clean_summary'], y, test_size=0.2, random_state=42)

In [16]:
# Turn the cleaned summaries into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the validation split, so validation text does not influence the
# learned vocabulary or weights.
# max_df / max_features restrict the vocabulary (see the TfidfVectorizer docs for
# their exact meaning).
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
xtrain = tfidf_vectorizer.fit_transform(x_train)
xval = tfidf_vectorizer.transform(x_val)

In [17]:
# Multi-label model: a logistic regression with default settings wrapped in a
# one-vs-rest classifier and fitted on the TF-IDF training matrix against the
# binary genre indicator matrix.
# NOTE(review): the recorded output shows repeated warning fragments during fit --
# presumably genres with no positive example in the training split; confirm.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)
clf.fit(xtrain, ytrain)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [18]:
# Hard label predictions for the validation split, scored two ways:
# micro-averaged F1, and accuracy_score (which, per the scikit-learn docs, is
# subset accuracy on multilabel input: a book only counts as correct when every
# one of its labels matches).
y_pred = clf.predict(xval)
f1_score(yval, y_pred, average="micro"), accuracy_score(yval, y_pred)

(0.39492173055589797, 0.10159595173219152)

In [19]:
# Per-genre probability estimates for the validation split, used below to apply a custom cut-off.
pred_prob = clf.predict_proba(xval)

In [20]:
# Re-score using a manual probability cut-off instead of clf.predict's own
# decision rule: a genre is assigned whenever its estimated probability is >= t.
t = 0.3
predp = (pred_prob >= t).astype(int)
f1_score(yval, predp, average="micro"), accuracy_score(yval, predp)

(0.5441007002925273, 0.129233164655508)

In [21]:
def predict(m, threshold=None):
    """Predict genre labels for a single summary string.

    The text goes through the same steps as the training data: clean_summary,
    remove_stopwords, then the already-fitted TF-IDF vectorizer.

    Args:
        m: the summary text.
        threshold: optional probability cut-off. When None (the default) the
            classifier's own predict() is used, exactly as before. When given,
            a genre is assigned whenever its predicted probability is
            >= threshold, which lets callers reuse the cut-off evaluated on the
            validation split instead of getting empty predictions.

    Returns:
        The result of multilabel_binarizer.inverse_transform for this one
        sample: a one-element list holding a tuple of genre names (the tuple
        is empty when no genre is assigned).
    """
    m = clean_summary(m)
    m = remove_stopwords(m)
    m_vec = tfidf_vectorizer.transform([m])
    if threshold is None:
        m_pred = clf.predict(m_vec)
    else:
        # Same thresholding as the validation experiment: probability -> 0/1 indicator.
        m_pred = (clf.predict_proba(m_vec) >= threshold).astype(int)
    return multilabel_binarizer.inverse_transform(m_pred)

In [22]:
# Spot-check ten randomly drawn validation books: predicted vs. actual genres.
for i in range(10):
    # One random validation row; k is its index label, shared with `books`.
    k = x_val.sample(1).index[0]
    predicted = predict(x_val.loc[k])
    print("Book: ", books.loc[k, 'book_name'],
          "\nPredicted genre: ", predicted)
    print("Actual genre: ", books.loc[k, 'genre_new'], "\n")

Book:  Penny Dreadful 
Predicted genre:  [()]
Actual genre:  ['Novel'] 

Book:  A Quiet Belief In Angels 
Predicted genre:  [()]
Actual genre:  ['Thriller'] 

Book:  The Chinese Gold Murders 
Predicted genre:  [('Detective fiction', 'Mystery')]
Actual genre:  ['Mystery', 'Detective fiction'] 

Book:  The Magician 
Predicted genre:  [()]
Actual genre:  ['Young adult literature', 'Fiction'] 

Book:  The Death of Achilles 
Predicted genre:  [()]
Actual genre:  ['Mystery', 'Novel', 'History'] 

Book:  Star Wars Republic Commando: True Colors 
Predicted genre:  [()]
Actual genre:  ['Science Fiction', 'Speculative fiction'] 

Book:  Body Bags 
Predicted genre:  [()]
Actual genre:  ['Thriller'] 

Book:  .hack//Epitaph of Twilight 
Predicted genre:  [()]
Actual genre:  ['Science Fiction'] 

Book:  Evolution 
Predicted genre:  [('Science Fiction', 'Speculative fiction')]
Actual genre:  ['Science Fiction', 'Speculative fiction', 'Fantasy'] 

Book:  The Fire Pony 
Predicted genre:  [()]
Actual ge