Import packages

In [1]:
import pandas as pd
import numpy as np  
import re #for datacleaning
import nltk  
nltk.download('stopwords')  
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score #for measuring accuracy in text category models
from sklearn.model_selection import train_test_split #to divide in train and test set
from sklearn.feature_extraction.text import TfidfVectorizer  #for tfidf vectorize
from sklearn import naive_bayes, svm #for text categorization models
from sklearn.feature_extraction.text import CountVectorizer #for vectorizing
from sklearn.decomposition import LatentDirichletAllocation #for the topic mining model

[nltk_data] Downloading package stopwords to /Users/Amy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The packages <b> langdetect </b> and <b> pyLDAvis </b> need to be downloaded and installed using pip. 

In [2]:
from langdetect import detect
import pyLDAvis.sklearn

# Text Categorization

Import datasets. The datasets are webscraped from the lists on Goodreads: <br>
for non-fiction: https://www.goodreads.com/list/show/134.Best_Non_Fiction_no_biographies_ <br>
for fiction: https://www.goodreads.com/list/show/2522.Cult_Classics

In [3]:
nf = pd.read_csv('booksSummaryNF.txt', sep="|", header=None)
f = pd.read_csv("booksSummaryCult_Classics.txt",sep="|", header=None)

I rename datasets' columns and add a new column specifing the genre original dataset

In [4]:
f = f.rename(columns={0: "title", 1: "rating",2:"summary"})
nf = nf.rename(columns={0: "title", 1: "rating",2:"summary"})
f["genre"]="fiction"
nf["genre"]="non fiction"

I append the two datasets.

In [5]:
df=f.append(nf,ignore_index = True)

I define the target value. 

In [6]:
X=df["summary"]
y=df["genre"]

Perform data cleaning on the dataset.

In [8]:
documents = []
stemmer = WordNetLemmatizer()

for i in range(0, len(X)):  
    document = re.sub(r'\W', ' ', str(X[i])) #remove all the special characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document) #remove all single characters
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) #remove single characters from the start
    document = re.sub(r'\s+', ' ', document, flags=re.I)  # substitute multiple spaces with single space
    document = document.lower() #converte to Lowercase
    document = document.split() #lemmatization
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    documents.append(document)

I divide the dataset in test and train.

In [9]:
Train_X, Test_X, Train_Y, Test_Y = train_test_split(documents,y,test_size=0.3)

I apply TfidfVectorizer

In [10]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(documents)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

<b> Naive Bayes classifier</b> model, first fit the model on the train set then predict the values of the test set and calculate the accuracy:

In [11]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score -> ",accuracy_score(y_pred=predictions_NB, y_true=Test_Y)*100)

Naive Bayes Accuracy Score ->  84.95726495726495


<b> SVM</b> model, first fit the model on the train set then predict the values of the test set and calculate the accuracy:

In [12]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ",accuracy_score(y_pred=predictions_SVM,y_true= Test_Y)*100)

SVM Accuracy Score ->  84.61538461538461


# Topic Mining

Import datasets. The datasets are webscraped from the lists on Goodreads: <br>
for non-fiction: https://www.goodreads.com/list/show/134.Best_Non_Fiction_no_biographies_ <br>
for fiction: https://www.goodreads.com/list/show/2522.Cult_Classics <br>
for young adult: https://www.goodreads.com/list/show/43.Best_Young_Adult_Books <br>
for sci-fi: https://www.goodreads.com/list/show/19341.Best_Science_Fiction <br>
for fantasy: https://www.goodreads.com/list/show/50.The_Best_Epic_Fantasy_fiction_ <br>
for thriller: https://www.goodreads.com/list/show/348.Thrillers <br>
for romance: https://www.goodreads.com/list/show/12362.All_Time_Favorite_Romance_Novels

In [13]:
ya= pd.read_csv("booksSummaryYoung_Adult.txt", sep="|",header=None)
f = pd.read_csv("booksSummaryCult_Classics.txt", sep="|", header=None)
nf = pd.read_csv('booksSummaryNF.txt', sep="|", header=None)
rom=pd.read_csv("booksSummaryRomance_Novels.txt", sep="|", header=None)
thr=pd.read_csv("booksSummaryThrillers.txt", sep="|", header=None)
sifi=pd.read_csv("booksSummaryScience_Fiction.txt", sep="|", header=None)
fan=pd.read_csv("booksSummaryEpic_Fantasy.txt", sep="|", header=None)
f = f.rename(columns={0: "title", 1: "rating",2:"summary"})
ya = ya.rename(columns={0: "title", 1: "rating",2:"summary"})
nf = nf.rename(columns={0: "title", 1: "rating",2:"summary"})
rom = rom.rename(columns={0: "title", 1: "rating",2:"summary"})
thr = thr.rename(columns={0: "title", 1: "rating",2:"summary"})
sifi = sifi.rename(columns={0: "title", 1: "rating",2:"summary"})
fan=fan.drop(columns=[0])
fan = fan.rename(columns={1: "title", 2: "rating", 3:"summary"})

I unite the datasets.

In [14]:
fic=f.append(ya,ignore_index = True)
fic=fic.append(nf,ignore_index=True)
fic=fic.append(rom,ignore_index=True)
fic=fic.append(thr,ignore_index=True)
fic=fic.append(sifi,ignore_index=True)
fic=fic.append(fan,ignore_index=True)

For <b>Data Cleaning</b>, I detect the language of each summary using library langdetect

In [15]:
fic["language"]=""
for i in range(0,len(fic)):
    fic.at[i,"language"]=detect(fic.loc[i,"summary"])

I drop the duplicated titles.

In [16]:
fic_clean=fic.drop_duplicates(['title'])

I take only the summaries in English.

In [17]:
fic_final=fic_clean.loc[fic_clean['language'] == "en"]
print("The dataset has ", fic_final.shape[0], "summaries.")

The dataset has  5700 summaries.


I create a vector for the words in the data set and a LDA model, with 6 topics.

In [18]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(fic_final["summary"])
print(dtm_tf.shape)
lda_tf = LatentDirichletAllocation(n_components=6, random_state=0)
lda_tf.fit(dtm_tf)

(5700, 6530)


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=6, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

Visualizing the topics

In [19]:
pyLDAvis.enable_notebook()

In [20]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
