In [96]:
import glob,os,re,shutil,pickle
import gdown,gzip
import string
import pandas as pd
import numpy as np

In [188]:
def get_datalist(typ=None):
    """List the dataset files published in the MengtingWan/goodreads index.

    Reads ``gdrive_id.csv`` from GitHub (network access required), derives a
    ``type`` and a ``content`` column from every file name, displays the
    ``type``/``content`` columns and returns the whole table.

    Parameters
    ----------
    typ : str, optional
        When given, only rows whose ``type`` equals this value are kept
        (for example ``'reviews'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``link``, ``type`` and ``content``, sorted by
        ``type`` and re-indexed from 0.
    """
    gdrive_csv_url ='https://raw.githubusercontent.com/MengtingWan/goodreads/master/gdrive_id.csv'
    try:
        # ``on_bad_lines`` is the pandas >= 1.3 spelling; ``error_bad_lines``
        # was removed in pandas 2.0 and raises TypeError there.
        datasets_df = pd.read_csv(gdrive_csv_url, on_bad_lines='skip')
    except TypeError:
        # Older pandas releases only know the legacy keyword.
        datasets_df = pd.read_csv(gdrive_csv_url, error_bad_lines=False)
    datasets_map = dict(zip(datasets_df['name'].values, datasets_df['id'].values))
    dfdmap = pd.DataFrame(list(datasets_map.items()), columns=['file', 'link'])
    # e.g. 'goodreads_reviews_young_adult.json.gz'
    #   -> type 'reviews', content ['young', 'adult']
    dfdmap['type'] = dfdmap['file'].apply(lambda x: x.split('_')[1].rsplit('.')[0])
    dfdmap['content'] = dfdmap['file'].apply(lambda x: x.split('.')[0].split('_')[2:])
    dfdmap = dfdmap.sort_values(by=['type'])
    if typ is not None:
        dfdmap = dfdmap[dfdmap['type'] == typ]
    dfdmap = dfdmap.reset_index(drop=True)
    # Show every row/column of the listing regardless of the display options.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(dfdmap[['type', 'content']])
    return dfdmap

def download_data(ddir,selected_id,data_sources):
    """Make sure the selected dataset exists, decompressed, inside ``ddir``.

    Parameters
    ----------
    ddir : str
        Directory that holds the downloaded archive and the extracted file.
        It is created when missing.
    selected_id : int
        Positional row of ``data_sources`` to fetch.
    data_sources : pandas.DataFrame
        Table with a ``file`` column (archive name) and a ``link`` column
        (Google Drive id).

    Returns
    -------
    str
        ``os.path.join(ddir, <file name without a trailing '.gz'>)``.

    Raises
    ------
    FileNotFoundError
        If the archive is still missing after the download attempt.
    """
    entry = data_sources.iloc[selected_id]
    source_id = entry['link']
    file_name = entry['file']
    # str.rstrip('.gz') strips *characters* ('log.gz' -> 'lo'); remove the
    # suffix explicitly instead.
    is_gzip = file_name.endswith('.gz')
    base_name = file_name[:-len('.gz')] if is_gzip else file_name
    archive_path = os.path.join(ddir, file_name)
    # Build the target from ``ddir`` only, so the existence check below looks
    # at the same path that is written and returned.
    json_file = os.path.join(ddir, base_name)
    if os.path.exists(json_file):
        return json_file
    if ddir:
        os.makedirs(ddir, exist_ok=True)
    if not os.path.exists(archive_path):
        gdrive_url='https://drive.google.com/uc?id='+source_id
        gdown.download(gdrive_url, output=archive_path,quiet=True)
        if not os.path.exists(archive_path):
            raise FileNotFoundError("download did not produce " + archive_path)
        print("downloaded file")
    if is_gzip:
        print("extracting")
        # Extract to a temporary name first so an interrupted run never leaves
        # a truncated file that would later be mistaken for a finished one.
        partial_path = json_file + '.part'
        with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, json_file)
    return json_file

def get_sentiment(df):
    """Attach a binary ``sentiment`` column derived from ``rating``.

    Rows rated 0 are removed, ratings below 3 are labelled 0, ratings above 3
    are labelled 1, and rows rated exactly 3 are removed. The input frame is
    left untouched; a new frame is returned.
    """
    unrated_labels = df.index[df['rating'].eq(0).to_numpy()]
    labelled = df.drop(unrated_labels)
    is_negative = labelled['rating'] < 3
    is_positive = labelled['rating'] > 3
    labelled.loc[is_negative, 'sentiment'] = 0
    labelled.loc[is_positive, 'sentiment'] = 1
    not_neutral = labelled['rating'] != 3
    return labelled[not_neutral]

def process_text(df,m1,m2):
    """Normalise the ``review_text`` column and drop rows left without letters.

    Steps, in order: lower-case; apply the regex replacement mappings ``m1``
    then ``m2``; delete newlines; delete "<n> star(s)" and "<n> out of <n>"
    rating phrases; delete ASCII punctuation; keep only rows whose text still
    contains at least one ``a-z`` character (rows with missing text are
    dropped as well).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_text`` column. It is not modified.
    m1, m2 : mapping
        Passed to ``Series.replace(..., regex=True)``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``review_text`` column.
    """
    punctuation_pattern = '|'.join(re.escape(ch) for ch in string.punctuation)
    # Work on the Series and assign the results back: chained
    # ``df[col].replace(..., inplace=True)`` mutates the caller's frame on old
    # pandas and silently does nothing under copy-on-write.
    text = df['review_text'].str.lower()
    text = text.replace(m1, regex=True)
    text = text.replace(m2, regex=True)
    # Newlines are removed outright (not replaced by a space).
    text = text.replace(to_replace=r'\n', value=r'', regex=True)
    text = text.replace(to_replace=r'[0-9]{1,3} stars{0,1}', value=r'', regex=True)
    text = text.replace(to_replace=r'[0-9]{1,3} out of [0-9]{0,1}', value=r'', regex=True)
    # ``regex=True`` must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    text = text.str.replace(punctuation_pattern, "", regex=True)
    cleaned = df.assign(review_text=text)
    # ``na=False`` keeps the mask boolean when a review text is missing.
    has_letters = cleaned['review_text'].str.contains(r'[a-z]', na=False)
    return cleaned[has_letters]

def get_word_freqs(df):
    """Count how often each distinct ``review_text`` value occurs in ``df``.

    Whole review strings are counted, not individual words. The result is an
    ``nltk.FreqDist`` when nltk is installed, otherwise an equivalent
    ``collections.Counter`` (the class FreqDist derives from).
    """
    texts = df.review_text.values
    try:
        # nltk is not imported at file level, so bring it into scope here.
        import nltk
    except ImportError:
        from collections import Counter
        return Counter(texts)
    return nltk.FreqDist(texts)

def remove_spam(df,full_reviews,partial_reviews):
    """Drop reviews that are known spam.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review_text`` column.
    full_reviews : iterable of str
        Texts removed when the whole review equals one of them.
    partial_reviews : iterable of str
        Regex fragments; a review is removed when any of them matches
        somewhere in its text. May be empty.

    Returns
    -------
    pandas.DataFrame
        The remaining rows. Rows with a missing ``review_text`` are dropped.
    """
    df = df[~df['review_text'].isin(full_reviews)]
    keep = df['review_text'].notna()
    fragments = list(partial_reviews)
    # With no fragments '|'.join(...) is the empty pattern, which matches every
    # string and would discard the whole frame - so only search when there is
    # something to search for.
    if fragments:
        is_spam = df['review_text'].str.contains('|'.join(fragments), na=False)
        keep = keep & ~is_spam
    return df[keep]

def get_dataframe(json_file,chunk_size=250000,n_entries=1e7,cols=None,
                  filters_path='review_text_filters.pickle'):
    """Stream a JSON-lines review file into one cleaned, labelled DataFrame.

    The file is read in chunks; each chunk is optionally reduced to ``cols``
    and then passed through ``get_sentiment``, ``process_text`` and
    ``remove_spam``. Reading stops once at least ``n_entries`` raw rows have
    been consumed or the file is exhausted.

    Parameters
    ----------
    json_file : str
        Path of the JSON-lines file.
    chunk_size : int
        Rows per chunk handed to ``pandas.read_json``.
    n_entries : int or float
        Number of raw rows (before filtering) after which reading stops.
    cols : list of str, optional
        Columns to keep from every chunk; all columns when falsy.
    filters_path : str
        Pickle file providing the keys ``replace_abbrs``, ``replace_else``,
        ``full_reviews`` and ``partial_reviews``.

    Returns
    -------
    pandas.DataFrame
        The processed chunks concatenated with a fresh 0..n-1 index.
    """
    # NOTE(review): pickle.load can execute arbitrary code - only point
    # ``filters_path`` at a file you trust.
    with open(filters_path, 'rb') as handle:
        review_filters = pickle.load(handle)
    df_reader = pd.read_json(json_file, lines=True, chunksize=chunk_size)
    rows_read = 0
    frames = []
    try:
        for chunk in df_reader:
            print("processing")
            if cols:
                chunk = chunk[cols]
            rows_read += len(chunk)
            chunk = get_sentiment(chunk)
            chunk = process_text(chunk, review_filters['replace_abbrs'],
                                 review_filters['replace_else'])
            chunk = remove_spam(chunk, review_filters['full_reviews'],
                                review_filters['partial_reviews'])
            frames.append(chunk)
            # Format after scaling so float noise (28.999999999999996) is not shown.
            print("{:.1f}%".format(round(rows_read / n_entries, 2) * 100))
            # ``>=`` rather than ``% == 0``: when chunk_size does not divide
            # n_entries the modulo test only fires at a common multiple and
            # reads far more rows than requested.
            if rows_read >= n_entries:
                break
    finally:
        # Stopping early leaves the reader half-consumed; release its file handle.
        df_reader.close()
    return pd.concat(frames, axis=0, ignore_index=True)

#### Download data

In [147]:
# List only the index entries whose derived type is 'reviews'; the call also
# displays the listing shown below.
data_sources = get_datalist('reviews')

Unnamed: 0,type,content
0,reviews,"[history, biography]"
1,reviews,[romance]
2,reviews,[spoiler]
3,reviews,[children]
4,reviews,[poetry]
5,reviews,[dedup]
6,reviews,"[young, adult]"
7,reviews,"[comics, graphic]"
8,reviews,"[spoiler, raw]"
9,reviews,"[mystery, thriller, crime]"


In [148]:
# Positional row of `data_sources` to work with (row 3 of the listing above is
# the "children" reviews file); the bare expression displays that row.
selected_id=3
data_sources.iloc[selected_id]

file       goodreads_reviews_children.json.gz
link        1908GDMdrhDN7sTaI_FelSHxbwcNM1EzR
type                                  reviews
content                            [children]
Name: 3, dtype: object

In [149]:
# Data directory handed to download_data; json_file receives the path it returns.
DIR='./data/'
json_file=download_data(DIR,selected_id,data_sources)

downloaded file
extracting


In [150]:
# Peek at the schema: read just the first 10,000 records of the JSON-lines file.
df_reader=pd.read_json(json_file,lines=True,chunksize=10000)
try:
    chunk = next(iter(df_reader))
finally:
    # Only one chunk is needed, so release the file handle right away instead
    # of leaving a half-consumed reader open.
    df_reader.close()
print(chunk.columns)

Index(['book_id', 'date_added', 'date_updated', 'n_comments', 'n_votes',
       'rating', 'read_at', 'review_id', 'review_text', 'started_at',
       'user_id'],
      dtype='object')


In [151]:
# First rows of the chunk read in the previous cell.
chunk.head()

Unnamed: 0,book_id,date_added,date_updated,n_comments,n_votes,rating,read_at,review_id,review_text,started_at,user_id
0,23310161,Tue Nov 17 11:37:35 -0800 2015,Tue Nov 17 11:38:05 -0800 2015,0,7,4,,f4b4b050f4be00e9283c92a814af2670,Fun sequel to the original.,,8842281e1d1347389f2ab93d60773d4d
1,17290220,Sat Nov 08 08:54:03 -0800 2014,Wed Jan 25 13:56:12 -0800 2017,0,4,5,Tue Jan 24 00:00:00 -0800 2017,22d424a2b0057b18fb6ecf017af7be92,One of my favorite books to read to my 5 year ...,,8842281e1d1347389f2ab93d60773d4d
2,6954929,Thu Oct 23 13:46:20 -0700 2014,Thu Oct 23 13:47:00 -0700 2014,1,6,5,,50ed4431c451d5677d98dd25ca8ec106,One of the best and most imaginative childrens...,,8842281e1d1347389f2ab93d60773d4d
3,460548,Mon Dec 02 10:43:59 -0800 2013,Wed Mar 22 11:47:25 -0700 2017,4,5,5,,1e4de11dd4fa4b7ffa59b6c69a6b28e9,My daughter is loving this. Published in the 6...,,8842281e1d1347389f2ab93d60773d4d
4,11474551,Wed May 11 22:38:11 -0700 2011,Sun Jan 29 15:56:41 -0800 2012,0,5,5,Wed May 11 00:00:00 -0700 2011,2065145714bf747083a1c9ce81d5c4fe,A friend sent me this. Hilarious!,Wed May 11 00:00:00 -0700 2011,8842281e1d1347389f2ab93d60773d4d


#### Process data

In [189]:
# Rows per chunk read from the JSON-lines file.
chunk_size = 25000
# Passed as n_entries: the raw-row count at which get_dataframe stops (two chunks here).
N = 0.5e5
# Columns kept from every chunk before labelling and cleaning.
cols_reviews = ['book_id','review_id','review_text','rating']
df = get_dataframe(json_file,chunk_size=chunk_size,n_entries=N,cols=cols_reviews)

processing
50.0%
processing
100.0%


In [190]:
# First rows of the processed frame, including the derived sentiment column.
df.head()

Unnamed: 0,book_id,review_id,review_text,rating,sentiment
0,23310161,f4b4b050f4be00e9283c92a814af2670,fun sequel to the original,4,1.0
1,17290220,22d424a2b0057b18fb6ecf017af7be92,one of my favorite books to read to my 5 year ...,5,1.0
2,6954929,50ed4431c451d5677d98dd25ca8ec106,one of the best and most imaginative childrens...,5,1.0
3,460548,1e4de11dd4fa4b7ffa59b6c69a6b28e9,my daughter is loving this published in the 60...,5,1.0
4,11474551,2065145714bf747083a1c9ce81d5c4fe,a friend sent me this hilarious,5,1.0


#### Sample example reviews

In [191]:
# Print a few random reviews per sentiment class (first 1000 characters each).
n_reviews=3
for heading, sentiment_value in (("sampling negative reviews: \n", 0),
                                 ("sampling positive reviews: \n", 1)):
    print(heading)
    subset = df[df["sentiment"] == sentiment_value]
    # Never ask for more rows than the class holds; sample() raises otherwise.
    for review in subset.sample(min(n_reviews, len(subset))).review_text.values:
        print("Review(s): "+"\n"+review[:1000]+"\n")

sampling negative reviews: 

Review(s): 
this book was really bad  terribly tremendously bad  i mean the girls thought about staying with her dad is fine but the constant regret she had the way she wanted to be friends with a lonely unpopular girl but was afraid to because of her super popular best friend jesus  it was awful  this book was put in the ya section the ya section  the girl in the book sounded nine or ten  there was nothing young adult about this book either  overall really bad bad bad bad book

Review(s): 
typical dr seuss fare it was notone of my favourites as a kid but i liked it okay

Review(s): 
i think that when you read this book you need to be aware of this story i wasnt so i did not really care for it i am not sure this event happened or anything  but overall i did not like the book because of the way the characters were drawn and because of the way the story was written i understood what the story was about but it really it was notreally focused on the fact that a

#### Further processing with spacy

In [192]:
import spacy
from spacy.lang.en import English
# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is reused by later cells.
nlp=spacy.load('en_core_web_sm')
# Pick one review at random by position. No seed is set here, so a different
# review may be chosen on every run.
chosen_idx = np.random.choice(len(df), replace=False, size=1)
# chosen_idx is a one-element array, hence .values[0] to get the text itself.
text = df.iloc[chosen_idx].review_text.values[0]

In [193]:
# tokenization: run the spaCy pipeline and show the resulting tokens
doc = nlp(text)
print(list(doc))

[an, imagination, library, book, sam, has, had, for, a, while, but, that, continues, to, be, a, favorite, it, is, pretty, simple, with, cute, illustrations, and, a, nice, rhythm, and, repetition]


In [194]:
# display word stems via lemmatization and POS tagging:
# one line per token with its text, lemma and part-of-speech tag
doc = nlp(text)
for token in doc:
    print(token.text, token.lemma_, token.pos_)

an an DET
imagination imagination NOUN
library library NOUN
book book NOUN
sam sam PROPN
has have AUX
had have VERB
for for ADP
a a DET
while while NOUN
but but CCONJ
that that DET
continues continue VERB
to to PART
be be AUX
a a DET
favorite favorite ADJ
it -PRON- PRON
is be AUX
pretty pretty ADV
simple simple ADJ
with with ADP
cute cute ADJ
illustrations illustration NOUN
and and CCONJ
a a DET
nice nice ADJ
rhythm rhythm NOUN
and and CCONJ
repetition repetition NOUN


#### Text sentiment classification

In [195]:
# `string` must be imported before string.punctuation is read below.
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from spacy.lang.en.stop_words import STOP_WORDS

# STOP_WORDS is imported by name above, so use it directly instead of going
# through the spacy.lang.en.stop_words attribute path.
stop_words = STOP_WORDS
punctuations = string.punctuation
parser=English()

In [196]:
def spacy_tokenizer(sentence):
    """Split ``sentence`` with the module-level ``nlp`` pipeline into lemmas.

    Each token contributes its lemma, lower-cased and stripped of surrounding
    whitespace. Tokens whose lemma is the ``"-PRON-"`` placeholder contribute
    their lower-cased surface form instead.
    """
    lemmas = []
    for token in nlp(sentence):
        if token.lemma_ == "-PRON-":
            lemmas.append(token.lower_)
        else:
            lemmas.append(token.lemma_.lower().strip())
    return lemmas

def clean_text(text):
    """Return ``text`` converted to ``str``, trimmed of surrounding whitespace and lower-cased."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

class predictors(TransformerMixin):
    """Stateless pipeline step that normalises raw texts with ``clean_text``."""

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(item)`` for every item of ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def fit(self, X, y=None, **fit_params):
        """Learn nothing from the data and return ``self``."""
        return self

    def get_params(self, deep=True):
        """Report an empty parameter dictionary."""
        return dict()
    
# Bag-of-words vectorizer over unigrams that tokenizes with spacy_tokenizer.
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))


In [199]:
from sklearn.model_selection import train_test_split
# good for detecting nans in dataset
# df_sub =df_sub[~df_sub.isin([np.nan, np.inf, -np.inf]).any(1)]
X = df['review_text']
ylabels=df['sentiment']
# Hold out 30% for evaluation. The fixed random_state makes the split - and
# every metric computed from it - identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)
print(len(X_train))

24821


#### Train model

In [None]:
from sklearn.linear_model import LogisticRegression

# Three stages: normalise the text, turn it into bag-of-words counts, classify.
classifier = LogisticRegression()
pipe = Pipeline(steps=[
    ("cleaner", predictors()),
    ('vectorizer', bow_vector),
    ('classifier', classifier),
])

pipe.fit(X_train,y_train)

#### Evaluate model

In [None]:
from sklearn import metrics

# Score the held-out reviews with the fitted pipeline.
predicted = pipe.predict(X_test)

for caption, score_fn in (("Logistic Regression Accuracy:", metrics.accuracy_score),
                          ("Logistic Regression Precision:", metrics.precision_score),
                          ("Logistic Regression Recall:", metrics.recall_score)):
    print(caption, score_fn(y_test, predicted))