In [35]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
from dateutil.parser import parse
from datetime import datetime
from tqdm import tqdm
# ------------  PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#-------------- TRANSFORMERS --------------
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
import evaluate
from evaluate import load
from transformers import Conversation
transformers.logging.set_verbosity_error()

In [36]:
# Load the reduced review dataset, then print its schema
# (column names, non-null counts, dtypes, memory usage).
df_reduced = pd.read_csv(filepath_or_buffer='../data/processed/df_reduced.csv')
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15738 entries, 0 to 15737
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  15738 non-null  int64 
 1   num_row     15738 non-null  int64 
 2   text        15738 non-null  object
dtypes: int64(2), object(1)
memory usage: 369.0+ KB


In [37]:
# Remove the 'Unnamed: 0' column (presumably the index that was written
# when the CSV was saved -- confirm against the producing notebook).
# errors='ignore' keeps this cell re-runnable: executing it a second time,
# or loading a file that was saved without its index, no longer raises KeyError.
df_reduced = df_reduced.drop(columns=['Unnamed: 0'], errors='ignore')

In [38]:
# Preview 10 random rows. A fixed random_state makes the displayed rows
# identical on every Restart & Run All, so the saved output stays reproducible.
df_reduced.sample(n=10, random_state=42)

Unnamed: 0,num_row,text
5947,5947,Ice Drift is a story written by Theodore Taylo...
11720,11720,This story of Alex Cross takes up right where ...
8253,8253,I'm an advanced photography student and i use ...
3537,3537,Lewis a conservative Christian put together th...
15101,15101,I bought this book out of curiosity. I am a me...
10908,10908,"I recently purchased his ""The Taming the the S..."
15611,15611,So great learning all about The Amish Heritage...
14693,14693,This book is well written and hilarious. I rec...
8055,8055,What I liked:* Lots of great pictures and illu...
11439,11439,I have enjoyed the entire Mitford Years series...


In [39]:
# def clean_stopwords(text: str,stop_dict: dict)->str:
#     if text is not None:
#         words = text.split()
#         words_clean = []
#         for word in words:
#             if word not in stop_dict:
#                 words_clean.append(word)
#         result = ' '.join(words_clean)
#     else:
#         result = None
#     return result

In [40]:
# 3.10 - 3.14 Normalise the raw text in a single chained expression:
#   1. lower-case everything;
#   2. replace every character that is not a-z (digits, punctuation,
#      accented letters, whitespace) with a space -- digits are already
#      covered here, so no separate number-removal step is needed;
#   3. collapse runs of whitespace into a single space.
df_reduced['text_clean'] = (
    df_reduced['text']
    .str.lower()
    .str.replace(r'[^a-z]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
)

In [41]:
# 3.15 Lemmatize the text and remove stop words
# Fetch the WordNet corpus (a no-op when already up to date) and build
# the lemmatizer instance that lemmatize_text uses as its default.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# Fetch the NLTK stop-word corpus (a no-op when already up to date) and
# keep the English stop words as a module-level list.
nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
def lemmatize_text(words, lemmatizer = lemmatizer, min_length = 4):
    """Lemmatize a whitespace-separated string and drop uninformative tokens.

    Each token is lemmatized first; the resulting lemma is kept only when it
    is not in the module-level ``stop_words`` collection and has at least
    ``min_length`` characters.

    Parameters
    ----------
    words : str
        Text to process. Tokens are separated on any run of whitespace.
        Non-string values (e.g. NaN from a missing cell) yield ``''``
        instead of raising ``AttributeError``.
    lemmatizer : object, optional
        Object exposing ``lemmatize(token) -> str``. Defaults to the
        module-level ``lemmatizer`` bound when this function is defined.
    min_length : int, optional
        Minimum number of characters a lemma must have to be kept.
        The default of 4 matches the original ``len(word) > 3`` filter.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    if not isinstance(words, str):
        return ''

    # Membership tests against a list scan it linearly for every token;
    # a set makes each lookup constant-time. Reuse it if it is already a set.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = frozenset(stop_words)

    kept = []
    # split() without an argument never yields empty tokens, unlike split(' ').
    for token in words.split():
        lemma = lemmatizer.lemmatize(token)
        # Filter on the lemma (not the raw token), in the same order as before:
        # stop-word check, then length check.
        if lemma not in stop_set and len(lemma) >= min_length:
            kept.append(lemma)
    return ' '.join(kept)

In [44]:
# Lemmatize every cleaned review. The function is passed directly; wrapping
# it in a lambda added a needless extra call per row.
df_reduced["text_clean"] = df_reduced["text_clean"].apply(lemmatize_text)
# Preview 10 random results; the fixed random_state keeps the shown rows
# reproducible across re-runs.
df_reduced["text_clean"].sample(n=10, random_state=42)

10055    unlike reviewer think oprah compilation three ...
7394     gave star based general subject appeal enterta...
10719    western world must read gritty ground level fi...
9326     part wanted story royal family confinement hou...
11210    tempted favor pick bookstore read page earlier...
2243     great collection clever accomplished song abso...
13701    disappointed book really full story people dre...
11311    frank peretti know grab reader latest novel go...
14731    light reading enjoyed lord ring like work char...
1806     typical vampire book reason really enjoyed rob...
Name: text_clean, dtype: object

In [53]:
# The raw text is no longer needed once text_clean exists; keep only the
# remaining columns.
df_reduced = df_reduced.drop(columns=['text'])

In [54]:
# 3.16 Inspect the first 10 rows after cleaning and lemmatization
df_reduced.head(n=10)

Unnamed: 0,num_row,text_clean
0,0,purchased book neice cruised week loved story ...
1,1,really enjoyed book several front author terri...
2,2,interesting informative spot good book read wa...
3,3,must learn past history better appreciate toda...
4,4,began book high hope cover blurb sounded inter...
5,5,gary demar succeeded writing finest short actu...
6,6,first miss julia book found quite delightful b...
7,7,enjoying read assuming writes like think long ...
8,8,book simply great start reading able page page...
9,9,hobbit written differently lord ring time much...


In [55]:
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer. As the last
# expression of the cell, the call's return value is displayed as output.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [56]:
# Single shared VADER analyzer instance, reused by the scoring cells below.
vaderSentimentAnalyzer = SentimentIntensityAnalyzer()

In [58]:
# Spot-check: cleaned text of the row at position 67 (an arbitrary example).
df_reduced['text_clean'].iloc[67]

'goethe expressed fundamental truth nothing much philosophy footnote plato plotinus year later plato eliminated platonic dabbling politics quietly separated greek mysticism emerging fundamentalism christianity became somewhat bemusedly cult figure right brian hines deal subject hugely sympathatic hope future something change time plato century close timeless truth time well according plotinus father robin'

In [59]:
# Spot-check: VADER polarity scores (neg / neu / pos / compound) for the
# same example row at position 67.
vaderSentimentAnalyzer.polarity_scores(df_reduced['text_clean'].iloc[67])

{'neg': 0.0, 'neu': 0.829, 'pos': 0.171, 'compound': 0.835}

In [60]:
# Score every cleaned review with VADER; each cell of 'scores' holds the
# full polarity dict. str() guards against non-string cell values.
# NOTE(review): text_clean has been lower-cased and stripped of punctuation,
# stop words and tokens shorter than 4 characters. VADER's heuristics are
# reported to use punctuation, capitalisation and negation words, so scoring
# the raw text may give more faithful results -- confirm this is intended.
df_reduced['scores'] = [
    vaderSentimentAnalyzer.polarity_scores(str(review))
    for review in df_reduced['text_clean']
]
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores
0,0,purchased book neice cruised week loved story ...,"{'neg': 0.0, 'neu': 0.647, 'pos': 0.353, 'comp..."
1,1,really enjoyed book several front author terri...,"{'neg': 0.116, 'neu': 0.564, 'pos': 0.32, 'com..."
2,2,interesting informative spot good book read wa...,"{'neg': 0.0, 'neu': 0.504, 'pos': 0.496, 'comp..."
3,3,must learn past history better appreciate toda...,"{'neg': 0.094, 'neu': 0.554, 'pos': 0.352, 'co..."
4,4,began book high hope cover blurb sounded inter...,"{'neg': 0.094, 'neu': 0.643, 'pos': 0.263, 'co..."


In [61]:
# Pull the 'compound' value out of each polarity dict into its own column.
df_reduced['compound_sentiment'] = [
    polarity['compound'] for polarity in df_reduced['scores']
]
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores,compound_sentiment
0,0,purchased book neice cruised week loved story ...,"{'neg': 0.0, 'neu': 0.647, 'pos': 0.353, 'comp...",0.9623
1,1,really enjoyed book several front author terri...,"{'neg': 0.116, 'neu': 0.564, 'pos': 0.32, 'com...",0.9904
2,2,interesting informative spot good book read wa...,"{'neg': 0.0, 'neu': 0.504, 'pos': 0.496, 'comp...",0.7096
3,3,must learn past history better appreciate toda...,"{'neg': 0.094, 'neu': 0.554, 'pos': 0.352, 'co...",0.9595
4,4,began book high hope cover blurb sounded inter...,"{'neg': 0.094, 'neu': 0.643, 'pos': 0.263, 'co...",0.9933


In [62]:
# The full polarity dicts are no longer needed once the compound score has
# been extracted into its own column.
df_reduced = df_reduced.drop(columns=['scores'])
# Preview 10 random rows; the fixed random_state keeps the displayed rows
# reproducible across re-runs.
df_reduced.sample(n=10, random_state=42)

Unnamed: 0,num_row,text_clean,compound_sentiment
9222,9222,typical grisham book plenty action movement al...,0.7845
13637,13637,tony valentine novel james swain creates chara...,0.9776
15734,15734,georgette heyer mystery writer pleasant surpri...,0.8271
12539,12539,thesaurus laya steinberg illustrated debbie ha...,0.975
1836,1836,interesting deep thoughtful read definitely ad...,0.7906
7484,7484,great book recommend anyone excellent book rea...,0.9153
10726,10726,perilous time great book fate free speech amer...,-0.6013
13871,13871,adventure another keep interest going definite...,0.872
4670,4670,part remind convincing glimpse tortured mind t...,0.9805
2286,2286,northanger abbey apparently written early jane...,0.8932


In [63]:
# Persist the cleaned text together with its compound sentiment score.
# index=True is pandas' default, spelled out here: the row index is written
# as an extra unnamed first column.
# NOTE(review): that column presumably comes back as 'Unnamed: 0' on read
# (as in the input file of this notebook); consider index=False once
# downstream readers no longer expect it -- confirm before changing.
df_reduced.to_csv(path_or_buf='../data/processed/df_reduced_with_sentiment.csv', index=True)