In [37]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
from dateutil.parser import parse
from datetime import datetime
from tqdm import tqdm
# ------------  PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#-------------- TRANSFORMERS --------------
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
import evaluate
from evaluate import load
from transformers import Conversation
transformers.logging.set_verbosity_error()

In [2]:
# Load the reduced review dataset and print a summary of its columns
df_reduced = pd.read_csv(filepath_or_buffer='../data/processed/df_reduced.csv')
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339620 entries, 0 to 339619
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  339620 non-null  int64 
 1   num_row     339620 non-null  int64 
 2   text        339620 non-null  object
dtypes: int64(2), object(1)
memory usage: 7.8+ MB


In [3]:
# Discard the 'Unnamed: 0' column that came along with the CSV
df_reduced = df_reduced.drop(columns=['Unnamed: 0'])

In [4]:
# Show 10 randomly chosen rows as a sanity check
df_reduced.sample(n=10)

Unnamed: 0,num_row,text
262937,262937,I read this book for my Junior year in High Sc...
56674,56674,My advice to you is - read the entire thing fi...
270799,270799,Really good book. It read like a bestseller (q...
234366,234366,This was my first Crichton book. I have to say...
21568,21568,I find it extremely impressive that this was w...
274740,274740,This is my second book of McCullough and I jus...
221469,221469,"does not disappoint! beautifully crafted, coll..."
219295,219295,I enjoyed reading The Autobiography of Benjami...
221198,221198,Tolkien's masterwork is not only the best fant...
303860,303860,This Puffin Classics edition of Alexandre Duma...


In [5]:
# Fetch the NLTK stop-word corpus (uses the `download` helper imported from nltk)
download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def clean_stopwords(text: str,stop_dict: dict)->str:
    """Remove stop words from a whitespace-separated string.

    Parameters
    ----------
    text : str or None
        Text to filter. Any non-string value (``None``, float ``NaN``, ...)
        yields ``None`` instead of raising.
    stop_dict : container of str
        Words to drop. Despite the name and annotation, any container that
        supports ``in`` is accepted (list, tuple, set, dict, ...). Lists and
        tuples are converted to a ``frozenset`` once per call so that each
        word is checked with a hash lookup rather than a linear scan.

    Returns
    -------
    str or None
        The words of ``text`` that are not in ``stop_dict``, joined by single
        spaces, or ``None`` when ``text`` is not a string.
    """
    # Missing values in a pandas column arrive as float NaN, which has no
    # ``split``; treat every non-string the same way as ``None``.
    if not isinstance(text, str):
        return None
    # Only sequence types are converted: sets/dicts already hash, and other
    # containers keep whatever ``in`` semantics the caller relied on.
    if isinstance(stop_dict, (list, tuple)):
        stop_dict = frozenset(stop_dict)
    return ' '.join(word for word in text.split() if word not in stop_dict)

In [7]:
# 3.10 Lower-case the reviews and remove English stop words
df_reduced['text_clean'] = df_reduced['text'].str.lower()
# Keep the stop words in a set: membership tests are then hash lookups
# instead of a scan over the whole word list for every word of every review.
stop_dict = set(stopwords.words('english'))
df_reduced['text_clean'] = df_reduced['text_clean'].apply(clean_stopwords, stop_dict=stop_dict)
# NOTE(review): special characters are only stripped in the later step 3.12,
# so a token such as "it." does not match a stop word here — confirm that
# this ordering is intended.

In [8]:
# 3.12 Remove special characters (the character class below lists exactly which ones)
special_chars_pattern = r'''[!.,():\-%$/'"‘]'''
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(special_chars_pattern, '', regex=True)

In [9]:
# 3.13 Remove every run of digits from the cleaned text
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(pat=r'[\d]+', repl='', regex=True)

In [10]:
# The raw review text is no longer needed once 'text_clean' exists
df_reduced = df_reduced.drop(columns=['text'])

In [36]:
# 3.14 Inspect the first 10 cleaned rows
df_reduced.head(n=10)

Unnamed: 0,num_row,text_clean
0,0,matter many times read book impossible get tir...
1,1,avid scifi fan ive read many books genere foun...
2,2,foundation truly one greatest science fiction ...
3,3,spectacular universe created issac asimov foun...
4,4,think book read every fanatic science fictioni...
5,5,think isaac must robot human could possibly am...
6,6,foundation series still classic must read ever...
7,7,foundation novels great pleasureto read surpri...
8,8,foundation nut given issac asimovs foundatino...
9,9,one asimovs early masterpieces however would r...


In [29]:
# Fetch the lexicon used by SentimentIntensityAnalyzer (via the `download` helper imported from nltk)
download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arnaldochm/nltk_data...


True

In [40]:
# Single VADER analyzer instance, reused by the scoring cells below
vaderSentimentAnalyzer = SentimentIntensityAnalyzer()

In [41]:
# Spot-check: score the cleaned review at positional row 67
vaderSentimentAnalyzer.polarity_scores(df_reduced['text_clean'].iloc[67])

{'neg': 0.057, 'neu': 0.657, 'pos': 0.286, 'compound': 0.9953}

In [42]:
# Score every cleaned review; each entry of 'scores' is the dictionary
# returned by polarity_scores (values are cast to str before scoring).
# NOTE(review): scoring runs on text with stop words and punctuation already
# removed — confirm this is intended, as it may affect the sentiment scores.
df_reduced['scores'] = df_reduced['text_clean'].map(
    lambda review: vaderSentimentAnalyzer.polarity_scores(str(review))
)
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores
0,0,matter many times read book impossible get tir...,"{'neg': 0.075, 'neu': 0.517, 'pos': 0.408, 'co..."
1,1,avid scifi fan ive read many books genere foun...,"{'neg': 0.066, 'neu': 0.471, 'pos': 0.464, 'co..."
2,2,foundation truly one greatest science fiction ...,"{'neg': 0.0, 'neu': 0.478, 'pos': 0.522, 'comp..."
3,3,spectacular universe created issac asimov foun...,"{'neg': 0.094, 'neu': 0.661, 'pos': 0.245, 'co..."
4,4,think book read every fanatic science fictioni...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [43]:
# Unpack the score dictionaries into one column per key of interest.
# The intermediate df_reduced.head() calls were dropped: only the last
# expression of a cell is displayed, so they had no effect.
for score_key in ('compound', 'pos', 'neg'):
    df_reduced[score_key] = df_reduced['scores'].apply(lambda score_dict: score_dict[score_key])
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores,compound,pos,neg
0,0,matter many times read book impossible get tir...,"{'neg': 0.075, 'neu': 0.517, 'pos': 0.408, 'co...",0.9169,0.408,0.075
1,1,avid scifi fan ive read many books genere foun...,"{'neg': 0.066, 'neu': 0.471, 'pos': 0.464, 'co...",0.9865,0.464,0.066
2,2,foundation truly one greatest science fiction ...,"{'neg': 0.0, 'neu': 0.478, 'pos': 0.522, 'comp...",0.9781,0.522,0.0
3,3,spectacular universe created issac asimov foun...,"{'neg': 0.094, 'neu': 0.661, 'pos': 0.245, 'co...",0.6705,0.245,0.094
4,4,think book read every fanatic science fictioni...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0


In [46]:
# Label each review by the sign of its compound score.
# Rows matching none of the conditions keep the empty-string default.
compound_score = df_reduced['compound']
df_reduced['sentiment'] = np.select(
    [compound_score > 0, compound_score == 0, compound_score < 0],
    ['POS', 'NEUTRAL', 'NEG'],
    default='',
)

In [49]:
# Show 10 random rows to eyeball scores against their labels
df_reduced.sample(n=10)

Unnamed: 0,num_row,text_clean,scores,compound,pos,neg,sentiment
136507,136507,informative moving historical document itself ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,0.0,0.0,NEUTRAL
64593,64593,hatchet story th years old boy named brian rob...,"{'neg': 0.144, 'neu': 0.685, 'pos': 0.171, 'co...",0.3612,0.171,0.144,POS
32134,32134,read book coollege always liked it author real...,"{'neg': 0.146, 'neu': 0.703, 'pos': 0.151, 'co...",0.0258,0.151,0.146,POS
335126,335126,william golding novel ive read far best uses o...,"{'neg': 0.102, 'neu': 0.652, 'pos': 0.246, 'co...",0.9686,0.246,0.102,POS
112875,112875,gothic horror man made victorian england laugh...,"{'neg': 0.166, 'neu': 0.512, 'pos': 0.322, 'co...",0.9774,0.322,0.166,POS
192423,192423,mother gave copy teens version changed life a...,"{'neg': 0.028, 'neu': 0.815, 'pos': 0.157, 'co...",0.8545,0.157,0.028,POS
326975,326975,price concerned reading would poor however abs...,"{'neg': 0.178, 'neu': 0.575, 'pos': 0.247, 'co...",0.2944,0.247,0.178,POS
104363,104363,one lewis easier reads beautiful imagery cs le...,"{'neg': 0.109, 'neu': 0.665, 'pos': 0.226, 'co...",0.7003,0.226,0.109,POS
113161,113161,picture dorian gray oscar wilde set london eng...,"{'neg': 0.133, 'neu': 0.539, 'pos': 0.328, 'co...",0.9956,0.328,0.133,POS
69982,69982,really good book enjoyed love story suspense w...,"{'neg': 0.0, 'neu': 0.559, 'pos': 0.441, 'comp...",0.9542,0.441,0.0,POS


In [50]:
# Keep only the final label; the intermediate score columns are dropped
score_columns = ['scores', 'compound', 'pos', 'neg']
df_reduced = df_reduced.drop(columns=score_columns)
df_reduced.sample(n=10)

Unnamed: 0,num_row,text_clean,sentiment
326121,326121,novel take place great depresion bigind besi...,POS
247382,247382,one books become immortal perfectly captures e...,POS
40451,40451,jane eyre definitely one wonderful captivating...,POS
164299,164299,brilliant writer brilliant mind indeed nobody ...,POS
136709,136709,found book excellent information piece hitler ...,POS
294679,294679,liked book alot sure points middle wanted some...,POS
33891,33891,wow civil war southern pov knew? yes racism ra...,NEG
16302,16302,im im currently reading unabridged version lo...,POS
139914,139914,treasure island good book many twists moments ...,POS
215023,215023,say large book the uncutedition worth reading ...,NEG


In [51]:
# Persist the labeled dataset.
# NOTE(review): the index is written too (no index=False), so the file gains
# an unnamed first column on reload — confirm downstream readers expect it.
df_reduced.to_csv(path_or_buf='../data/processed/df_reduced_with_sentiment.csv')