In [1]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
from dateutil.parser import parse
from datetime import datetime
from tqdm import tqdm
# ------------  PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#-------------- TRANSFORMERS --------------
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
import evaluate
from evaluate import load
from transformers import Conversation
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm
2023-11-01 06:03:34.167719: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-01 06:03:34.169916: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-01 06:03:34.207422: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-01 06:03:34.208052: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the reduced review dataset and print its schema / memory summary
df_reduced = pd.read_csv(filepath_or_buffer='../data/processed/df_reduced.csv')
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339619 entries, 0 to 339618
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  339619 non-null  int64 
 1   num_row     339619 non-null  int64 
 2   text        339619 non-null  object
dtypes: int64(2), object(1)
memory usage: 7.8+ MB


In [3]:
# 'Unnamed: 0' is a leftover column from the CSV; remove it before processing
df_reduced = df_reduced.drop(columns=['Unnamed: 0'])

In [4]:
# Preview 10 random rows; the fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
df_reduced.sample(10, random_state=42)

Unnamed: 0,num_row,text
134398,134398,I wish I could give this zero stars. My daught...
120553,120553,"I'm going to try to keep this short and sweet,..."
335777,335777,I re-read this book because I wanted to go and...
195261,195261,This book was very entertaining but incredibly...
294833,294833,When reading the other reviews on this site fo...
82982,82982,No one can compare with Bram Stoker's Dracula ...
173650,173650,I was supposed to read this book in high schoo...
41597,41597,The setting is in a furturistic society where ...
194824,194824,Tolkien set the background of Heroic Fantasy (...
268867,268867,This is not a light read- but you will remembe...


In [5]:
# Fetch the NLTK stop-word corpus used by the cleaning step below
nltk.download(info_or_id='stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def clean_stopwords(text: "str | None", stop_dict: "list[str] | set[str]") -> "str | None":
    """Remove stop words from a whitespace-delimited text.

    The text is split on whitespace, every token found in ``stop_dict`` is
    discarded, and the remaining tokens are re-joined with single spaces.
    Matching is exact and case-sensitive, so callers should normalise case
    beforehand.

    Args:
        text: The text to clean. ``None`` and any other non-string value
            (for example the float ``NaN`` pandas uses for missing cells)
            are treated as missing.
        stop_dict: Collection of stop words to remove. Any iterable of
            strings is accepted (list, set, ...).

    Returns:
        The text without stop words, or ``None`` when ``text`` is missing.
    """
    # Non-string inputs (None, NaN, ...) cannot be tokenised; report them as missing
    # instead of raising AttributeError on ``.split()``.
    if not isinstance(text, str):
        return None
    # Hash-based containers already offer O(1) lookups; anything else (e.g. the
    # list returned by NLTK) is converted once so each word is not matched by a
    # linear scan.
    if isinstance(stop_dict, (set, frozenset, dict)):
        stop_set = stop_dict
    else:
        stop_set = set(stop_dict)
    return ' '.join(word for word in text.split() if word not in stop_set)

In [7]:
# 3.10 Process text to extract stopwords
# Lower-case first: the stop-word match in clean_stopwords is exact and the
# NLTK English list is lower-case.
df_reduced['text_clean'] = df_reduced['text'].str.lower()
# Use a set so each word lookup is O(1); the raw NLTK list would be scanned
# linearly for every word of every review.
stop_dict = set(stopwords.words('english'))
df_reduced['text_clean'] = df_reduced['text_clean'].apply(clean_stopwords, stop_dict=stop_dict)

In [8]:
# 3.12 Extract special characters
# Deletes every occurrence of the punctuation characters listed in the class.
# NOTE(review): characters outside this class are kept; later outputs in this
# notebook still show '&quot;' and '?' — confirm that is intended.
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(
    pat=r'''[!.,():\-%$/'"‘]''',
    repl='',
    regex=True,
)

In [9]:
# 3.13 Extract numbers
# Every run of one or more digits is removed from the cleaned text.
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(
    pat=r'[\d]+',
    repl='',
    regex=True,
)

In [10]:
# Keep only the cleaned text; the raw 'text' column is discarded from here on
df_reduced = df_reduced.drop(columns=['text'])

In [11]:
# 3.14 See the results
# Display the first ten rows of the cleaned frame.
df_reduced.head(n=10)

Unnamed: 0,num_row,text_clean
0,0,matter many times read book impossible get tir...
1,1,avid scifi fan ive read many books genere foun...
2,2,foundation truly one greatest science fiction ...
3,3,spectacular universe created issac asimov foun...
4,4,think book read every fanatic science fictioni...
5,5,think isaac must robot human could possibly am...
6,6,foundation series still classic must read ever...
7,7,foundation novels great pleasureto read surpri...
8,8,foundation nut given issac asimovs foundatino...
9,9,one asimovs early masterpieces however would r...


In [12]:
# Fetch the VADER lexicon before the sentiment analyzer is created
nltk.download(info_or_id='vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [13]:
# Create one NLTK VADER SentimentIntensityAnalyzer instance that is reused by
# every scoring cell below (presumably relies on the 'vader_lexicon' resource
# downloaded in the previous cell — TODO confirm).
vaderSentimentAnalyzer = SentimentIntensityAnalyzer()

In [14]:
# Spot-check the analyzer on a single cleaned review (row position 67)
vaderSentimentAnalyzer.polarity_scores(df_reduced['text_clean'].iloc[67])

{'neg': 0.057, 'neu': 0.657, 'pos': 0.286, 'compound': 0.9953}

In [15]:
# Score every cleaned review; str() is applied so non-string cells are scored
# as their string representation instead of being passed through as-is.
# NOTE(review): the analyzer is fed 'text_clean', i.e. text with stopwords and
# punctuation already removed — confirm this does not drop cues (negations,
# emphasis) that VADER's rules depend on.
df_reduced['scores'] = df_reduced['text_clean'].map(
    lambda review: vaderSentimentAnalyzer.polarity_scores(str(review))
)
df_reduced.head(n=5)

Unnamed: 0,num_row,text_clean,scores
0,0,matter many times read book impossible get tir...,"{'neg': 0.075, 'neu': 0.517, 'pos': 0.408, 'co..."
1,1,avid scifi fan ive read many books genere foun...,"{'neg': 0.066, 'neu': 0.471, 'pos': 0.464, 'co..."
2,2,foundation truly one greatest science fiction ...,"{'neg': 0.0, 'neu': 0.478, 'pos': 0.522, 'comp..."
3,3,spectacular universe created issac asimov foun...,"{'neg': 0.094, 'neu': 0.661, 'pos': 0.245, 'co..."
4,4,think book read every fanatic science fictioni...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."


In [16]:
# Pull the 'compound' entry out of each score dict into its own column
df_reduced['compound_sentiment'] = df_reduced['scores'].map(
    lambda scores: scores['compound']
)
df_reduced.head(n=5)

Unnamed: 0,num_row,text_clean,scores,compound_sentiment
0,0,matter many times read book impossible get tir...,"{'neg': 0.075, 'neu': 0.517, 'pos': 0.408, 'co...",0.9169
1,1,avid scifi fan ive read many books genere foun...,"{'neg': 0.066, 'neu': 0.471, 'pos': 0.464, 'co...",0.9865
2,2,foundation truly one greatest science fiction ...,"{'neg': 0.0, 'neu': 0.478, 'pos': 0.522, 'comp...",0.9781
3,3,spectacular universe created issac asimov foun...,"{'neg': 0.094, 'neu': 0.661, 'pos': 0.245, 'co...",0.6705
4,4,think book read every fanatic science fictioni...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0


In [17]:
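# Optional (disabled): bucket the compound score into a categorical sentiment label.
# df_reduced['sentiment']=''
# df_reduced.loc[df_reduced.compound_sentiment>0,'sentiment']='POS'
# df_reduced.loc[df_reduced.compound_sentiment==0,'sentiment']='NEUTRAL'
# df_reduced.loc[df_reduced.compound_sentiment<0,'sentiment']='NEG'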

In [18]:
# The raw score dicts are no longer needed once 'compound_sentiment' exists
df_reduced = df_reduced.drop(columns=['scores'])
# Fixed seed so the displayed preview is identical across re-runs
df_reduced.sample(10, random_state=42)

Unnamed: 0,num_row,text_clean,compound_sentiment
93684,93684,book vein path daggers jordans eighth definite...,0.981
201949,201949,ladies gentlemen reviewed tolles latest one st...,0.9118
243796,243796,one books elite bounce category bounced wall ...,-0.9453
192625,192625,unlike &quot;as driven leaf&quot; statements t...,0.9873
48531,48531,book mormon one books read again every time re...,0.0736
306893,306893,jane eyre wonderful story womans struggle surv...,0.9246
135078,135078,thoughts trilogy correct im wrong said posts w...,0.296
2764,2764,ill admit book starts sorta slow get pages it...,0.8591
12503,12503,books made movie book great far better movie h...,0.8687
317431,317431,bush dodged draft? kind lunatic would think cl...,-0.7319


In [19]:
# Persist the cleaned text together with its compound sentiment score.
# NOTE(review): the index is written as an unnamed first column, which is how
# an 'Unnamed: 0' column appears when such a file is read back — confirm
# downstream readers expect it before changing this.
df_reduced.to_csv(path_or_buf='../data/processed/df_reduced_with_sentiment.csv')