In [1]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
from dateutil.parser import parse
from datetime import datetime
from tqdm import tqdm
# ------------  PREPROCESSING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#-------------- TRANSFORMERS --------------
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
import evaluate
from evaluate import load
from transformers import Conversation
# Restrict transformers' logging to ERROR level so info/warning messages stay out of cell outputs.
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm
2023-11-02 22:41:50.564043: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-02 22:41:50.566311: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 22:41:50.603289: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-02 22:41:50.604671: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the reduced review dataset and print a summary of its columns and dtypes.
reduced_csv_path = '../data/processed/df_reduced.csv'
df_reduced = pd.read_csv(reduced_csv_path)
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552639 entries, 0 to 552638
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  552639 non-null  int64 
 1   num_row     552639 non-null  int64 
 2   text        552639 non-null  object
dtypes: int64(2), object(1)
memory usage: 12.6+ MB


In [3]:
# Drop the stray 'Unnamed: 0' column that came in with the CSV (presumably a saved index).
# errors='ignore' keeps this cell re-runnable: a second execution no longer raises KeyError
# once the column is already gone.
df_reduced = df_reduced.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Preview ten random reviews; the fixed random_state makes the displayed rows reproducible
# across Restart & Run All.
df_reduced.sample(10, random_state=42)

Unnamed: 0,num_row,text
12431,12431,Animal Farm is a book I'm not used to reading....
133978,133978,When Sarah Thomas and Sophia Rizzo are working...
418692,418692,Sometimes it pays to re read the books you HAD...
534445,534445,I found this very informative. It is very well...
143126,143126,"Absolutely wonderful reading, you'll never los..."
448841,448841,This was a book that really built my love of A...
521131,521131,"After reading ""Interview"" I was mesmerized abo..."
257631,257631,I was required to read &quot;The Jungle&quot; ...
513634,513634,The story was told by an experienced storytell...
541998,541998,"In his second of three books, 'GO-KYU: Princip..."


In [5]:
# Fetch NLTK's stopword corpus (reports "already up-to-date" when it is present locally);
# stopwords.words('english') is used further down to clean the reviews.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def clean_stopwords(text: str, stop_dict: "Iterable[str]") -> "Optional[str]":
    """Remove stopwords from a whitespace-tokenised string.

    Parameters
    ----------
    text : str or None
        Text to filter. Tokens are compared with the stopwords exactly as
        written (case-sensitive, punctuation included), so any normalisation
        must happen before calling this function.
    stop_dict : iterable of str
        Words to drop. A list or tuple is converted to a frozenset once per
        call so each token lookup is O(1) instead of a linear scan; sets,
        frozensets and dicts are used as-is.

    Returns
    -------
    str or None
        The remaining tokens joined by single spaces, or None when ``text``
        is missing (None or a float NaN, the marker pandas uses for missing
        values in object columns).
    """
    # Missing values pass through as None instead of crashing on .split();
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return None

    # Sequences make `in` a linear scan per token; hash them once up front.
    if isinstance(stop_dict, (list, tuple)):
        stop_dict = frozenset(stop_dict)

    return ' '.join(word for word in text.split() if word not in stop_dict)

In [7]:
# 3.10 Lower-case the reviews and remove English stopwords
# A set gives O(1) membership tests; the list returned by stopwords.words() would be
# scanned linearly for every token of every review.
stop_dict = set(stopwords.words('english'))
df_reduced['text_clean'] = (
    df_reduced['text']
    .str.lower()
    # Series.apply forwards extra keyword arguments to the function, so no lambda is needed.
    .apply(clean_stopwords, stop_dict=stop_dict)
)

In [8]:
# 3.12 Remove punctuation and other special characters from the cleaned text
special_chars_pattern = r'''[!.,():\-%$/'"‘]'''
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(
    pat=special_chars_pattern,
    repl='',
    regex=True,
)

In [9]:
# 3.13 Remove every run of digits from the cleaned text
digits_pattern = r'[\d]+'
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(
    pat=digits_pattern,
    repl='',
    regex=True,
)

In [10]:
# The raw review text is no longer needed once 'text_clean' exists.
df_reduced = df_reduced.drop(columns=['text'])

In [11]:
# 3.14 Preview the first ten cleaned reviews
df_reduced.head(n=10)

Unnamed: 0,num_row,text_clean
0,0,pretty good book clear content say something r...
1,1,terry pratchetts first novel the carpet people...
2,2,must around capable artist basil hallward bri...
3,3,first read book early teens reread fourth time...
4,4,patrick kelsey learns woman replace retired co...
5,5,someone know receives diagnosis prostate cance...
6,6,simple point chalk full information herbs uses...
7,7,read book day & half interesting reading belie...
8,8,bought book biological anthropology class text...
9,9,slaughterhouse slaughterhouse five childrens ...


In [12]:
# Fetch the VADER lexicon (reports "already up-to-date" when it is present locally);
# it is the resource behind the SentimentIntensityAnalyzer created in the next cell.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [13]:
# One shared VADER analyzer instance, reused by the scoring cells that follow.
vaderSentimentAnalyzer = SentimentIntensityAnalyzer()

In [14]:
# Spot-check VADER on a single cleaned review (row at position 67).
sample_review = df_reduced['text_clean'].iloc[67]
vaderSentimentAnalyzer.polarity_scores(sample_review)

{'neg': 0.162, 'neu': 0.568, 'pos': 0.27, 'compound': 0.7579}

In [15]:
def _vader_scores(body):
    """Return VADER's polarity-score dict for one review, coercing the value to str first."""
    return vaderSentimentAnalyzer.polarity_scores(str(body))


# Score every cleaned review; each cell of 'scores' holds the full VADER result dict.
df_reduced['scores'] = df_reduced['text_clean'].apply(_vader_scores)
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores
0,0,pretty good book clear content say something r...,"{'neg': 0.076, 'neu': 0.45, 'pos': 0.474, 'com..."
1,1,terry pratchetts first novel the carpet people...,"{'neg': 0.071, 'neu': 0.776, 'pos': 0.153, 'co..."
2,2,must around capable artist basil hallward bri...,"{'neg': 0.119, 'neu': 0.642, 'pos': 0.24, 'com..."
3,3,first read book early teens reread fourth time...,"{'neg': 0.146, 'neu': 0.699, 'pos': 0.154, 'co..."
4,4,patrick kelsey learns woman replace retired co...,"{'neg': 0.158, 'neu': 0.621, 'pos': 0.221, 'co..."


In [16]:
# Keep only the 'compound' entry of each VADER result dict as its own column.
df_reduced['compound_sentiment'] = [
    review_scores['compound'] for review_scores in df_reduced['scores']
]
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores,compound_sentiment
0,0,pretty good book clear content say something r...,"{'neg': 0.076, 'neu': 0.45, 'pos': 0.474, 'com...",0.9948
1,1,terry pratchetts first novel the carpet people...,"{'neg': 0.071, 'neu': 0.776, 'pos': 0.153, 'co...",0.9823
2,2,must around capable artist basil hallward bri...,"{'neg': 0.119, 'neu': 0.642, 'pos': 0.24, 'com...",0.997
3,3,first read book early teens reread fourth time...,"{'neg': 0.146, 'neu': 0.699, 'pos': 0.154, 'co...",0.128
4,4,patrick kelsey learns woman replace retired co...,"{'neg': 0.158, 'neu': 0.621, 'pos': 0.221, 'co...",0.4624


In [17]:
# Disabled: bucket the compound score into POS / NEUTRAL / NEG labels.
# df_reduced['sentiment']=''
# df_reduced.loc[df_reduced.compound_sentiment>0,'sentiment']='POS'
# df_reduced.loc[df_reduced.compound_sentiment==0,'sentiment']='NEUTRAL'
# df_reduced.loc[df_reduced.compound_sentiment<0,'sentiment']='NEG'

In [18]:
# The per-review score dicts are redundant once 'compound_sentiment' has been extracted.
df_reduced = df_reduced.drop(columns=['scores'])
# Fixed random_state so the previewed rows are the same on every run.
df_reduced.sample(10, random_state=42)

Unnamed: 0,num_row,text_clean,compound_sentiment
239503,239503,naked death another success jd robb continuing...,0.2718
180568,180568,bill bailey makes audio cassette version come ...,0.8479
95558,95558,ready really get serious character development...,0.4976
461754,461754,[] []and love book infact one favorites ophie ...,0.9775
253021,253021,veteran dozen books africa quite likely finest...,0.7717
383420,383420,read naturalist recently finished the social c...,0.9432
427497,427497,spirits strengh lifted momentary warmth stood ...,0.9545
435869,435869,emma woodhouse handsome clever rich comfortabl...,0.9988
227822,227822,failure appear lonewolf seattle homicide detec...,-0.9776
226825,226825,perhaps first book ive read deals remnants lon...,0.6124


In [19]:
# Persist the cleaned text together with its compound sentiment score.
# NOTE(review): to_csv writes the index by default, so anyone reading this file back will see
# an extra 'Unnamed: 0' column (as happened with the input CSV above). Consider index=False
# once downstream readers are confirmed not to rely on that column.
df_reduced.to_csv('../data/processed/df_reduced_with_sentiment.csv')