In [1]:
# Step 0. Load libraries and custom modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
import re
from dateutil.parser import parse
from datetime import datetime
from tqdm import tqdm
# ------------  PREPROCESING -------------
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#-------------- TRANSFORMERS --------------
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
import evaluate
from evaluate import load
from transformers import Conversation
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm
2023-11-06 23:46:59.035116: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-06 23:46:59.046564: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 23:46:59.130923: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 23:46:59.131982: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df_reduced = pd.read_csv('../data/processed/df_reduced.csv')
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15425 entries, 0 to 15424
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  15425 non-null  int64 
 1   num_row     15425 non-null  int64 
 2   text        15425 non-null  object
dtypes: int64(2), object(1)
memory usage: 361.6+ KB


In [3]:
df_reduced = df_reduced.drop(['Unnamed: 0'], axis= 1)

In [4]:
df_reduced.sample(10)

Unnamed: 0,num_row,text
5027,5027,This was my first 'Kindle' read and I loved it...
7461,7461,Mr. Coates does professional-sounding arrangem...
5948,5948,Turns out my code was used 4 years ago. Don't ...
63,63,"We were so happy with the Curious George book,..."
6235,6235,"I have not finished the book yet, but far enou..."
1978,1978,Translating the Message can be seen as a long ...
7582,7582,I bought this book for one of my Life Skills s...
909,909,Ordered this book and item was shipped next da...
14357,14357,"I would recommend this product, very useful fo..."
8767,8767,I would recommend reading the series in order....


In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def clean_stopwords(text: str,stop_dict: dict)->str:
    if text is not None:
        words = text.split()
        words_clean = []
        for word in words:
            if word not in stop_dict:
                words_clean.append(word)
        result = ' '.join(words_clean)
    else:
        result = None
    return result

In [7]:
# 3.10 Process text to extract stopwords
df_reduced['text_clean'] = df_reduced['text'].str.lower()
stop_dict = stopwords.words('english')
df_reduced['text_clean'] = df_reduced['text_clean'].apply(lambda x: clean_stopwords(x, stop_dict = stop_dict))

In [8]:
# 3.12 Extract special characters
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(r'''[!.,():\-%$/'"‘]''', '', regex=True)

In [9]:
# 3.13 Extract numbers
df_reduced['text_clean'] = df_reduced['text_clean'].str.replace(r'[\d]+', '', regex=True)

In [10]:
df_reduced = df_reduced.drop(['text'],axis=1)

In [11]:
# 3.14 See the results
df_reduced.head(10)

Unnamed: 0,num_row,text_clean
0,0,odyssey two strong sequel arthur c clarkes r...
1,1,faith faith devotional awesome book quite insp...
2,2,first reading book absolutely last also read s...
3,3,book explores death penalty delves lot misconc...
4,4,finally printondemand book actually looks like...
5,5,normally enjoy contemporary romances much litt...
6,6,barbara vine marvelous author ruth rendell vin...
7,7,reflect closing year old daughters rd grade y...
8,8,think book gonna read like dan brown book gonn...
9,9,read series long kindles loved them eight inte...


In [12]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/arnaldochm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [13]:
vaderSentimentAnalyzer = SentimentIntensityAnalyzer()

In [14]:
vaderSentimentAnalyzer.polarity_scores(df_reduced.iloc[67]['text_clean'])

{'neg': 0.097, 'neu': 0.714, 'pos': 0.19, 'compound': 0.9058}

In [15]:
df_reduced['scores']=df_reduced['text_clean'].apply(lambda body: vaderSentimentAnalyzer.polarity_scores(str(body)))
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores
0,0,odyssey two strong sequel arthur c clarkes r...,"{'neg': 0.067, 'neu': 0.734, 'pos': 0.199, 'co..."
1,1,faith faith devotional awesome book quite insp...,"{'neg': 0.0, 'neu': 0.381, 'pos': 0.619, 'comp..."
2,2,first reading book absolutely last also read s...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,3,book explores death penalty delves lot misconc...,"{'neg': 0.338, 'neu': 0.611, 'pos': 0.052, 'co..."
4,4,finally printondemand book actually looks like...,"{'neg': 0.0, 'neu': 0.599, 'pos': 0.401, 'comp..."


In [16]:
df_reduced['compound_sentiment']=df_reduced['scores'].apply(lambda score_dict:score_dict['compound'])
df_reduced.head()

Unnamed: 0,num_row,text_clean,scores,compound_sentiment
0,0,odyssey two strong sequel arthur c clarkes r...,"{'neg': 0.067, 'neu': 0.734, 'pos': 0.199, 'co...",0.9956
1,1,faith faith devotional awesome book quite insp...,"{'neg': 0.0, 'neu': 0.381, 'pos': 0.619, 'comp...",0.9643
2,2,first reading book absolutely last also read s...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,3,book explores death penalty delves lot misconc...,"{'neg': 0.338, 'neu': 0.611, 'pos': 0.052, 'co...",-0.9432
4,4,finally printondemand book actually looks like...,"{'neg': 0.0, 'neu': 0.599, 'pos': 0.401, 'comp...",0.9081


In [17]:
df_reduced = df_reduced.drop(['scores'], axis=1)
df_reduced.sample(10)

Unnamed: 0,num_row,text_clean,compound_sentiment
801,801,arthurs writing book excellent like plots stor...,0.9451
2941,2941,like read excentrics great bookthe tenacity ad...,0.5267
4837,4837,figured price classic good cover would typical...,0.8689
1604,1604,saseks books visually lovely doubt it however ...,0.9907
4024,4024,book arrived student ready bible dog retrainin...,0.1027
14499,14499,read book couple years ago great intriguing re...,0.7783
7638,7638,book outstanding written stand see one review ...,0.93
8789,8789,read one book life twice make one read twice ...,-0.5994
12457,12457,read book the name rose years ago first publis...,0.9524
9294,9294,maier managed write story true roots yet also ...,0.8402


In [18]:
df_reduced.to_csv('../data/processed/df_reduced_with_sentiment.csv')