# Proyecto 1: Revisión de sentimientos en comentarios de libros
## Integrantes:
* Natalia Sanabria Forero - 201532265
* Jorge Andrés Esguerra Alarcón - 201719920
* Christian Forigua - 201713023

In [5]:
# Librería para manejar las contracciones que se presentan en el inglés.
!conda deactivate
!pip install contractions
!pip install numpy
!pip install joblib
# librería para manejar las flexiones gramaticales en el idioma inglés.
!pip install inflect
!pip install pandas-profiling==2.7.1
!pip install nltk
!pip install sklearn

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting scikit-learn
  Using cached https://files.pythonhosted.org/packages/3f/a4/9ac96921dcd7b36467ec7300ab1f9f5c98cb1a96fea35de467deae493c71/scikit_learn-1.0-cp37-cp37m-win_amd64.whl
Collecting threadpoolctl>=2.0.0
  Using cached https://files.pythonhosted.org/packages/c6/e8/c216b9b60cbba4642d3ca1bae7a53daa0c24426f662e0e3ce3dc7f6caeaa/threadpoolctl-2.2.0-py3-none-any.whl
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1321 sha256=c6eb3c7880004ba5a412159df44c2cfdaa7e4fa073058ecb5ca9eac0f3be74e7
  Stored in directory: C:\Users\forid\AppData\Local\pip\Cache\wheels\76\03\bb\589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully b

## 1. Librerías para el procesamiento de texto

In [12]:
# Procesamiento de Lenguaje Natural
import nltk
nltk.download('punkt') # Separar texto en frases
nltk.download('stopwords') # Palabras vacías
nltk.download('wordnet') # Entender el significado de las palabras

# Otras librerías
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import sys
from pandas_profiling import ProfileReport

import re, string, unicodedata
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, confusion_matrix, plot_precision_recall_curve
from sklearn.base import BaseEstimator, ClassifierMixin

import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\forid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\forid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\forid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Perfilamiento y entendimiento de los datos
### Lectura de los datos

In [14]:
data = pd.read_csv('./kindle_reviews.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


### Entendimiento de los datos

In [None]:
data.describe()

In [None]:
data['overall'].value_counts().plot(kind = 'bar')
plt.title('Distribución de los puntajes de los productos')
plt.xlabel('Ranking')
plt.xticks(rotation = 0)
plt.ylabel('# Registros')
plt.show()

In [None]:
df_data = (data.isna().sum()).to_frame()
df_data.sort_values(0, ascending = False)

In [None]:
reviews = data[['reviewText', 'overall','summary']]
df_data = (reviews.isna().sum()).to_frame()
df_data.sort_values(0, ascending = False)

In [None]:
reviews.dropna(inplace = True)
reviews

## 3. Preparación de los datos
### 3.1 Limpieza de los datos

In [None]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_words.append(words.lower())
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    stop_words = set(stopwords.words('english'))
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words

def preprocessing(words):
    words = to_lowercase(words)
    words = replace_numbers(words)
    words = remove_punctuation(words)
    words = remove_non_ascii(words)
    words = remove_stopwords(words)
    return words

### 3.2 Tokenización

In [None]:
# Se preparan los reviews para la tokenización
reviews['reviewText'] = reviews['reviewText'].apply(contractions.fix)
reviews['summary'] = reviews['summary'].apply(contractions.fix)

Se decidió guardar la columna "summary" ya que podría aportar información a la representación de los reviews. Para verificar lo anterior, se va a crear una representación que solo tiene en cuenta la columna "reviewText" y otra que va tener las columnas "reviewText" y "Summary" concatenadas.

In [None]:
reviews_summary = reviews.copy()
reviews_summary['reviewText'] = reviews_summary['reviewText'] + ' ' + reviews_summary['summary']
reviews_summary = reviews_summary[['reviewText', 'overall']]
reviews = reviews[['reviewText', 'overall']]

In [None]:
reviews_summary['words'] = reviews_summary['reviewText'].apply(word_tokenize).apply(preprocessing)
reviews['words'] = reviews['reviewText'].apply(word_tokenize).apply(preprocessing) #Aplica la eliminación del ruido
reviews.head()

### 3.3 Normalización

In [None]:
def stem_words(words):
    """Stem words in list of tokenized words"""
    lancaster=LancasterStemmer()
    new_words =[]
    for word in words:
        new_words.append(lancaster.stem(word))
    return new_words    

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems + lemmas

In [35]:
# Dimensiones de los datos
rows, cols = reviews.shape
print(f"Reviews filas: {rows}, columnas: {cols}")
rows, cols = reviews_summary.shape
print(f"Reviews + Summary filas: {rows}, columnas: {cols}")

Reviews filas: 982596, columnas: 2
Reviews + Summary filas: 982596, columnas: 2
