<a href="https://colab.research.google.com/github/caroacostatovany/sentiment_analysis_tweets/blob/main/notebooks/limpieza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the project's `nlptweet` package from the `src` subdirectory of the GitHub repo.
# NOTE(review): the Colab badge at the top of this notebook links to a different GitHub
# account than this install URL — confirm which fork is intended.
!pip install "git+https://github.com/ElenaVillano/sentiment_analysis_tweets.git#egg=nlptweet&subdirectory=src" --quiet

  Building wheel for nlptweet (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import datetime
# Nuestro paquete
import nlp

In [3]:
# Column names assigned to the raw CSV when it is loaded.
col_names = [
    'target',  # Tweet polarity: 0 = negative, 2 = neutral, 4 = positive
    'ids',     # Tweet ID
    'date',    # Date and time of the tweet
    'flag',    # Query
    'user',    # User who posted the tweet
    'text',    # Tweet text
]

# Carga y limpieza de datos

In [None]:
# Colab only: open a file picker so the CSV can be uploaded into the runtime.
# Outside Colab the `google.colab` package does not exist, so the import is
# guarded; the cell then becomes a no-op instead of raising, and the CSV is
# expected to already be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = None
else:
    uploaded = files.upload()

Saving training.1600000.processed.noemoticon.csv to training.1600000.processed.noemoticon.csv


In [None]:
# Read the uploaded CSV. Column names are supplied explicitly via `names`,
# so pandas treats the first line of the file as data rather than a header.
data = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=col_names,
    encoding='latin-1',
)

In [None]:
# (rows, columns) of the loaded frame.
print(data.shape)

In [None]:
# Example: a fixed set of rows used to spot-check the data.
data.loc[[4,8,27,41,44,35,48,155]]

In [None]:
# Check whether there are null values: prints the per-column null count.
print("Revisamos si hay valores nulos en el set de entrenamiento\n", data.isna().sum())

In [None]:
# Keep the original tweet text in its own column before 'text' is overwritten
# by the cleaning steps.
data['raw_text'] = data['text']

## Convertimos a minúsculas

In [None]:
from nlp.preprocessing import convierte_a_minusculas

In [None]:
# Lowercasing step: the helper receives the whole frame and its result replaces `data`.
# NOTE(review): presumably it lowercases the 'text' column — confirm in nlp.preprocessing.
data = convierte_a_minusculas(data)

In [None]:
# Example: same fixed rows, to inspect the result of the lowercasing step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitamos caracteres html como " , < > y &


In [None]:
from nlp.preprocessing import quitar_caracteres_html

In [None]:
# Run every tweet through quitar_caracteres_html; the helper is passed
# directly to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_caracteres_html)

In [None]:
# Example: same fixed rows, to inspect the result of the HTML-character step.
data.loc[[4,8,27,41,44,35,48,155]]

## Reemplazamos las URLs con el texto "URL"

In [None]:
from nlp.preprocessing import reemplazar_urls

In [None]:
# Run every tweet through reemplazar_urls; the helper is passed directly
# to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(reemplazar_urls)

In [None]:
# Example: same fixed rows, to inspect the result of the URL replacement step.
data.loc[[4,8,27,41,44,35,48,155]]

## Reemplazamos las menciones @ con la palabra "USER_MENTION"



In [None]:
from nlp.preprocessing import reemplazar_usuarios

In [None]:
# Run every tweet through reemplazar_usuarios; the helper is passed
# directly to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(reemplazar_usuarios)

In [None]:
# Example: same fixed rows, to inspect the result of the mention replacement step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitamos el Hashtag, pero dejamos la palabra

In [None]:
# The regular expression used to match hashtags is #(\S+).
from nlp.preprocessing import quitar_hashtag

In [None]:
# Run every tweet through quitar_hashtag; the helper is passed directly
# to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_hashtag)

In [None]:
# Example: same fixed rows, to inspect the result of the hashtag step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitamos los Retweets, sólo la palabra RT, dejamos el comentario

In [None]:
#  The regular expression used to match retweets is \brt\b.
from nlp.preprocessing import quitar_RT

In [None]:
# Run every tweet through quitar_RT; the helper is passed directly to
# Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_RT)

In [None]:
# Same fixed rows, to inspect the result of the RT removal step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitamos espacios o puntos extras

In [None]:
# Intended cleanup: replace runs of two or more dots with a space, collapse two or
# more spaces into one, and strip surrounding spaces and quotes.
# TODO(review): open question from the original note — should punctuation
# [’"?!,.():;-'] also be stripped from each word?
from nlp.preprocessing import quitar_caracteres_especiales

In [None]:
# Run every tweet through quitar_caracteres_especiales; the helper is passed
# directly to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_caracteres_especiales)

In [None]:
# Same fixed rows, to inspect the result of the extra spaces/dots step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitamos expresiones con letras repetidas

In [None]:
#Convert 2 or more letter repetitions to 2 letters.
#Some people send tweets like I am sooooo
#happpppy adding multiple characters to emphasize
#on certain words. This is done to handle such tweets
#by converting them to I am soo happy
from nlp.preprocessing import quitar_letras_repetidas

In [None]:
# Run every tweet through quitar_letras_repetidas; the helper is passed
# directly to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_letras_repetidas)

In [None]:
# Same fixed rows, to inspect the result of the repeated-letters step.
data.loc[[4,8,27,41,44,35,48,155]]

## Quitar caracteres nonascii

In [None]:
from nlp.preprocessing import quitar_nonascii

In [None]:
# Run every tweet through quitar_nonascii; the helper is passed directly
# to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(quitar_nonascii)

In [None]:
# Same fixed rows, to inspect the result of the non-ASCII step.
data.loc[[4,8,27,41,44,35,48,155]]

## Removemos stopwords

In [None]:
from nlp.preprocessing import separar_abreviaciones

In [None]:
# Run every tweet through separar_abreviaciones; the helper is passed
# directly to Series.map instead of being wrapped in a pass-through lambda.
# NOTE(review): presumably this splits/expands abbreviations so the stopword
# step can match them — confirm in nlp.preprocessing.
data['text'] = data['text'].map(separar_abreviaciones)

In [None]:
# Same fixed rows, to inspect the result of separar_abreviaciones.
data.loc[[4,8,27,41,44,35,48,155]]

In [None]:
from nlp.preprocessing import remove_stopwords

In [None]:
# Run every tweet through remove_stopwords; the helper is passed directly
# to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(remove_stopwords)

In [None]:
# Same fixed rows, to inspect the result of the stopword removal step.
data.loc[[4,8,27,41,44,35,48,155]]

## Stemming con NLTK

In [None]:
from nlp.preprocessing import oracion_raiz

In [None]:
# Run every tweet through oracion_raiz (the stemming step); the helper is
# passed directly to Series.map instead of being wrapped in a pass-through lambda.
data['text'] = data['text'].map(oracion_raiz)

In [None]:
# Same fixed rows, to inspect the result of the stemming step.
data.loc[[4,8,27,41,44,35,48,155]]

## Cambiamos la etiqueta a 0 y 1

In [None]:
from nlp.preprocessing import modificar_etiqueta

In [None]:
# Label remapping step: the helper receives the whole frame and its result replaces `data`.
# NOTE(review): presumably it maps the 'target' polarity values to 0/1 — confirm in nlp.preprocessing.
data = modificar_etiqueta(data)

In [None]:
# Display the cleaned frame (pandas truncates the rendered output for large frames).
data

In [None]:
# Persist the cleaned frame to CSV in the working directory.
# With the default index=True the row index is also written, as an unnamed first column.
data.to_csv("1600000_datos_limpios_sin_user_ni_url.csv")

### Escogemos 100,000 tweets del mismo largo de palabras

In [None]:
# Whitespace-split each cleaned tweet into a list of tokens.
data['tokens'] = data['text'].map(str.split)

In [None]:
# Number of tokens per tweet.
data['len_tokens'] = data['tokens'].map(len)

In [None]:
# Order tweets from longest to shortest (by token count) and renumber the rows
# from 0, discarding the old index rather than materialising it as a column.
data_new = (
    data
    .sort_values('len_tokens', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Discard tweets that ended up with no tokens after cleaning.
has_tokens = data_new['len_tokens'] != 0
data_new = data_new.loc[has_tokens]

In [None]:
# Display the length-sorted frame (pandas truncates the rendered output for large frames).
data_new

In [None]:
# Bar chart of how many tweets have each token count.
# DataFrame.plot.bar(x='len_tokens') without `y` draws one bar group per row,
# using every numeric column as a series; that is not a length distribution
# and is impractical on a large frame, so the counts are aggregated first.
data_new['len_tokens'].value_counts().sort_index().plot.bar()