## 1. Preparacion  y preprocesamiento de datos

In [12]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

In [4]:
df = pd.read_csv('DataTwitter.csv',encoding ='ISO-8859-1', header=None)

In [6]:
# Descargamos los recursos de NLTK que vamos a usar
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/camila/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/camila/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
# Limpieza Nivel 1 y 2
# Cambiamos el nombre de las columnas
df.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
# Eliminamos las columnas que no vamos a usar
df = df.drop(['date', 'flag', 'sentiment'], axis=1)

In [9]:
df.head(3)

Unnamed: 0,id,user,text
0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,scotthamilton,is upset that he can't update his Facebook by ...
2,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...


In [11]:

stop_words =  set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

In [13]:
# Función de preprocesamiento
def preprocess_text(text):
    # Removemos las palabras que empiezan con '@' y las URLs
    text = re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', text)
    # Tokenización
    tokens = word_tokenize(text)
    # Filtrar Stopwords y Stemming
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words and token.isalpha()]
    return " ".join(filtered_tokens)

In [14]:
df['processed_text'] = df['text'].apply(preprocess_text)

In [15]:
df.head(3)

Unnamed: 0,id,user,text,processed_text
0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww bummer you shoulda got david carr third d...
1,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,i dive mani time ball manag save the rest go b...


In [16]:
# Dado que el proceso tarda mucho, guardamos el dataset en un csv
df.to_csv('DataTwitter_processed.csv', index=False)


In [17]:
df_processed = pd.read_csv('DataTwitter_processed.csv')

In [18]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 4 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   id              1600000 non-null  int64 
 1   user            1600000 non-null  object
 2   text            1600000 non-null  object
 3   processed_text  1593835 non-null  object
dtypes: int64(1), object(3)
memory usage: 48.8+ MB


Luego de la limpiza la columna procesada tiene 6,165 nulos. Dado que ya no aportan, se eliminan.

In [19]:
df_processed = df_processed.dropna(subset=['processed_text'])


In [20]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1593835 entries, 0 to 1599999
Data columns (total 4 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   id              1593835 non-null  int64 
 1   user            1593835 non-null  object
 2   text            1593835 non-null  object
 3   processed_text  1593835 non-null  object
dtypes: int64(1), object(3)
memory usage: 60.8+ MB
