# Data Exploration
This notebook loads and explores the dataset.

## Imports

In [None]:
!pip install -q textblob
!python -m textblob.download_corpora
import nltk
nltk.download('stopwords')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from random import randint
import re
from nltk.corpus import stopwords
from textblob.classifiers import NaiveBayesClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# Horizontal rule (100 dashes) printed around sample texts.
separator = '-' * 100

## Load dataset

In [3]:
# Alternative path: "../data/TRAINING_DATA.txt"
file_path = "TRAINING_DATA.txt"

# The file is tab-separated with no header row: first column is the label,
# second column is the text. Columns are reordered to (text, label) right away.
# NOTE(review): read_csv's default quote handling is active and the texts
# contain '"' characters -- confirm no rows are merged or truncated
# (quoting=csv.QUOTE_NONE would disable it).
data = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=['label', 'text'],
)[['text', 'label']]
data.head(10)


Unnamed: 0,text,label
0,"Cuando conocí a Janice en 2013 , una familia n...",1
1,Hwang habló en Sur de este año por Southwest M...,0
2,Usted podría pensar Katy Perry y Robert Pattin...,1
3,Cualquiera que haya volado los cielos del crea...,1
4,"Bueno , este cantante tendrá un LARGO tiempo p...",1
5,"Ya en octubre de 1940 , se registra , Hergé re...",0
6,Yo no creo que debamos seguir viéndonos es b...,0
7,Joe y Teresa Giudice están en peligro de perde...,0
8,"Un día más tarde , historias surgieron sobre e...",0
9,"Él viajó a Las Vegas , días después de cumplir...",0


In [4]:
# Feature column (raw text) and target column (label) of the loaded frame.
X, y = data["text"], data["label"]

def print_text(feature, label, idx=None):
    """Print one sample and its label between two separator lines.

    Args:
        feature: Indexable collection of texts. When ``idx`` is None it must
            also expose ``shape[0]`` (e.g. a pandas Series or numpy array).
        label: Collection of labels, indexable by the same keys as ``feature``.
        idx: Key of the sample to show. When None, a random key in
            ``[0, feature.shape[0] - 1]`` is drawn.

    Prints "Can't print email contents." instead of raising when the sample
    cannot be looked up.
    """
    try:
        print(separator)
        if idx is None:
            # randint includes BOTH endpoints, so the upper bound must be
            # n - 1; using n would occasionally pick a non-existent row.
            idx = randint(0, feature.shape[0] - 1)
        print(f"[{idx}]", feature[idx], "-->", label[idx])
        print(separator)
    except (KeyError, IndexError, ValueError):
        # KeyError / IndexError: idx is not present in feature or label.
        # ValueError: empty input, randint was given an empty range.
        # Anything else (typos, KeyboardInterrupt, ...) is left to propagate.
        print("Can't print email contents.")

# Show one randomly chosen raw sample together with its label.
print_text(feature=X, label=y)

----------------------------------------------------------------------------------------------------
[1904] Ver El Teen Mom Cuelgue Con Su Hija y fiesta con sus amigas AQUÍ ! --> 0
----------------------------------------------------------------------------------------------------


## Cleaning/Preprocessing dataset

In [5]:
def clean_text(text):
    """Normalize a raw sentence: keep letters only, collapse spaces, lowercase.

    Args:
        text: Raw input string.

    Returns:
        The text with every character other than ASCII letters, Spanish
        accented vowels, ``ñ``/``Ñ``, ``ü``/``Ü`` and whitespace removed,
        whitespace runs collapsed to a single space, lowercased and stripped.
    """
    # Keep only letters and whitespace. The whitelist includes ü/Ü so that
    # words such as "pingüino" or "vergüenza" are not silently corrupted.
    # Digits are outside the whitelist, so no separate number-removal pass
    # is needed.
    text = re.sub(r'[^A-Za-zÁÉÍÓÚÜáéíóúüÑñ\s]', '', text)

    # Substitute multiple spaces (left behind by removed tokens) with one
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase and trim the edges
    return text.lower().strip()

# apply cleaning
X_clean = X.apply(clean_text)

# Compare one random sample before and after cleaning.
# randint includes both endpoints, so the last valid position is len(X) - 1.
idx = randint(0, len(X) - 1)
print_text(X, y, idx)
print_text(X_clean, y, idx)

    

----------------------------------------------------------------------------------------------------
[11374] Eso es algo que quiero decir muy bruscamente justo al principio . --> 1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
[11374] eso es algo que quiero decir muy bruscamente justo al principio --> 1
----------------------------------------------------------------------------------------------------


## Remove Stopwords

In [6]:
stopwords_sp = stopwords.words("spanish")
print(len(stopwords_sp), "spanish stopwords")  # 313 stopwords for the spanish language

# Hashed copy used for membership tests: a set lookup is O(1) per token,
# whereas `word in list` scans the whole stopword list for every token.
# `stopwords_sp` itself is kept as a list for any code that expects one.
_stopwords_sp_set = frozenset(stopwords_sp)

def remove_stopwords(text):
    """Drop Spanish stopwords from a whitespace-tokenized string.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.

    Returns:
        The remaining tokens joined with single spaces. Matching is exact
        and case-sensitive against the entries of ``stopwords_sp``.
    """
    return ' '.join(word for word in text.split() if word not in _stopwords_sp_set)

# apply removing stopwords
X_clean = X_clean.apply(remove_stopwords)

# check results on one random sample (raw text vs. processed text)
# randint includes both endpoints, so the last valid position is len(X) - 1.
idx = randint(0, len(X) - 1)
print_text(X, y, idx)
print_text(X_clean, y, idx)


313 spanish stopwords
----------------------------------------------------------------------------------------------------
[9250] En las palabras inmortales del patriota escocés Mel Gibson , " usted puede tomar sus vidas , pero nunca tomar sus pompones ! --> 1
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
[9250] palabras inmortales patriota escocés mel gibson usted puede tomar vidas nunca tomar pompones --> 1
----------------------------------------------------------------------------------------------------


## Split Dataset

In [7]:
# Stratified split with 30% held out for testing and a fixed seed. Each of the
# four resulting pieces gets its index reset so it is numbered from 0.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X_clean,
        y,
        test_size=0.3,
        random_state=42,
        stratify=data['label'],
    )
)

# Shapes of the train pair, then the test pair.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(10446,) (10446,)
(4478,) (4478,)
