In [1]:
import nltk   
import spacy            
import re     
import string            
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
from nltk.corpus import twitter_samples    # Corpus Twitter
from nltk.tokenize import word_tokenize 
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Lectura de Corpus

In [2]:
# Fetch the NLTK `twitter_samples` corpus that the next cell reads from
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [3]:
# Raw tweet texts from the NLTK twitter_samples corpus: positive file first, negative second
pos_tweets, neg_tweets = (
    twitter_samples.strings(_corpus_file)
    for _corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

# Report the size of each class
for _label, _tweets in (("Positive tweets: ", pos_tweets), ("Negative tweets: ", neg_tweets)):
    print(_label, len(_tweets))

Positive tweets:  5000
Negative tweets:  5000


Procesamiento


1. LowerCase
2. Lematización / Stemming
3. Remover stopwords
4. Remover signos de puntuación
5. Remover urls y manejadores





In [4]:
def custom_tokenizer(nlp):
    """Build a spaCy ``Tokenizer`` for tweets on top of the pipeline defaults.

    The tokenizer shares ``nlp.vocab`` and is configured with:
      * special-case rules registering ":)" and ":(" with themselves as ORTH,
      * the pipeline's default prefix patterns plus one extra leading
        punctuation/symbol character class,
      * the pipeline's default suffix patterns plus a trailing run of hyphens,
      * a URL matcher for strings that start with ``http://`` or ``https://``.

    No infix rule is passed to the ``Tokenizer`` constructor.

    Args:
        nlp: a loaded spaCy pipeline; only ``nlp.vocab`` and ``nlp.Defaults``
            are read here.

    Returns:
        A new ``spacy.tokenizer.Tokenizer`` instance.
    """
    # Extra affix patterns appended to the language defaults
    extra_prefix = r'^[\-\—\–\+\+\.\!\/\,\"\(\)\[\]\{\}\:\;\<\>\?\¿\¡\|\&\#\@\$\%\^\*\_\\\'\`\~]'
    extra_suffix = r'''-+$'''

    suffix_search = spacy.util.compile_suffix_regex(
        nlp.Defaults.suffixes + [extra_suffix]
    ).search
    prefix_search = spacy.util.compile_prefix_regex(
        nlp.Defaults.prefixes + [extra_prefix]
    ).search

    emoticon_rules = {":)": [{"ORTH": ":)"}], ":(": [{"ORTH": ":("}]}
    url_pattern = re.compile(r'''^https?://''')

    return spacy.tokenizer.Tokenizer(
        nlp.vocab,
        rules=emoticon_rules,
        suffix_search=suffix_search,
        prefix_search=prefix_search,
        url_match=url_pattern.match,
    )

# Load the small English spaCy pipeline, then swap in the tweet-aware
# tokenizer built by custom_tokenizer() so every later nlp(...) call uses it.
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)


In [5]:
def normalization(data, regularization="lemma", language='english'):
  """Clean and tokenize a collection of tweets.

  For every tweet the function strips a leading "RT" marker, http(s) links,
  '#' characters, @mentions and digit runs, collapses repeated spaces, and
  then tokenizes according to `regularization`:

    * "stem"  -> NLTK TweetTokenizer tokens passed through the Porter stemmer
    * "lemma" -> spaCy lemmas produced by the module-level `nlp` pipeline
    * other   -> raw spaCy token texts

  Finally, stop words, single-space tokens and punctuation are filtered out.

  Args:
    data: iterable of tweet strings.
    regularization: "stem", "lemma" (default) or any other value for plain
      tokens.
    language: name of the NLTK stop-word list to load.

  Returns:
    A list with one list of token strings per input tweet, in input order.

  NOTE(review): tokens are not lower-cased here before the stop-word check.
  """
  # A set gives constant-time membership tests; contents equal the NLTK list.
  stopwords = set(nltk.corpus.stopwords.words(language))
  ps = PorterStemmer()
  tweet_tokenizer = TweetTokenizer()  # hoisted: it does not depend on the tweet
  normalized_data = []

  for tweet in data:
    tweet = re.sub(r'^RT[\s]+', '', tweet)  # drop the leading retweet marker
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet)  # drop links
    tweet = re.sub(r'#', '', tweet)  # drop the hash sign, keep the tag text
    tweet = re.sub(r'@\w+', '', tweet)  # drop @handles
    tweet = re.sub(r'\d+', '', tweet)  # drop numbers
    tweet = re.sub(' +', ' ', tweet)  # collapse runs of spaces

    # The three modes are mutually exclusive. The original used a second
    # `if` instead of `elif`, so "stem" fell through to the `else` branch
    # and the stemmed tokens were overwritten by raw spaCy tokens.
    if regularization == "stem":
      tokens = [ps.stem(w) for w in tweet_tokenizer.tokenize(tweet)]
    elif regularization == "lemma":
      tokens = [token.lemma_ for token in nlp(tweet)]
    else:
      tokens = [token.text for token in nlp(tweet)]

    # `w not in string.punctuation` is a substring test on the punctuation
    # string, so it also discards empty tokens and adjacent runs like ":;".
    normalized_tweets = [
      w for w in tokens
      if w not in stopwords and not w == ' ' and w not in string.punctuation
    ]
    normalized_data.append(normalized_tweets)
  return normalized_data

Create Vocabulary and frequency dictionaries

In [6]:
# Normalize each class separately, then stack positives before negatives
norm_pos = normalization(pos_tweets)
norm_neg = normalization(neg_tweets)
all_tweets = norm_pos + norm_neg

# Flatten the per-tweet token lists into a single token stream
at = []
for _tweet_tokens in all_tweets:
    at.extend(_tweet_tokens)

# Token frequencies; the vocabulary is the alphabetically sorted set of distinct tokens
fd = nltk.FreqDist(at)
vocabulary = sorted(fd)

print('\nThe vocabulary has %d words.\n' % len(vocabulary))


The vocabulary has 12428 words.



In [7]:
# Bag-of-words count matrix: one row per tweet, one column per vocabulary word,
# preceded by a constant 1 in column 0 (presumably a bias/intercept term).
#
# The original called text.count(voc) for every vocabulary word, rescanning
# each tweet len(vocabulary) times. Here each tweet is walked once and the
# matching column is incremented, which yields the same rows.
column_of = {word: idx for idx, word in enumerate(vocabulary, start=1)}
n_columns = len(vocabulary) + 1

X_features = []
for text in all_tweets:
  vector = [0] * n_columns
  vector[0] = 1
  for token in text:
    col = column_of.get(token)
    if col is not None:  # tokens outside the vocabulary are ignored, as before
      vector[col] += 1
  X_features.append(vector)

print('\nX_features matrix has m = %d examples (rows).\n' %len(X_features))
print('and  n = %d features (columns).\n' %len(X_features[0]))


X_features matrix has m = 10000 examples (rows).

and  n = 12429 features (columns).



In [8]:
# Labels aligned with all_tweets: 1 for each positive tweet, then 0 for each negative tweet
tags = [1 for _ in range(len(pos_tweets))] + [0 for _ in range(len(neg_tweets))]

In [9]:
from sklearn.decomposition import PCA

In [10]:
# Reduce the bag-of-words matrix to 2500 principal components.
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so the projection (and every score derived from it) is repeatable.
# NOTE(review): PCA is fitted on all samples before the train/test split, so
# the held-out tweets influence the projection — consider fitting on the
# training portion only.
pca = PCA(n_components=2500, random_state=42)
X_features_pca = pca.fit_transform(X_features)

In [19]:
# Share of the total variance NOT captured by the retained components.
n_components = 2500
suma = np.sum(pca.explained_variance_ratio_[:n_components])
# Scale to a percentage first and round last: rounding before multiplying by
# 100 can reintroduce float noise (e.g. 7.000000000000001) in the printed value.
print("Perdida de información es de : " + str(round((1 - suma) * 100, 2)) + " %")

Perdida de información es de : 8.01 %


In [None]:
#Principal component analysis
def compute_pca(X, k_components=2, tolerance=0.01):
    """
    Project X onto its first `k_components` principal components.

    Input:
        X: array-like of shape (m, n), with m examples (rows) and n features
           (columns).
        k_components: number of components to keep.
        tolerance: largest fraction of total variance that may be discarded
           before the warning message is printed instead of the OK message.
    Output:
        X_reduced: ndarray of shape (m, k_components) with the mean-centred
           data expressed in the basis of the leading eigenvectors.

    Side effects:
        Prints whether the information loss is within `tolerance`, followed
        by the loss as a percentage.
    """
    # Accept plain nested lists as well as arrays
    X = np.asarray(X, dtype=float)

    # Centre every feature on its mean
    X_demeaned = X - np.mean(X, axis=0, keepdims=True)

    # Feature covariance matrix; atleast_2d keeps the single-feature case 2-D
    covariance_matrix = np.atleast_2d(np.cov(X_demeaned, rowvar=False))

    # Eigen-decomposition of the symmetric covariance matrix
    eigen_vals, eigen_vecs = np.linalg.eigh(covariance_matrix, UPLO='L')

    # eigh returns ascending eigenvalues; reorder from largest to smallest
    idx_sorted = np.argsort(-eigen_vals)
    eigen_vals_sorted = eigen_vals[idx_sorted]
    eigen_vecs_sorted = eigen_vecs[:, idx_sorted]

    # Keep the k leading eigenvectors (columns) and eigenvalues
    eigen_vecs_subset = eigen_vecs_sorted[:, 0:k_components]
    eigen_vals_subset = eigen_vals_sorted[:k_components]

    # Fraction of the total variance that is discarded
    variance = 1 - (np.sum(eigen_vals_subset) / np.sum(eigen_vals))
    if variance <= tolerance:
        print("Pérdida de información aceptable")
    else:
        print("PÉRDIDA DE INFORMACIÓN IMPORTANTE - AUMENTA K-COMPONENTS")

    # Report the loss as a percentage. The original printed
    # 1 - variance*100, which is not the loss percentage.
    print("Porcentaje de pérdida de información del " + str(round(variance * 100, 4)) + "%")

    # Project the centred data onto the retained eigenvectors:
    # (E^T X^T)^T == X E, so a single matmul replaces the double transpose.
    X_reduced = np.matmul(X_demeaned, eigen_vecs_subset)

    return X_reduced

In [None]:
# Hand-rolled PCA alternative: running this cell replaces the X_features_pca
# produced by the scikit-learn PCA cell.
X_features_pca = compute_pca(
    X=X_features,
    k_components=2500,
)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [13]:
# Hold out 20% of the PCA-projected samples for evaluation; the seed fixes the split
_split = train_test_split(
    X_features_pca,
    tags,
    random_state=50,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = _split

# Display names for labels 0 and 1 in the classification reports
target_names = ['class 0', 'class 1']

In [14]:
# Decision tree classifier. random_state is set so that tie-breaking between
# equally good splits is repeatable and the reported score can be reproduced.
modeloDT = DecisionTreeClassifier(random_state=42)
# Fit on the training split
modeloDT.fit(X_train, y_train)
predicciones = modeloDT.predict(X_test)
# Mean accuracy on the held-out split
puntaje = modeloDT.score(X_test, y_test)
# Print the score and the per-class report
print("Puntaje: ", puntaje)
print(classification_report(y_test, predicciones, target_names=target_names, digits=4))

Puntaje:  0.946
              precision    recall  f1-score   support

     class 0     0.9390    0.9540    0.9465      1001
     class 1     0.9532    0.9379    0.9455       999

    accuracy                         0.9460      2000
   macro avg     0.9461    0.9460    0.9460      2000
weighted avg     0.9461    0.9460    0.9460      2000



In [15]:
# Random forest classifier. random_state seeds the bootstrap sampling and
# feature selection so the reported score can be reproduced.
modeloRF = RandomForestClassifier(random_state=42)
# Fit on the training split
modeloRF.fit(X_train, y_train)
predicciones = modeloRF.predict(X_test)
# Mean accuracy on the held-out split
puntaje = modeloRF.score(X_test, y_test)
# Print the score and the per-class report
print("Puntaje: ", puntaje)
print(classification_report(y_test, predicciones, target_names=target_names, digits=4))

Puntaje:  0.9505
              precision    recall  f1-score   support

     class 0     0.9669    0.9331    0.9497      1001
     class 1     0.9352    0.9680    0.9513       999

    accuracy                         0.9505      2000
   macro avg     0.9510    0.9505    0.9505      2000
weighted avg     0.9511    0.9505    0.9505      2000



In [16]:
# Linear-kernel support vector classifier, fitted on the training split
modeloSVC = SVC(random_state=42, kernel='linear')
modeloSVC = modeloSVC.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out split
predicciones = modeloSVC.predict(X_test)
puntaje = modeloSVC.score(X_test, y_test)

# Print the score and the per-class report
print("Puntaje: ", puntaje)
print(
    classification_report(
        y_test,
        predicciones,
        digits=4,
        target_names=target_names,
    )
)

Puntaje:  0.961
              precision    recall  f1-score   support

     class 0     0.9763    0.9451    0.9604      1001
     class 1     0.9467    0.9770    0.9616       999

    accuracy                         0.9610      2000
   macro avg     0.9615    0.9610    0.9610      2000
weighted avg     0.9615    0.9610    0.9610      2000



In [17]:
# AdaBoost classifier. random_state seeds the base estimators so the
# reported score can be reproduced.
modeloADA = AdaBoostClassifier(random_state=42)
# Fit on the training split
modeloADA.fit(X_train, y_train)
predicciones = modeloADA.predict(X_test)
# Mean accuracy on the held-out split
puntaje = modeloADA.score(X_test, y_test)
# Print the score and the per-class report
print("Puntaje: ", puntaje)
print(classification_report(y_test, predicciones, target_names=target_names, digits=4))

Puntaje:  0.9525
              precision    recall  f1-score   support

     class 0     0.9567    0.9481    0.9523      1001
     class 1     0.9484    0.9570    0.9527       999

    accuracy                         0.9525      2000
   macro avg     0.9525    0.9525    0.9525      2000
weighted avg     0.9525    0.9525    0.9525      2000



In [18]:
# k-nearest-neighbours classifier with library defaults, fitted on the training split
modeloKN = KNeighborsClassifier()
modeloKN = modeloKN.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out split
predicciones = modeloKN.predict(X_test)
puntaje = modeloKN.score(X_test, y_test)

# Print the score and the per-class report
print("Puntaje: ", puntaje)
print(
    classification_report(
        y_test,
        predicciones,
        digits=4,
        target_names=target_names,
    )
)

Puntaje:  0.9375
              precision    recall  f1-score   support

     class 0     0.9252    0.9520    0.9385      1001
     class 1     0.9505    0.9229    0.9365       999

    accuracy                         0.9375      2000
   macro avg     0.9379    0.9375    0.9375      2000
weighted avg     0.9379    0.9375    0.9375      2000

