In [1]:
import nltk   
import spacy            
import re     
import string            
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
from nltk.corpus import twitter_samples    # Corpus Twitter
from nltk.tokenize import word_tokenize 
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Lectura de Corpus

In [2]:
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [3]:
pos_tweets = twitter_samples.strings('positive_tweets.json') #tweets positivos
neg_tweets = twitter_samples.strings('negative_tweets.json') #tweets negativos

print("Positive tweets: ", len(pos_tweets))
print("Negative tweets: ", len(neg_tweets))

Positive tweets:  5000
Negative tweets:  5000


Procesamiento


1. LowerCase
2. Lematización / Stemming
3. Remover stopword
4. Remover signos de puntuación
4. Remover urls y manejadores





In [4]:
def custom_tokenizer(nlp):
    special_cases = {":)": [{"ORTH": ":)"}], ":(": [{"ORTH": ":("}]}
    simple_url_re = re.compile(r'''^https?://''')
    suffixes = nlp.Defaults.suffixes + [r'''-+$''',]
    prefixes = nlp.Defaults.prefixes + [r'^[\-\—\–\+\+\.\!\/\,\"\(\)\[\]\{\}\:\;\<\>\?\¿\¡\|\&\#\@\$\%\^\*\_\\\'\`\~]']
    suffix_regex = spacy.util.compile_suffix_regex(suffixes)
    prefixes_regex = spacy.util.compile_prefix_regex(prefixes)
    return spacy.tokenizer.Tokenizer(nlp.vocab, rules=special_cases, suffix_search=suffix_regex.search, prefix_search=prefixes_regex.search, url_match=simple_url_re.match)

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = custom_tokenizer(nlp)


In [5]:
def normalization(data, regularization="lemma", language='english'):
  stopwords = nltk.corpus.stopwords.words(language)
  ps = PorterStemmer()
  normalized_data = []
  
  for tweet in data:
    tweet = re.sub(r'^RT[\s]+', '', tweet) # identificar retweets
    tweet = re.sub(r'https?://[^\s\n\r]+', '', tweet) #eliminar links
    tweet = re.sub(r'#', '', tweet) #eliminar símbolo gato
    tweet = re.sub(r'@\w+', '', tweet) #eliminar palabras que inicias con @
    tweet = re.sub(r'\d+', '', tweet) #eliminar números
    tweet = re.sub(' +', ' ', tweet) #quitar espacios

    if regularization == "stem":
      tweetTokenizer = TweetTokenizer()
      words = tweetTokenizer.tokenize(tweet)
      tokens = [ps.stem(w) for w in words]
    if regularization == "lemma":
      doc = nlp(tweet)
      tokens = [token.lemma_ for token in doc]
    else:
      doc = nlp(tweet)
      tokens = [token.text for token in doc]
    
    normalized_tweets = [w for w in tokens if w not in stopwords and not w==' ' and w not in string.punctuation]
    normalized_data.append(normalized_tweets)
  return normalized_data

Create Vocabulary and frequency dictionaries

In [6]:
norm_pos = normalization(pos_tweets)
norm_neg = normalization(neg_tweets)
all_tweets = norm_pos + norm_neg

In [7]:
def n_grams(words:list, n_gram:int):
  if int(n_gram) == 1: return words
  return [tuple(words[i:i+int(n_gram)]) for i,w in enumerate(words) if i <= (len(words)-int(n_gram))]

In [8]:
ngrams = 2
n_grams_tweets = [n_grams(tweet, ngrams) for tweet in all_tweets]
at = [w for tweet in n_grams_tweets for w in tweet]
fd = nltk.FreqDist(at)
vocabulary = sorted(list(fd.keys()))

print('\nThe vocabulary has ' + str(len(vocabulary)) + ' ' + str(ngrams) + '-grams.\n')


The vocabulary has 40958 2-grams.



In [9]:
X_features = []
for text in all_tweets:
  vector = [1] # initialize in 1 ?
  for voc in vocabulary:
    # In vector saves a list of vocabulary's length. 
    # Iterate each vocabulary word and count in each text list
    vector.append(text.count(voc))
  X_features.append(vector)

print('\nX_features matrix has m = %d examples (rows).\n' %len(X_features))
print('and  n = %d features (columns).\n' %len(X_features[0]))


X_features matrix has m = 10000 examples (rows).

and  n = 40959 features (columns).



In [10]:
tags = [1]*len(pos_tweets) + [0]*len(neg_tweets)

In [11]:
from sklearn.decomposition import PCA

In [12]:
pca = PCA(n_components=1000)
pca.fit(X_features)
X_features_pca = pca.transform(X_features)

  self.explained_variance_ratio_ = self.explained_variance_ / total_var


In [13]:
#print(pca.explained_variance_ratio_)
n_components = 1000
suma = np.sum(pca.explained_variance_ratio_[:n_components])
print("Perdida de información es de : " + str(round(1-suma,4)) + " %")

Perdida de información es de : nan %


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_features_pca,tags,test_size=0.2, random_state=50)
target_names = ['class 0', 'class 1']

In [16]:
# Crear un clasificador de Naive Bayes
modelo = MultinomialNB()
# Entrenar el clasificador con los datos de entrenamiento
modelo.fit(X_train, y_train)
predicciones = modelo.predict(X_test)
# Evaluamos el modelo
puntaje = modelo.score(X_test, y_test)
# Imprimimos el puntaje obtenido
print("Puntaje: ", puntaje)
print(classification_report(y_test, predicciones, target_names=target_names, digits=4))

Puntaje:  0.4995
              precision    recall  f1-score   support

     class 0     0.0000    0.0000    0.0000      1001
     class 1     0.4995    1.0000    0.6662       999

    accuracy                         0.4995      2000
   macro avg     0.2497    0.5000    0.3331      2000
weighted avg     0.2495    0.4995    0.3328      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# Crear un clasificador de Naive Bayes
modelo = BernoulliNB()
# Entrenar el clasificador con los datos de entrenamiento
modelo.fit(X_train, y_train)
predicciones = modelo.predict(X_test)
# Evaluamos el modelo
puntaje = modelo.score(X_test, y_test)
# Imprimimos el puntaje obtenido
print("Puntaje: ", puntaje)
print(classification_report(y_test, predicciones, target_names=target_names, digits=4))

Puntaje:  0.4995


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     class 0     0.0000    0.0000    0.0000      1001
     class 1     0.4995    1.0000    0.6662       999

    accuracy                         0.4995      2000
   macro avg     0.2497    0.5000    0.3331      2000
weighted avg     0.2495    0.4995    0.3328      2000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
