#Para el procesamiento se utilizara CountVectorizer

In [2]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ferdinand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from string import punctuation
non_words = list(punctuation)

#Puntuaciones en español
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
non_words

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '¿',
 '¡',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [4]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remueve las no letras
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

Evaluacion del modelo

In [5]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



In [6]:
import pandas as pd

tweets_corpus = pd.read_csv("TASSTrain.csv")

#Los neutros se eliminan
tweets_corpus = tweets_corpus[tweets_corpus.sentimiento != 'NEU']

tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.sentimiento.isin(['P', 'P+'])] = 1
tweets_corpus.polarity_bin.value_counts(normalize=True)
#1 positivo, 0 negativo

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


1    0.569285
0    0.430715
Name: polarity_bin, dtype: float64

In [7]:
tweets_corpus.head(10)

Unnamed: 0.1,Unnamed: 0,tweetid,content,sentimiento,polarity_bin
1,2,142391947707940864,@marodriguezb Gracias MAR,P,1
2,3,142416095012339712,"Off pensando en el regalito Sinde, la que se v...",N+,0
3,4,142422495721562112,Conozco a alguien q es adicto al drama! Ja ja ...,P+,1
4,6,142483342040907776,Toca @crackoviadeTV3 . Grabación dl especial N...,P+,1
5,8,142494476051562496,Buen día todos! Lo primero mandar un abrazo gr...,P+,1
6,9,142496796416016384,Desde el escaño. Todo listo para empezar #endi...,P+,1
7,10,142497735814287360,Bdías. EM no se ira de puente. Si vosotros os ...,P+,1
8,11,142499355360903168,Un sistema económico q recorta dinero para pre...,P+,1
9,12,142504935853006848,#programascambiados caca d ajuste,N+,0
10,13,142507006832553984,Buen viernes,P,1


In [8]:
len(tweets_corpus)

5066

In [9]:
#Definimos el vectorizer y creamos un pipeline de vectorizer ; classificador

from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', LinearSVC()),
])



In [10]:
#Definimos el vectorizer de nuevo, se define el clasificador y se entrena

model = LinearSVC(C=.2, loss='squared_hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2',
              tol=0.0001
)

#parametros
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 50,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus.content)
corpus_data_features_nd = corpus_data_features.toarray() #toarray to see the 2d-array of the sparse matrix that we just created.

In [17]:
corpus_data_features

<5066x144 sparse matrix of type '<class 'numpy.int64'>'
	with 16567 stored elements in Compressed Sparse Row format>

In [8]:
scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

0.7814785301237362

In [9]:
tweets = pd.read_csv('keikoTest.csv', encoding='utf-8')

In [10]:
#ajustamos el modelo at corpus de TASS
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
#Usamoslos tweets de keiko para hallar laoplaridad
tweets['polarity'] = pipeline.predict(tweets.content)

In [16]:
tweets[['content', 'polarity']].sample(10)
#1 positivo
#0   negativo

Unnamed: 0,content,polarity
100,#Hoy #Lambayeque\r\n#Precandidatos #PrensaRegi...,1
92,Hasta pronto #Ica!!! Una gran visita con un gr...,1
103,Hacer acusaciones a la ligera sobre personas i...,0
144,Quedan casi 4 años de gobierno. Tiempo suficie...,0
29,RT @RedesKeiko: Mariella Balbi en @Peru21notic...,0
106,Este gobierno creía que con la plata se compra...,1
189,"Presidente Kuczynski, usted dice que el indult...",1
134,Vengo siendo investigada 18 meses por el tema ...,0
153,Lo digo por enésima vez: ni Fuerza Popular ni ...,1
133,"Resulta que como ahora la Fiscalía ""necesita m...",0


In [13]:
pos_tweets = [ tweet for index, tweet in enumerate(tweets['content']) if tweets['polarity'][index] > 0]
neg_tweets = [ tweet for index, tweet in enumerate(tweets['content']) if tweets['polarity'][index] == 0]

In [14]:
print("Percentage of positive tweets: {}%".format(len(pos_tweets)*100/len(tweets['content'])))
print("Percentage of negative tweets: {}%".format(len(neg_tweets)*100/len(tweets['content'])))

Percentage of positive tweets: 65.5%
Percentage of negative tweets: 34.5%
