### Importamos Librerías

In [1]:
import pandas as pd
import numpy as np
from lxml import objectify
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from string import punctuation
from nltk.stem import SnowballStemmer

#Importamos el modelo
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.cross_validation import cross_val_score

import langid
from langdetect import detect
import textblob




In [212]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

### Load datasets

In [213]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Train and Test data

In [214]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


###  POS Tagging

In [215]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

In [216]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

In [217]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
6171,El PP y el Foro utilizan Asturias para un PUL...,N
6402,Resultados en Sarre: CDU de Merkel fuerza más ...,N
1562,Yo te deseo lo mejor de lo mejor porque tb te ...,P
217,@LuisFBonilla @isabelsevilla9 @adirecto @peich...,P
5242,Asi andamos. Este es el ambiente. http://t.co/...,NONE
4895,"Hoy estoy en Córdoba, en el jurado del premio ...",P
2615,"No tenemos nada que ocultar, si apoyan una com...",P
2975,El Madrid superior al Barça en su campo. Es la...,P
6616,CCOO y UGT dicen que en obras de edificación l...,NEU
3316,Todos siguiendo a @josederico y @eltzunamy07 ...,P


In [218]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


In [219]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

In [220]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

In [221]:
url_regex = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_corpus_no_links = tweets_corpus
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(url_regex, '', x))

In [222]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [223]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
945,Repasando discurso #investidura Rajoy: el aume...,N
1811,RT @PoliticaMadrid: Escobar recomienda a IU An...,P
5596,Buenos días desde Mallorca!vuelta a casa!,P
7092,"Valenciano: ""El PSOE tendrá la plantilla que p...",N
4121,Cada día me cae mejor @crispedroche ... Buena ...,P
1926,#Chacón estaba en el gobierno Zapatero? Parece...,N
4386,"Veracruz! Q lindo amanecer.... Q bella mañana,...",P
1610,Esta noche el gazpacho Alvalle simplemente no ...,N
3021,Ayer @PepeGrinan dijo en sede parlamentaria qu...,N
2908,"La Policía Nacional, en colaboración con el FB...",P


### Con prepocessing

In [2]:
run ./preprocessing.ipynb

Total tweets to evaluate: 177
Evaluated tweets so far: 411


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  "File names:"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ]


Total corpus tweets: 8978
Total corpus tweets after cleaning: 7356


### Tokenization and Stemming

In [3]:
# Se descargan las palabras de parada en español
nltk.download("stopwords")
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/javier.smacias/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Obtenemos los signos de puntuación que se utilizan en español
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

In [5]:
# Se definen las funciones para realizar la tokenización y el stemming
stemmer = SnowballStemmer('spanish')
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Eliminamos lo que no sean palabras
    text = ''.join([c for c in text if c not in non_words])
    # Tokenización
    tokens = tknzr.tokenize(text)

    # Stemming
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

### Model

In [6]:
# Buscamos los parámetros que podemos utilizar para entrenar el modelo
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P'])] = 1
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['N'])] = -1
tweets_corpus.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


 1    0.500816
-1    0.358347
 0    0.140837
Name: polarity_bin, dtype: float64

In [8]:
#from sklearn.preprocessing import label_binarize
#tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])
#tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])

tweets_corpus.sample(10)

Unnamed: 0,content,polarity,polarity_bin
251,🇺,P,1
896,Dice que empresas y familias están en superávi...,N,-1
240,🐾,P,1
5117,"Café. Mucho café. RT : Bns días! Como haces, p...",P,1
2035,"Ah! y hoy pongo una velita, que hay luna llena...",P,1
289,"Me apiado de ti, yo me entero de que existen ...",N,-1
1821,Se ha montado la de Dios! A ver: habrá que sor...,N,-1
1024,Todavía me estoy riendo con lo de la APP de ic...,P,1
139,Viaje a las embajadas catalanas del derroche -...,N,-1
2197,"Para mi, escuchar a Ramoneda me sirve de refer...",P,1


In [9]:
pipeline = Pipeline([
    ('vect',  CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)),
    ('cls', RandomForestClassifier())])
parameters = {
    'cls__criterion': ('gini','entropy'),
    'cls__n_estimators': (1, 10, 100, 1000),
    'cls__class_weight':['balanced', None]    
}

In [13]:
#gs = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='roc_auc')
gs = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1_micro')
gs.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cls__criterion': ('gini', 'entropy'), 'cls__n_estimators': (1, 10, 100, 1000), 'cls__class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_micro', verbose=0)

In [14]:
# Mostramos los mejores parámetros del Random Forest obtenidos de la búsqueda con GridSearchCV 
gs.best_params_

{'cls__class_weight': None, 'cls__criterion': 'gini', 'cls__n_estimators': 100}

### Rendimiento

In [16]:
# Mediante validación cruzada obtenemos el rendimiento del modelo
model = RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', n_estimators = 100)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 0,
    max_df = 4700,
    max_features=1000
)

tweets_corpus_nl_features = vectorizer.fit_transform(tweets_corpus.content)
tweets_corpus_nl_features_nd = tweets_corpus_nl_features.toarray()

scores = cross_val_score(
    model,
    tweets_corpus_nl_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    cv=None
    )

scores.mean()

0.62330338202147273

### Polaridad

In [15]:
tweets_no_label = pd.read_csv(test_tweets_raw, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
157,2fb93730,@LuisOmarTapia Se le salió la barcelonitis al ...
51,13581e4b,@AngelVikingo @dircomPedro Y tu que te llamas ...
133,26c47161,Félix Brych ayer en el partido de champions #P...
159,1491c900,Los parisinos no lo verán este año la famosa C...
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería h...
172,e68db2a9,@Vikingoblanco93 @Ajasakam @CFuenlabradaSAD @r...
165,dec03842,"@neymarjr El Barça no te necesita, mejor ya ve..."
66,23303f58,📷 [GALERIA] El @FCBfutbolsala recupera efectiu...
29,1aee328a,Me sorprende la absurdez del fanatismo que hay...
92,8675ca7f,@ALEX15vs @FCBarcelona_es @FCBarcelona Que noo...


In [16]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [17]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [18]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [19]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 153
Tweets whose language is not clear: 15


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",ca,ca,es
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería h...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵\n\n¿Qué momento ...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de @YouTube (https://t....,gl,pt,es
72,c5343fa0,@AS_Manolete Y al Atleti que miura la va a toc...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,en,es
76,dcc02374,@bet365_es No es muy difícil Barça va Madrid,ca,ca,es
96,8f9d73cf,@bet365_es Cualquiera que no estuvieran ni Bar...,ca,ca,es


In [20]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 168


Define pipeline:

In [21]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 0,
            max_df = 26363,
            max_features=1000
            )),
    ('cls', RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', n_estimators = 100))
])

In [22]:
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [23]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
50,@pacogcaridad Veremos si es tan superior cuánd...,-1
4,Vengo 2/2 con los clasificados a Cuartos de Ch...,1
134,@FOXSoccer @TeamMessi @FCBarcelona Fox our bar...,0
172,@Vikingoblanco93 @Ajasakam @CFuenlabradaSAD @r...,1
106,Más miedo al Barça que al PSG? Has mencionado ...,-1
150,📌Informa @Anais_mh88: @andresiniesta8 ha fet c...,-1
37,@SeleccionArg \n#JavierMascherano sobre #Messi...,1
90,@realmadridnote @sonmcr Llegará el Barça no te...,1
20,"@Magic_F1 @MrAlexF1 No jodas, tienen a toda un...",1
175,@saguilarcom Y de k vale cumplir siemore salim...,-1


In [24]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,text,polarity_bin
124,@LuisOmarTapia LO QUE PASA ES QUE EL QUE HABLA...,Negative
110,@2010MisterChip Soy 100% fans del Barça pero n...,Negative
59,@ElGolazoDeGol #golazo sique tú sabes quien se...,Negative
17,"Para el que quiera ver el Barça hoy, es a las ...",Positive
108,@juliup7 Really ? El Espanyol nos ganó. 🙄🙄\nEs...,Positive
92,@ALEX15vs @FCBarcelona_es @FCBarcelona Que noo...,Positive
75,@jotajordi13 @tonintorero Tu diciendo que no d...,Negative
76,@bet365_es No es muy difícil Barça va Madrid,Positive
157,@LuisOmarTapia Se le salió la barcelonitis al ...,Positive
125,"Sport, el Barça descartó en el pasado a Lucas ...",Negative


In [25]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [26]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
26,fecab36b,"@JJ_NG81 @blasradio Ya, y estoy de acuerdo. Pe...",Negative
166,fce60e59,Iniesta inicia su plan en la Ciutat Esportiva ...,Positive
120,a195dce5,@SantiagoSty Creo q ustedes son del Madrid y ...,Negative
16,c9029b30,@pol_balletbo @DavidLeonRon @l_Alcazar Sacar 3...,Negative
129,7fa82da8,@Blogcelonista Entonces Arthur ya esta práctic...,Negative
133,26c47161,Félix Brych ayer en el partido de champions #P...,Positive
117,c142d964,¡¡Le robaron al @Atleti!!\n\nSe metieron a una...,Negative
104,0dc07c46,Hoy es el cumpleaños de Francisco Hervás Tirad...,Positive
82,8a67463b,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...,Positive
91,5b4878db,@Cristina_VeMu @ToniCAT1963 @LluisMascaro @spo...,Negative


In [27]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [28]:
tweets.sample(10)

Unnamed: 0,id,text,polarity
3,d7d87d07,@Eribert42354852 El Barca aparte de ganar eso ...,Positive
137,65da3719,@LlunaCatalana3 tv3 te els drets de la champio...,Neutral
144,16466a71,@LucasScagliola Dale un rato más y será más q ...,Negative
106,020ee260,Más miedo al Barça que al PSG? Has mencionado ...,Negative
41,62f967e5,"Hace 17 años, Messi por primera vez se puso la...",Positive
31,89edc0c0,@lniestismoFCB Cuidado! Recodad que el Málaga ...,Positive
162,6213b397,@cholomirey @darioleiva1975 lo terrible es que...,Negative
101,21813244,"M. Bartra debió quedarse en el Barcelona, una ...",Negative
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",Positive
85,781bcd66,"Hace 17 años, #Messi inició su camino con el @...",Positive


In [29]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)