## Tweet classification

File names

In [1]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [4]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [5]:
from lxml import objectify

Import/read most recent corpus (2017):

In [6]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [7]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [8]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
132,@Lahierritos pues sabran més q jo \n\n*curso ...,N
6179,Niave. http://t.co/TQzeWm6d,NONE
3381,La tragedia del campo de fútbol egipcio es una...,N
6057,"Hace 200 años, se firmaba en Cádiz la primera ...",NEU
1159,"Juan Moscoso Proyecto, credibilidad y partido...",P
3511,no me parece muy acertado que Chacón haga camp...,N
278,"@KatyKlav se siente, ya está dicho",NEU
6656,"Toxo a Rajoy: ""No se puede arruinar tres décad...",N
834,"Repito, de momento NO HAY FECHA LÍMITE y las p...",NONE
1073,"vergonzoso, littleboy?“@Suanzes: @PilarGGranja...",N


In [9]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [10]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove tweets that are only a link:

In [11]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

Import regex tools:

In [12]:
import re

Now, we remove links, usernames, newline characters, multiple spaces and emojis.

In [13]:
tweets_corpus_no_links = tweets_corpus

# Remove links
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [14]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [15]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
4981,#29f La calle nos está esperando para decir qu...,N
1894,Eso digo yo!!!! “: estoy de camino a terratpac...,N
2886,"Me voy a dormir, escuchando cantar al gallo qu...",N
6950,450 toneladas de tapones para salvar la vida d...,P
1994,I favorited a video Los recortes sociales del ...,N
3047,"Mira que foto mas chula , hoy en ;-)))))",P
762,Ya no existe razón para que la Generalitat ale...,NEU
1611,FELIZ AÑO DEL CAMBIO ANDALUZ!ESPERO QUE VENGA ...,P
7002,Sobre lo que pudo llegar a ser #Bagdad y nunca...,P
422,Me encanta estar así de feliz...ojalá y esto t...,P


### Tokenization and stemming

Download Spanish stopwords in Spanish:

In [16]:
# Download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /home/angel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Get non-words, and extend array of non-words with characters `¿` and `¿`.

In [17]:
from string import punctuation
non_words = list(punctuation)

# Add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

Define stemmer and tokenizer, based on previous steps.

In [18]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [19]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
1592,"Con #rajoynazo1 menos consumo, producción y ac...",N
5875,Buenas tardes twiteros!,P
4771,Las fuerzas de Al Assad ejecutan a 18 personas...,N
392,Somos consciente de la importancia cultural y ...,P
7015,Esas tardes en las que todo de confabula para ...,P
4568,RT : Si tuvieras q buscar un paraiso terrenal ...,P
4787,"“: te mando un saludo de cántabro a cántabro, ...",P
7181,PSOE pedirá impuesto a las grandes fortunas en...,P
6509,Las ofertas de última hora de son la bomba. De...,P
3123,"Demasiados Consejos Europeos ""históricos"".Este...",P


### Model Evaluation

Import libraries:

In [20]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



We convert from strings to numerics the polarity values

In [21]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', OneVsRestClassifier(LinearSVC())),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm.

In [23]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [24]:
'''params = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__estimator__C': (0.2, 0.5, 0.7),
    'cls__estimator__loss': ('hinge', 'squared_hinge'),
    'cls__estimator__max_iter': (500, 1000)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)'''

"params = {\n    'vect__max_df': (0.5, 1.9),\n    'vect__min_df': (10, 20,50),\n    'vect__max_features': (500, 1000),\n    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams\n    'cls__estimator__C': (0.2, 0.5, 0.7),\n    'cls__estimator__loss': ('hinge', 'squared_hinge'),\n    'cls__estimator__max_iter': (500, 1000)\n      }\ngs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')\ngs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)"

In [25]:
'''gs.best_params_'''

'gs.best_params_'

We obtain that the best parameters are:

{'cls__estimator__C': 0.2,

 'cls__estimator__loss': 'hinge',
 
 'cls__estimator__max_iter': 1000,
 
 'vect__max_df': 1.9,
 
 'vect__max_features': 1000,
 
 'vect__min_df': 10,
 
 'vect__ngram_range': (1, 1)}

In [26]:
'''from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')'''

"from sklearn.externals import joblib\njoblib.dump(gs, 'grid_search.pkl')"

Import cross validation:

In [27]:
from sklearn.cross_validation import cross_val_predict

In [28]:
model = LinearSVC(
    C=.2, 
    loss='hinge', 
    max_iter=1000, 
    random_state=None, 
    penalty='l2'
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [29]:
y=tweets_corpus_no_links.polarity_bin

In [30]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus_no_links)],\n    y=tweets_corpus_no_links.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [31]:
tweets_no_label = pd.read_csv(tweets_run_file, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
119,56479676,¿Equipo de fútbol favorito? — No me gusta el f...
64,e859476c,"“El fracaso, como la tristeza, es corrosivo cu..."
121,dc3d7492,"Se le viene Real Madrid, Barça o Bayern Munich..."
106,020ee260,Más miedo al Barça que al PSG? Has mencionado ...
46,8b973b98,Algunos partidos de Cristiano en eliminatorias...
112,6f30beca,CON LA MIRA PUESTA EN MALAGA.\n\nSE VIENE PART...
173,2ccfd252,@ATLASNU8E Tendrías que haberte hecho del Barç...
115,ea75493e,@diablillocule14 @InvictosSomos ????? A ver su...
113,e1400a58,"@RetrAshado No están trabajando en el juego, e..."
68,b492b317,La 'rajada' de un ex objetivo del Barça sobre ...


Now we do some cleansing of the data, erasing again the links, usernames, newline characters, multiple spaces and emojis.

In [32]:
# Remove links
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [33]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
89,2687f611,El atleti no juega contra el PSG x q le elimi...
116,5a311adf,"Claro, no es jugador del Barça. Lo que hizo e..."
115,ea75493e,"????? A ver subnormal, creo que no se te da e..."
4,c4852036,Vengo 2/2 con los clasificados a Cuartos de Ch...
48,51df160e,Tienes razón.
162,6213b397,"lo terrible es que la afición del lo tolere, ..."
60,5d65492b,#BolitaPorfavorINFORMA: Futbol Internacional...
109,6d1bd293,"Yo soy del Barça, hinchaba por Neymar"
138,47d440ff,"Que morro tienen, si en los últimos años no f..."
51,13581e4b,Y tu que te llamas angelvikingo siendo del Ba...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [34]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [35]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [36]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [37]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 150
Tweets whose language is not clear: 19


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
1,79cdded5,FELICIDADES ¡¡ FRANCISCO¡¡,hy,de,es
2,26fe7471,Dedicado para: 0F,pt,pt,es
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",ca,ca,es
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería 9...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! ¿Qué momento recue...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de (tG - EL DIRECTO EN ...,gl,pt,es
72,c5343fa0,Y al Atleti que miura la va a tocar ahora en ...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,pt,es


In [39]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 169


Define pipeline:

In [40]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', OneVsRestClassifier(LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             ))),
])

In [41]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [42]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
166,Iniesta inicia su plan en la Ciutat Esportiva ...,1
13,Recién me compro el conjunto del Barça y ya sa...,1
143,"No vale, no saben lo feliz que estuve cuando ...",1
126,Pavor tengo que ahora los del #PSG miren hacia...,1
168,Griezmann no será por el partido ante el Barç...,1
176,Es que son muy ridículos. Perder contra el Ba...,-1
134,Fox our barca cock suckers,1
131,"Vaya, no nos lo esperábamos...; Es curioso com...",1
10,Un jugador brasileño de 21 años muy bueno q j...,1
43,Please RT!! #atleti #atletico #ATM El fisio ?a...,1


Re-convert polarity to a string.

In [43]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
121,"Se le viene Real Madrid, Barça o Bayern Munich...",Positive
125,"Sport, el Barça descartó en el pasado a Lucas ...",Positive
135,En dias como hoy te das cuenta porque el R. M...,Negative
29,Me sorprende la absurdez del fanatismo que hay...,Positive
85,"Hace 17 años, #Messi inició su camino con el 4...",Positive
114,Vaya cabezazo del niño. Me recordó al de la f...,Negative
48,Tienes razón.,Negative
136,nada más para aclarar este año al lo hecho el...,Positive
62,entonces por qué lo llamas realmadridización?...,Positive
95,"Jajajajaj puto Mario, pero Simón, la última v...",Negative


Remove aux. columns:

In [44]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [45]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
57,c2d98689,El que tiró la liga en septiembre el año pasa...,Positive
109,6d1bd293,"Yo soy del Barça, hinchaba por Neymar",Positive
48,51df160e,Tienes razón.,Negative
62,c3606126,entonces por qué lo llamas realmadridización?...,Positive
154,8ccef7aa,"Comparar a con con la frase ""el dinero no da g...",Negative
54,4e803396,No tranquilo que el chollo del atleti este añ...,Positive
107,ddc5f491,El problema del PSG es su propio presidente q...,Negative
37,67ae6b97,"#JavierMascherano sobre #Messi en :""Es el jug...",Positive
173,2ccfd252,Tendrías que haberte hecho del Barça directam...,Positive
141,ee59a530,"El partidazo que se mandó contra el Barça, y ...",Positive


Rename column `polarity_bin` to `polarity`:

In [46]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

Export tweets as CSV:

In [47]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)