## Tweet classification

File names

In [4]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [5]:
import pandas as pd
import numpy as np

### Load datasets

In [6]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [7]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [8]:
from lxml import objectify

Import/read most recent corpus (2017):

In [9]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [10]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [11]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
3989,A base de conquistar derechos de los trabajado...,N
6704,Los programas contra la violencia de género de...,P
998,Vale ya casi casi ... veo que cada minuto somo...,P
2353,Me asombra el extraordinario poder que tiene l...,NEU
5065,Es de tal cinismo RT @juankicoino: @marivirome...,N
4173,"RT ""@RguezBurgos: El Ibex sigue paradito. Apen...",N
1045,"Rajoy a Amaiur: ""El tema es muy serio. La vida...",P
3761,También con la vicepresidenta y el secretario ...,NEU
6353,"Aquí, el protagonista de la cerveza. http://t....",NONE
495,@Currice se que no me vas a leer pero si pudie...,N


In [12]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [13]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove tweets that are only a link:

In [14]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

Import regex tools:

In [15]:
import re

Now, we remove links, usernames, newline characters, multiple spaces and emojis.

In [16]:
tweets_corpus_no_links = tweets_corpus

# Remove links
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [17]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [18]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
38,muchísimas gracias Luis Tú también eres un ej...,P
2888,La Junta Militar de Egipto levantará hoy la le...,P
308,ya nos contarás si explota Europa (como dice ...,N
596,", vaya, felicidades atrasadas y muchas gracias",P
4985,Para saber sobre como se las gasta la dictadur...,N
5902,Finalmente sólo ha habido un herido leve. Espe...,NEU
3807,Limpiar el cordón umbilical es una medida senc...,P
225,yo no tengo ni idea y soy la primera que dice...,N
5016,Una de fakes de esos que todos los medios nos ...,N
6897,Me gusta :) RT : Por qué decidí escribir un bl...,P


### Tokenization and stemming

Download Spanish stopwords in Spanish:

In [19]:
# Download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Get non-words, and extend array of non-words with characters `¿` and `¿`.

In [20]:
from string import punctuation
non_words = list(punctuation)

# Add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

Define stemmer and tokenizer, based on previous steps.

In [21]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [22]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
16,Aunque pensaba que no podría hacerlo hasta el ...,P
5400,La RUPTURA LABORAL LAPONIA es un sarcasmo en e...,N
3734,“: Qué botas más grandes y calentitas... O no?...,P
4072,"Listas de espera al alza en Catalunya, estalli...",N
5111,claro que si! Muero de ganas por ir ! Por lo p...,P
1763,Aquí esperando... Los viajes suelen tener retr...,N
40,Me dice un pajarito que ahora mismo no puede r...,N
6773,La cúpula de la Oficina Nacional de investigac...,N
5335,"Buenos días,Hoy foto con el Fórmula1 d Fernand...",P
836,Las Palmas nuevo y modesto líder de 1ª divisió...,P


### Model Evaluation

Import libraries:

In [23]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



We convert from strings to numerics the polarity values

In [24]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', MultinomialNB()),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm.

In [28]:
MultinomialNB().get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [23]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [29]:
params = {
    'cls__alpha': (0.001, 0.01, 0.1, 1)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5)
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...1a0d1acae8>, vocabulary=None)), ('cls', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'cls__alpha': (0.001, 0.01, 0.1, 1)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
gs.best_params_

{'cls__alpha': 1}

We obtain that the best parameters are:

{'cls__alpha': 1}

In [31]:
'''from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')'''

"from sklearn.externals import joblib\njoblib.dump(gs, 'grid_search.pkl')"

Import cross validation:

In [32]:
from sklearn.cross_validation import cross_val_predict

In [33]:
model = MultinomialNB(
    alpha=1
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [34]:
y=tweets_corpus_no_links.polarity_bin

In [35]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus_no_links)],\n    y=tweets_corpus_no_links.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [36]:
tweets_no_label = pd.read_csv(tweets_run_file, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
35,74acadab,@MiriamDakirFCB @Tocapilotes La Rata De PSG Se...
128,d6e65407,@jmdelalamo Lo q no comprendo es por q no sale...
132,5f5fc452,"@diariolagrada Pues nada, adeu y barca nova. Y..."
57,c2d98689,@putotrolaso @swivelFCB @LluisMascaro @sport E...
6,ebf7bff7,André Gomes no está dando el nivel pero los ab...
112,6f30beca,CON LA MIRA PUESTA EN MALAGA.\n\nSE VIENE PART...
4,c4852036,Vengo 2/2 con los clasificados a Cuartos de Ch...
76,dcc02374,@bet365_es No es muy difícil Barça va Madrid
67,5bc13938,@profenfurecido9 @Berlustinho Ése es el proble...
164,bf4b2c38,@ALEX15vs @FCBarcelona_es @FCBarcelona https:...


Now we do some cleansing of the data, erasing again the links, usernames, newline characters, multiple spaces and emojis.

In [37]:
# Remove links
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [38]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
46,8b973b98,Algunos partidos de Cristiano en eliminatorias...
53,31cb55fc,Cuenta la Leyenda que Todo comenzó hace 17 Año...
124,9cd8b232,LO QUE PASA ES QUE EL QUE HABLA PAJA SOY VOH ...
2,26fe7471,Dedicado para: 0F
78,d644c5df,En cuanto a Emery cada vez que tiene la oport...
157,2fb93730,Se le salió la barcelonitis al madridista oma...
167,a3798203,"Lo siento, pero 3-0. Lo otro son campitos men..."
59,b9a01810,#golazo sique tú sabes quien sería el capitán...
33,bb0ee4ad,No me preocupa tanto el planteamiento táctico...
142,4e3bdec7,Roban en la tienda oficial del en el Metropoli...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [39]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [41]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [42]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [43]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 151
Tweets whose language is not clear: 18


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
1,79cdded5,FELICIDADES ¡¡ FRANCISCO¡¡🎂🎂🎂🎂🎂🎂🎂🎂🎂🎂,zh,de,es
2,26fe7471,Dedicado para: 0F,pt,pt,es
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería 9...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵¿Qué momento recu...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de (tG - EL DIRECTO EN ...,gl,pt,es
72,c5343fa0,Y al Atleti que miura la va a tocar ahora en ...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,et,es
76,dcc02374,No es muy difícil Barça va Madrid,ca,ca,es


In [44]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 169


Define pipeline:

In [45]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', MultinomialNB(alpha=1
             )),
])

In [46]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [47]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
72,Y al Atleti que miura la va a tocar ahora en ...,1
160,Le dare merito al madrid el dia q vosotros re...,1
69,"Se llevaron todas las Champions, creo.",-1
155,"#LegitimosUsuarios de #Armas inspeccionados, y...",-1
75,Tu diciendo que no deja de pensar en el Barça...,0
5,Pero que te crees? Xk os sorprende lo de este...,-1
30,VRSALJKO🎙: “No me quejé a nadie. Soy un profes...,1
64,"“El fracaso, como la tristeza, es corrosivo cu...",1
11,MESSI tira del carro del Barça y cuando se va...,1
77,"Neymar se fue al PSG en busca de “títulos”, si...",1


Re-convert polarity to a string.

In [48]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
113,"No están trabajando en el juego, están cosien...",Negative
109,"Yo soy del Barça, hinchaba por Neymar",Positive
110,Soy 100% fans del Barça pero no caigo en eso....,Positive
80,Please RT!! #barcelona #fcbarcelona #Barca #fc...,Positive
67,Ése es el problema. No todos se han formado e...,Negative
31,Cuidado! Recodad que el Málaga por más pésimo...,Negative
18,Pero porque la ganase el Atm o porque la perd...,Neutral
166,Iniesta inicia su plan en la Ciutat Esportiva ...,Positive
43,Please RT!! #atleti #atletico #ATM El fisio ?a...,Positive
12,"Como hará Griezmann para jugar en Madrid, Barç...",Positive


Remove aux. columns:

In [49]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [50]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
72,c5343fa0,Y al Atleti que miura la va a tocar ahora en ...,Positive
12,66d69741,"Como hará Griezmann para jugar en Madrid, Barç...",Positive
52,94687a81,Es un torneito molero con premio mas o menos ...,Positive
11,636c8da5,MESSI tira del carro del Barça y cuando se va...,Positive
28,b2bde016,El Espanyol ha sacado más puntos contra el Ma...,Negative
175,96ef2a30,Y de k vale cumplir siemore salimos maltratad...,Negative
84,8d4d51db,Grandísimo. Y esto nunca lo comprenderá un tío...,Negative
23,5b13b74a,"Seguro, el Madrid tiene mejor plantilla y da ...",Neutral
160,7178e29d,Le dare merito al madrid el dia q vosotros re...,Positive
176,eb2cb9f4,Es que son muy ridículos. Perder contra el Ba...,Neutral


Rename column `polarity_bin` to `polarity`:

In [51]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

Export tweets as CSV:

In [52]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin_multinomial.csv', encoding='utf-8', index=False)