## Tweet classification

File names

In [1]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [4]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [5]:
from lxml import objectify

Import/read most recent corpus (2017):

In [6]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous one:

In [7]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [8]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
62,"Ah vale, me suelta la puta subnormal, te estoy...",N
1468,#Chacón defiende que las primarias del @PSOE s...,P
22,Viernes negro: Aumenta el paro en Noviembre y ...,N
500,"@PGimenezFuentes @CiudadanosCs paso, paso de t...",N
1475,#fb El ministro del Interior anuncia en @OndaC...,P
6165,Montoro cree q RTVE no tiene q competir con l...,N
1888,“@javiercasqueiro: Aquí está el roscón http://...,N
4761,;-)))) RT @soniachaconp: @mariviromero Gracias...,P
5853,Y si directores generales d la Administración ...,N
3632,@sanchez_sonia No te has ido de ahí?. Buenos días,NONE


In [9]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [10]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove tweets that are only a link:

In [11]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

Import regex tools:

In [12]:
import re

Now, we remove links, usernames, newline characters, multiple spaces and emojis.

In [13]:
tweets_corpus_no_links = tweets_corpus

# Remove links
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [14]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [15]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
6516,Deseo de mucho éxito al alcalde de Oropesa. Pa...,P
787,¡Qué pintaza! Tanto la chistorra como el lomo...,P
2227,Adiós al maestro de humanidad: vía,N
4584,"En definitiva, un paso más para la creación de...",P
5739,Las simplezas de,N
2639,Desde luego “: Pasado contrapasado no conlleva...,N
3140,Un buen botijo hace buen agua. Pero un buen bo...,NEU
684,La reunión de la Mesa del Congreso convocada p...,N
5657,Quiero ser un alcalde para #Andalucía. Un alca...,P
1732,Y perdonad que no me haya hecho fotos con los ...,P


### Tokenization and stemming

Download Spanish stopwords in Spanish:

In [16]:
# Download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jabatrox/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Get non-words, and extend array of non-words with characters `¿` and `¿`.

In [17]:
from string import punctuation
non_words = list(punctuation)

# Add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

Define stemmer and tokenizer, based on previous steps.

In [18]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

In [19]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
2005,Happy bday happy bday mom! Feliz cumple a la m...,P
1290,La mejor forma de garantizar la seguridad de n...,N
343,RT Papelón #ZP entre el chipriota y el irlandé...,N
689,"¿Y tú de que trabajas? Yo ayudo a ""perfilar le...",N
2377,""" el cambio que proponemos es el cambio tranqu...",P
923,que es un envidiosillo,N
5250,El secreto para que una dieta funcione es deci...,P
2022,Patxi López reclama acercamiento de presos. 20:30,N
6721,#Presupuestos Y ahora cuenta Montoro algunas d...,N
1835,#cabalgatasevilla llega a Pza.Magdalena.Espect...,P


### Model Evaluation

Import libraries:

In [25]:
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

We convert from strings to numerics the polarity values

In [26]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [28]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jabatrox/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('rfc', RandomForestClassifier()),
])

parameters = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'rfc__n_estimators':[50, 100, 200],
    'rfc__min_samples_split':[2, 3, 4, 5, 10]
}


grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 1.9), 'vect__min_df': (10, 20, 50), 'vect__max_features': (500, 1000), 'vect__ngram_range': ((1, 1), (1, 2)), 'rfc__n_estimators': [50, 100, 200], 'rfc__min_samples_split': [2, 3, 4, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm.

In [30]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [31]:
'''params = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__estimator__C': (0.2, 0.5, 0.7),
    'cls__estimator__loss': ('hinge', 'squared_hinge'),
    'cls__estimator__max_iter': (500, 1000)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)'''

"params = {\n    'vect__max_df': (0.5, 1.9),\n    'vect__min_df': (10, 20,50),\n    'vect__max_features': (500, 1000),\n    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams\n    'cls__estimator__C': (0.2, 0.5, 0.7),\n    'cls__estimator__loss': ('hinge', 'squared_hinge'),\n    'cls__estimator__max_iter': (500, 1000)\n      }\ngs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')\ngs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)"

In [32]:
'''gs.best_params_'''

'gs.best_params_'

We obtain that the best parameters are:

{'cls__estimator__C': 0.2,

 'cls__estimator__loss': 'hinge',
 
 'cls__estimator__max_iter': 1000,
 
 'vect__max_df': 1.9,
 
 'vect__max_features': 1000,
 
 'vect__min_df': 10,
 
 'vect__ngram_range': (1, 1)}

In [33]:
'''from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')'''

"from sklearn.externals import joblib\njoblib.dump(gs, 'grid_search.pkl')"

Import cross validation:

In [34]:
from sklearn.cross_validation import cross_val_predict

In [36]:
'''
GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 1.9), 'vect__min_df': (10, 20, 50), 'vect__max_features': (500, 1000), 'vect__ngram_range': ((1, 1), (1, 2)), 'rfc__n_estimators': [50, 100, 200], 'rfc__min_samples_split': [2, 3, 4, 5, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
'''


model = LinearSVC(
    C=.2, 
    loss='hinge', 
    max_iter=1000, 
    random_state=None, 
    penalty='l2'
)

# Define vectorizer with the previously created tokenizer and stopwords array
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [37]:
y=tweets_corpus_no_links.polarity_bin

In [38]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus_no_links)],\n    y=tweets_corpus_no_links.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [39]:
tweets_no_label = pd.read_csv(tweets_run_file, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
156,42d8ce05,"@maldiniplus Como dijo Draxler, el planteamien..."
58,f067a08b,@moscu_toronto Pues cuando era pequeñito era d...
137,65da3719,@LlunaCatalana3 tv3 te els drets de la champio...
168,e71a115b,@EASPORTSEsp Griezmann no será por el partido ...
71,2ddfa69e,@AlejandroNY75 @2010MisterChip Y peor. Elogia ...
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...
6,ebf7bff7,André Gomes no está dando el nivel pero los ab...
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵\n\n¿Qué momento ...
45,41de1a3a,La #EuropeaLeague el #Atlético se la tiene que...
77,6eb6eb37,"Neymar se fue al PSG en busca de “títulos”, si..."


Now we do some cleansing of the data, erasing again the links, usernames, newline characters, multiple spaces and emojis.

In [40]:
# Remove links
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('https?:\/\/t\.co\/[\w]{8,8}'), '', x))

# Remove usernames
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('@[A-Za-z0-9_]+'), '', x))

# Remove newline character
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\n\r]+'), '', x))

# Replace multiple spaces with single one
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(re.compile('[\s]+'), ' ', x))

# Remove emojis
emoji_pattern = re.compile(u'['
     u'\U0001F300-\U0001F64F'
     u'\U0001F680-\U0001F6FF'
     u'\u2600-\u26FF\u2700-\u27BF]+', 
     re.UNICODE)
#tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(emoji_pattern, ' ', x))

In [41]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
91,5b4878db,"¿Aquellos casi atracos al Barça?, no se que d..."
73,cee9b0a5,Que solo soy feliz con el atleti de madrid!!!
97,be694b7a,"Le tienen tanto odio, que hasta le critican po..."
105,884da2b9,Resulta que el partido lo perdió el PSG por l...
68,b492b317,La 'rajada' de un ex objetivo del Barça sobre ...
47,484c36cf,La victoria de ayer a la prensa tampoco le val...
10,a122a538,Un jugador brasileño de 21 años muy bueno q j...
158,97e7b943,Eso decidselo a Marca que cada día dan más ve...
1,79cdded5,FELICIDADES ¡¡ FRANCISCO¡¡🎂🎂🎂🎂🎂🎂🎂🎂🎂🎂
89,2687f611,El atleti no juega contra el PSG x q le elimi...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [47]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [48]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [49]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.
- If none of the languages detected is Spanish, remove.

In [50]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 150
Tweets whose language is not clear: 19


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
1,79cdded5,FELICIDADES ¡¡ FRANCISCO¡¡🎂🎂🎂🎂🎂🎂🎂🎂🎂🎂,zh,de,es
2,26fe7471,Dedicado para: 0F,pt,pt,es
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",ca,ca,es
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería 9...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵¿Qué momento recu...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de (tG - EL DIRECTO EN ...,gl,pt,es
72,c5343fa0,Y al Atleti que miura la va a tocar ahora en ...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,pt,es


In [51]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 169


Define pipeline:

In [52]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', OneVsRestClassifier(LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             ))),
])

In [53]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [54]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
12,"Como hará Griezmann para jugar en Madrid, Barç...",1
3,El Barca aparte de ganar eso ha dominado el f...,1
2,Dedicado para: 0F,1
23,"Seguro, el Madrid tiene mejor plantilla y da ...",-1
76,No es muy difícil Barça va Madrid,-1
106,Más miedo al Barça que al PSG? Has mencionado ...,1
137,"tv3 te els drets de la champions, però, no de...",1
4,Vengo 2/2 con los clasificados a Cuartos de Ch...,1
82,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...,1
120,Creo q ustedes son del Madrid y quieren sacar...,-1


Re-convert polarity to a string.

In [55]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
159,Los parisinos no lo verán este año la famosa C...,Negative
48,Tienes razón.👍,Negative
87,No soy del Atleti pero me da pena lo que han e...,Negative
128,Lo q no comprendo es por q no salen comparand...,Negative
91,"¿Aquellos casi atracos al Barça?, no se que d...",Negative
139,"Arthur prácticamente hecho, todo está firmado ...",Negative
53,Cuenta la Leyenda que Todo comenzó hace 17 Año...,Positive
111,Este es nivel que tiene la gente. El Madrid nu...,Negative
25,Pobre Catalán!! Eres uno de esos del lazo Ama...,Negative
104,Hoy es el cumpleaños de Francisco Hervás Tirad...,Positive


Remove aux. columns:

In [56]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [57]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
60,5d65492b,#BolitaPorfavorINFORMA: 📻💻⚽ Futbol Internacion...,Positive
95,737f222e,"Jajajajaj puto Mario, pero Simón, la última v...",Negative
92,8675ca7f,"Que noooo, así nos saca unas risas 😅😅😅",Positive
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",Positive
85,781bcd66,"Hace 17 años, #Messi inició su camino con el 4...",Positive
72,c5343fa0,Y al Atleti que miura la va a tocar ahora en ...,Positive
109,6d1bd293,"Yo soy del Barça, hinchaba por Neymar",Positive
64,e859476c,"“El fracaso, como la tristeza, es corrosivo cu...",Positive
84,8d4d51db,Grandísimo. Y esto nunca lo comprenderá un tío...,Negative
110,1e08c5da,Soy 100% fans del Barça pero no caigo en eso....,Positive


Rename column `polarity_bin` to `polarity`:

In [58]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

Export tweets as CSV:

In [59]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)