### Import Libraries
We import some libraries that we are going to use later

In [4]:
import pandas as pd
import numpy as np
from lxml import objectify
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from string import punctuation
from nltk.stem import SnowballStemmer

#Importamos el modelo
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier

from sklearn.cross_validation import cross_val_score

import langid
from langdetect import detect
import textblob




### Run preprocessing
Now we run the previous notebook. Another feasible option would be importing just the csv result.

In [2]:
run ./1.Preprocessing.ipynb

Total tweets to evaluate: 177
Evaluated tweets so far: 411


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  },


Total corpus tweets: 7356
Total corpus tweets after cleaning: 7356


### Tokenization and Stemming

In [5]:
# Se descargan las palabras de parada en español
nltk.download("stopwords")
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Obtenemos los signos de puntuación que se utilizan en español
non_words = list(punctuation)
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

In [7]:
# Se definen las funciones para realizar la tokenización y el stemming
stemmer = SnowballStemmer('spanish')
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Eliminamos lo que no sean palabras
    text = ''.join([c for c in text if c not in non_words])
    # Tokenización
    tokens = tknzr.tokenize(text)

    # Stemming
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

We change the string values for integers and check the distribution of values amongst them

In [9]:
tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P'])] = 1
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['N'])] = -1
tweets_corpus.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.500816
-1    0.366911
 0    0.132273
Name: polarity_bin, dtype: float64

### Model: Random Forest

First we look for the parameters which we can use for model training

In [12]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [13]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity,polarity_bin
2068,"¿Y a quien responda ""no"" a esa amable petició...",P,1
5853,Y si directores generales d la Administración ...,N,-1
3477,Guindos acaba de limitar los sueldos en cajas ...,NEU,0
2417,A Don Manuel Fraga Iribarne : gracias eternas,P,1
6186,“: Semana del cine español en #Polonia Esta ta...,P,1
4761,;-)))) RT : Gracias!! Menudo #Piropazo :)) #ca...,P,1
6295,Un municipio de Guerrero confirma dos muertos ...,N,-1
361,🇹,P,1
2074,Tenemos que combinar reflexión con ACCION . Te...,P,1
1171,Estepona lleva a los juzgados irregularidades ...,N,-1


Now we make a Pipeline out of Vectorizer and RF, which is going to tell us the best parameters via Grid Search. WARNING: This part takes a lot time in execution

In [14]:
pipeline = Pipeline([
    ('vect',  CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)),
    ('cls', RandomForestClassifier())])
parameters = {
    'cls__criterion': ('gini','entropy'),
    'cls__n_estimators': (1, 10, 100, 1000),
    'cls__class_weight':['balanced', None]    
}

In [13]:
gs = GridSearchCV(pipeline, parameters, n_jobs=-1, scoring='f1_micro')
gs.fit(tweets_corpus.content, tweets_corpus.polarity_bin)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'cls__criterion': ('gini', 'entropy'), 'cls__n_estimators': (1, 10, 100, 1000), 'cls__class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_micro', verbose=0)

We check the best values obtained via grid search

In [14]:
gs.best_params_

{'cls__class_weight': None, 'cls__criterion': 'gini', 'cls__n_estimators': 100}

Now we check the performance of the model with cross validation procedure

In [31]:
# Mediante validación cruzada obtenemos el rendimiento del modelo
model = RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', n_estimators = 100)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 0,
    max_df = 4700,
    max_features=1000
)

tweets_corpus_nl_features = vectorizer.fit_transform(tweets_corpus.content)
tweets_corpus_nl_features_nd = tweets_corpus_nl_features.toarray()

scores = cross_val_score(
    model,
    tweets_corpus_nl_features_nd[0:len(tweets_corpus)],
    y=tweets_corpus.polarity_bin,
    cv=None
    )

scores.mean()

0.55626068762316083

Not so much, but gives us our best result in Kaggle

### Polarity Prediction
Now that we have trained properly the predictor, it's time to predict the values from the test set

In [16]:
tweets_no_label = pd.read_csv(test_tweets_raw, encoding='utf-8')
print('Number of tweets: %d' % tweets_no_label.shape[0])
tweets_no_label.sample(10)

Number of tweets: 177


Unnamed: 0,id,text
156,42d8ce05,"@maldiniplus Como dijo Draxler, el planteamien..."
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...
144,16466a71,@LucasScagliola Dale un rato más y será más q ...
80,b54f1ea0,Please RT!! #barcelona #fcbarcelona #Barca #fc...
117,c142d964,¡¡Le robaron al @Atleti!!\n\nSe metieron a una...
123,9e1cd421,Con poco suerte tendremos tambièn previa de la...
99,49d77cad,"@Alexmarko88 @ArielRizo11 Claro, lo que le pas..."
41,62f967e5,"Hace 17 años, Messi por primera vez se puso la..."
78,d644c5df,@Digimiono @dircomPedro En cuanto a Emery cada...
103,142c2785,@DiegoACarranza7 @laligaennumeros Ojalá hubies...


At first instance we tried to consider only tweets in Spanish. But as long as the tweet number must be the same as given in Kaggle, we process them.

In [18]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

Create 3 new columns specifying the detected language of the tweet.

In [19]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

Save as CSV.

In [20]:
tweets_no_label.to_csv('tweets_parsed.csv', encoding='utf-8')

We select the tweets in Spanish as follows:
- If the language detected is Spanish by at least 2 libraries, leave.
- If the language detected is Spanish in at least 1 library, print and append to the dataset manually.

In [21]:
# Leave tweets whose detected language is Spanish (majority):
spanish_query = ''' (lang_langdetect == 'es' and lang_langid == 'es') or (lang_langdetect == 'es' and lang_textblob == 'es') or (lang_textblob == 'es' and lang_langid == 'es') '''
tweets_spanish = tweets_no_label.query(spanish_query)

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

# Print tweets in doubtful language:
nonspanish_query = ''' ((lang_langdetect != 'es' and lang_langid != 'es') or (lang_langdetect != 'es' and lang_textblob != 'es') or (lang_textblob != 'es' and lang_langid != 'es')) and (lang_textblob == 'es' or lang_langid == 'es' or lang_langdetect == 'es') '''
tweets_doubtful = tweets_no_label.query(nonspanish_query)

print('Tweets whose language is not clear: %d' % tweets_doubtful.shape[0])

tweets_doubtful

Tweets in Spanish: 152
Tweets whose language is not clear: 16


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
8,cd0d8bcb,DRAXLER EXPLOTA vs el PSG | BALOTELLI ‘manda C...,es,ca,ca
17,97af720a,"Para el que quiera ver el Barça hoy, es a las ...",ca,ca,es
40,09c0f4cc,El Barça Lassa deja casi vacía la enfermería h...,an,ca,es
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵\n\n¿Qué momento ...,pt,pt,es
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,tr,en,es
56,5df2d140,Me ha gustado un vídeo de @YouTube (https://t....,gl,pt,es
72,c5343fa0,@AS_Manolete Y al Atleti que miura la va a toc...,ca,ca,es
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,tr,en,es
76,dcc02374,@bet365_es No es muy difícil Barça va Madrid,ca,ca,es
96,8f9d73cf,@bet365_es Cualquiera que no estuvieran ni Bar...,ca,ca,es


In [22]:
# Append rest of the tweets in Spanish manually
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '79cdded5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '26fe7471' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'cd0d8bcb' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '97af720a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '09c0f4cc' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5a533794' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9046f222' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '5df2d140' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c5343fa0' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '12d82762' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'dcc02374' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '8f9d73cf' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '6f30beca' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '9cd8b232' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3c78bdb5' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '3beadb3a' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'c8cda282' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == 'fce60e59' ''')])
tweets_spanish = pd.concat([tweets_spanish, tweets_doubtful.query(''' id == '7bd204cc' ''')])

print('Tweets in Spanish: %d' % tweets_spanish.shape[0])

Tweets in Spanish: 167


Define pipeline:

In [23]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            max_features=1000
            )),
    ('cls', RandomForestClassifier(class_weight = 'balanced', criterion = 'gini', n_estimators = 100))
])

In [24]:
pipeline.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [25]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
61,@Footy_Jokes anyone in a match against barca,0
164,@ALEX15vs @FCBarcelona_es @FCBarcelona https:...,0
53,Cuenta la Leyenda que Todo comenzó hace 17 Año...,1
75,@jotajordi13 @tonintorero Tu diciendo que no d...,-1
27,McGuane podría debutar con el Barça y converti...,1
126,Pavor tengo que ahora los del #PSG miren hacia...,-1
99,"@Alexmarko88 @ArielRizo11 Claro, lo que le pas...",-1
155,"#LegitimosUsuarios de #Armas inspeccionados, y...",-1
103,@DiegoACarranza7 @laligaennumeros Ojalá hubies...,-1
173,@ATLASNU8E Tendrías que haberte hecho del Barç...,1


We need to turn back polarity values into string for the data in Kaggle to be correct

In [26]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
141,@SC_ESPN El partidazo que se mandó contra el B...,Negative
37,@SeleccionArg \n#JavierMascherano sobre #Messi...,Positive
55,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,Positive
78,@Digimiono @dircomPedro En cuanto a Emery cada...,Negative
117,¡¡Le robaron al @Atleti!!\n\nSe metieron a una...,Negative
98,Lucas Vázquez iguala a #Messi #como los máximo...,Negative
60,#BolitaPorfavorINFORMA: 📻💻⚽ \nFutbol Internaci...,Positive
97,"Le tienen tanto odio, que hasta le critican po...",Positive
107,@mundodeportivo El problema del PSG es su prop...,Positive
114,@duodezzimo @madridisme Vaya cabezazo del niño...,Negative


We remove integer column and rename string column

In [27]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [28]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
97,be694b7a,"Le tienen tanto odio, que hasta le critican po...",Positive
116,5a311adf,"@elpezdeciudad Claro, no es jugador del Barça....",Positive
35,74acadab,@MiriamDakirFCB @Tocapilotes La Rata De PSG Se...,Negative
72,c5343fa0,@AS_Manolete Y al Atleti que miura la va a toc...,Positive
143,a9ad7a20,"@sport No vale, no saben lo feliz que estuve c...",Positive
103,142c2785,@DiegoACarranza7 @laligaennumeros Ojalá hubies...,Negative
147,9b223a85,OJITO al EJERCICIO de un niño de SEGUNDO de PR...,Positive
66,23303f58,📷 [GALERIA] El @FCBfutbolsala recupera efectiu...,Positive
96,8f9d73cf,@bet365_es Cualquiera que no estuvieran ni Bar...,Positive
82,8a67463b,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...,Positive


In [29]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [30]:
tweets.sample(10)

Unnamed: 0,id,text,polarity
120,a195dce5,@SantiagoSty Creo q ustedes son del Madrid y ...,Negative
11,636c8da5,@peperezp @sport MESSI tira del carro del Barç...,Negative
112,6f30beca,CON LA MIRA PUESTA EN MALAGA.\n\nSE VIENE PART...,Positive
152,c8cda282,@quimdomenech Igual que los del Barça hacerse ...,Neutral
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,Neutral
42,5a533794,¡EL INICIO DE UNA LEYENDA! 🔴🔵\n\n¿Qué momento ...,Positive
82,8a67463b,ESTA TARDE a las 18:45 se juega la SUPERCOPA D...,Positive
121,dc3d7492,"Se le viene Real Madrid, Barça o Bayern Munich...",Neutral
81,a1be2bde,Madrid: Cuatro manifestaciones por el 8-M... y...,Neutral
46,8b973b98,Algunos partidos de Cristiano en eliminatorias...,Neutral


And, to finish up, we print the predicted data into a csv to be uploaded to Kaggle.

In [57]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)