## Tweet classification

File names

In [68]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [69]:
import pandas as pd
import numpy as np

### Load datasets

In [70]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [71]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [72]:
from lxml import objectify

Import/read most recent corpus (2017):

In [73]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [74]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [75]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
4904,"Desde la total discrepancia ideológica, todo e...",N
6700,.@UPyD plantea al @PSOE un gobierno de concent...,NONE
303,@mrswxllace tres tristes trokistas pican piedr...,N
2499,Por San Antón @pablo_roy nos presenta a Guarin...,NONE
1104,Hombre tostada: http://t.co/sMhaMmXn,NONE
6055,,NONE
6298,#Picasso sienta a #Málaga en el diván @teoleon...,NONE
6605,"B. Días. En unos minutos, en Herrera en la Ond...",N
3147,El Gobierno pide a #Griñán que depure responsa...,N
5812,Charla en la bancada azul: Guindos con Valeria...,NONE


In [76]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [77]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove links:

In [78]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

In [79]:
import re
url_regex = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_corpus_no_links = tweets_corpus
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(url_regex, '', x))

In [80]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [95]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity,polarity_bin
1284,Es lo maravilloso de este nuevo medio de expre...,P,1
1101,@DiegoMazon78 Tú también juras hoy el cargo? P...,P,1
4787,“@AberronchoTV: @David_Busta te mando un salud...,P,1
2623,Me parece bochornoso hacer RTs de comentarios ...,NEU,0
1758,Segurisimo.. Os mando un millon de abrazos. RT...,P,1
176,@Diego_FDM @el_pais La noticia perfecta para #...,P,1
5594,La cultura del esfuerzo personal hay q enseñar...,P,1
5541,La violencia estructural sobre las mujeres es ...,N,-1
5735,30añosdepsoe.es Todo lo que no te han contado,N,-1
593,"Tienes razón, Adriana. Mañana estaré con vosot...",N,-1


### Tokenization and stemming

In [82]:
#download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
from string import punctuation
non_words = list(punctuation)

#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
non_words

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '¿',
 '¡',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [84]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

### Model Evaluation

In [85]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

We convert from strings to numerics the polarity values

In [86]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [87]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', OneVsRestClassifier(LinearSVC())),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm

In [88]:
'''from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'''

'from sklearn.preprocessing import label_binarize\ntweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])'

In [89]:
'''params = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__estimator__C': (0.2, 0.5, 0.7),
    'cls__estimator__loss': ('hinge', 'squared_hinge'),
    'cls__estimator__max_iter': (500, 1000)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)'''

"params = {\n    'vect__max_df': (0.5, 1.9),\n    'vect__min_df': (10, 20,50),\n    'vect__max_features': (500, 1000),\n    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams\n    'cls__estimator__C': (0.2, 0.5, 0.7),\n    'cls__estimator__loss': ('hinge', 'squared_hinge'),\n    'cls__estimator__max_iter': (500, 1000)\n      }\ngs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')\ngs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)"

In [90]:
'''gs.best_params_'''

'gs.best_params_'

We obtain that the best parameters are:

{'cls__estimator__C': 0.2,

 'cls__estimator__loss': 'hinge',
 
 'cls__estimator__max_iter': 1000,
 
 'vect__max_df': 1.9,
 
 'vect__max_features': 1000,
 
 'vect__min_df': 10,
 
 'vect__ngram_range': (1, 1)}

In [91]:
'''from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')'''

"from sklearn.externals import joblib\njoblib.dump(gs, 'grid_search.pkl')"

In [92]:
from sklearn.cross_validation import cross_val_predict
model = LinearSVC(C=.2, loss='hinge',max_iter=1000,
              random_state=None,
              penalty='l2'
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [93]:
y=tweets_corpus_no_links.polarity_bin
y

1       0
2       1
3      -1
4       1
6       1
8       1
9       1
10      1
11      1
12     -1
13      1
14      1
17      1
18      1
19     -1
20     -1
21     -1
22     -1
23     -1
24      1
25     -1
26      1
27      1
28      1
31      1
32     -1
33     -1
34     -1
35      0
36      1
       ..
971     1
972    -1
973    -1
974    -1
975    -1
976    -1
977    -1
978     0
979    -1
980    -1
981    -1
982    -1
984    -1
985     1
988    -1
989    -1
990     0
992     1
993    -1
995     0
996     0
997     1
998    -1
999    -1
1002   -1
1003    1
1004   -1
1005    0
1006    1
1007    1
Name: polarity_bin, Length: 6586, dtype: int64

In [162]:
'''scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()'''

"scores = cross_val_score(\n    model,\n    corpus_data_features_nd[0:len(tweets_corpus_no_links)],\n    y=tweets_corpus_no_links.polarity_bin,\n    scoring='roc_auc',\n    cv=5\n    )\n\nscores.mean()"

### Polarity Prediction

In [147]:
tweets_no_label = pd.read_csv(tweets_run_file, encoding='utf-8')
print(tweets_no_label.shape)
tweets_no_label.sample(10)

(177, 2)


Unnamed: 0,id,text
24,977bf140,@SEUR @LNFS89 Si jugaras en el cielo moriría p...
128,d6e65407,@jmdelalamo Lo q no comprendo es por q no sale...
95,737f222e,@SimonLoveRM @Mariofelipe98 @juancarcalles Jaj...
144,16466a71,@LucasScagliola Dale un rato más y será más q ...
84,8d4d51db,Grandísimo. Y esto nunca lo comprenderá un tío...
139,49a24842,"Arthur prácticamente hecho, todo está firmado ..."
13,dd512ee1,Recién me compro el conjunto del Barça y ya sa...
5,5b9c12ba,@ALEX15vs @FCBarcelona_es @FCBarcelona Pero qu...
22,08398146,@AS_Manolete Antes de la eliminatoria el PSG e...
78,d644c5df,@Digimiono @dircomPedro En cuanto a Emery cada...


Now we do some cleansing of the data, erasing the mentions and links

In [148]:
import re
url_regex_links = re.compile(r'@[A-Za-z0-9]+((_){1}[A-Za-z]*){0,1}')
url_regex_mentions = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(url_regex_links, '', x))
tweets_no_label['text'] = tweets_no_label['text'].map(lambda x: re.sub(url_regex_mentions, '', x))

In [149]:
tweets_no_label.sample(10)

Unnamed: 0,id,text
126,58685dd8,Pavor tengo que ahora los del #PSG miren hacia...
96,8f9d73cf,Cualquiera que no estuvieran ni Barça ni Madrid.
18,bc9887eb,_ Pero porque la ganase el Atm o porque la p...
136,5922c3b4,nada más para aclarar este año al lo hecho e...
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...
157,2fb93730,Se le salió la barcelonitis al madridista oma...
133,26c47161,Félix Brych ayer en el partido de champions #P...
16,c9029b30,"Sacar 3 suplentes de titulares, dejar en ba..."
167,a3798203,"Lo siento, pero 3-0. Lo otro son campitos m..."
13,dd512ee1,Recién me compro el conjunto del Barça y ya sa...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [150]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

In [151]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

In [152]:
tweets_no_label.to_csv('tweets_parsed2.csv', encoding='utf-8')

In [153]:
#tweets_no_label = tweets_no_label.query(''' lang_langdetect == 'es' or lang_langid == 'es' or lang_textblob == 'es'  ''')
print(tweets_no_label.shape)
tweets_no_label.sample(10)

(177, 5)


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
77,6eb6eb37,"Neymar se fue al PSG en busca de “títulos”, si...",es,es,es
132,5f5fc452,"Pues nada, adeu y barca nova. Y mientras tant...",es,es,es
154,8ccef7aa,"Comparar a con con la frase ""el dinero no da...",es,es,es
19,ad9d4385,Definiciones de la liga según quien la gana.\...,es,es,es
26,fecab36b,"81 Ya, y estoy de acuerdo. Pero eso no cambia...",es,es,es
149,b7cf2bde,No pusieron los 5 al barca\nAH NO PARAA,es,es,es
170,7bd204cc,Yo no me abono,pt,it,es
5,5b9c12ba,Pero que te crees? Xk os sorprende lo de es...,es,es,es
10,a122a538,Un jugador brasileño de 21 años muy bueno q ...,es,es,es
127,bec01c4b,Incomparecencia ayer del PSG. Alguien sabe ...,es,es,es


In [154]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', OneVsRestClassifier(LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             ))),
])

In [155]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [156]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
68,La 'rajada' de un ex objetivo del Barça sobre ...,1
10,Un jugador brasileño de 21 años muy bueno q ...,1
105,Resulta que el partido lo perdió el PSG po...,-1
118,El Madrid se siente invencible en su competici...,-1
102,Cuando el PSG permitió entrar a sus ultras jus...,-1
39,(Marca) Unos 1.300 efectivos velarán por la se...,1
51,Y tu que te llamas angelvikingo siendo del B...,1
24,Si jugaras en el cielo moriría por verte! #F...,1
92,"Que noooo, así nos saca unas risas 😅😅😅",1
107,El problema del PSG es su propio presidente q...,-1


In [157]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
36,Todavía duele que con el Barca haya hecho ped...,Positive
114,Vaya cabezazo del niño. Me recordó al de la ...,Negative
70,Como mola!!!!! Ya era hora !!! No sustituirá...,Positive
65,A este Dios de este competicion quitas sus ...,Negative
58,Pues cuando era pequeñito era del Barça... Lu...,Negative
39,(Marca) Unos 1.300 efectivos velarán por la se...,Positive
56,Me ha gustado un vídeo de (tG - EL DIRECTO EN...,Positive
150,📌Informa 88: ha fet cursa contínua avui a la ...,Negative
119,¿Equipo de fútbol favorito? — No me gusta el f...,Positive
75,Tu diciendo que no deja de pensar en el Barç...,Negative


In [158]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [159]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
113,e1400a58,"No están trabajando en el juego, están cosien...",Negative
133,26c47161,Félix Brych ayer en el partido de champions #P...,Positive
39,382ae472,(Marca) Unos 1.300 efectivos velarán por la se...,Positive
38,fffdbad5,Tanto lo alababan que fue el creador del fútb...,Positive
176,eb2cb9f4,@_SentimentCule Es que son muy ridículos. Perd...,Negative
45,41de1a3a,La #EuropeaLeague el #Atlético se la tiene que...,Negative
100,2c56e5a7,Mi estimado DON Tapia habla desde el corazon m...,Negative
124,9cd8b232,LO QUE PASA ES QUE EL QUE HABLA PAJA SOY VOH ...,Negative
130,18c31f78,"En esta temporada, City Vs Barça w5",Positive
44,a5b1da99,A ver si encontráis algún tweet así a favor de...,Negative


In [160]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [161]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)