## Tweet classification

File names

In [1]:
base_location = './'

# To read
tweets_raw_file = base_location + 'datasets/train.csv'
tweets_run_file = base_location + 'datasets/test_nolabel.csv'
corpus_tweets_2012_xml = base_location + 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = base_location + 'intertass-train-tagged.xml'

# To generate
corpus_tweets_2012_csv = base_location + 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = base_location + 'intertass-train-tagged.csv'
corpus_tweets_csv = base_location + 'corpus_tweets.csv'

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_raw = pd.read_csv(tweets_raw_file, encoding='utf-8')
tweets_run = pd.read_csv(tweets_run_file, encoding='utf-8')

print('Total tweets: %d' % len(tweets_raw))
print('Evaluated tweets so far: %d' % len(tweets_run))

Total tweets: 411
Evaluated tweets so far: 177


### Create train and test data

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [8]:
# Build dictionary of tweets where key is the task__id
own_tweets = []
tweets_obj = {}
for index, row in tweets_raw.iterrows():
    own_tweets.append({
        'id': row.id,
        'tweet': row.text, 
        'polarity': row.polarity
    })

print('Total different tweets evaluated so far: %d' % len(own_tweets))

Total different tweets evaluated so far: 411


### POS Tagging

Import libraries to read XML:

In [9]:
from lxml import objectify

Import/read most recent corpus (2017):

In [10]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)
    
general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous noe:

In [11]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)
    
general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [12]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017
    ])
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
6744,“@policebluetour: Buenas noches a tod@s :-) #F...,P
220,@DieRaposa @DrXaverius y otros biólogos como é...,P
6338,Cafe Berlin. Flipando con el concierto en dire...,P
3450,Como siempre buenísimo Santiago Gonzalez. Sobr...,P
2112,Cuando la corrupción se convierte en nuestro p...,N
1552,"Claro,claro..... sobretodo ahora que no tengo ...",N
2868,"Rajoy: ""ya sabemos que las cosas están mal"". N...",N
2139,Esta es la mejor noticia q he leído hoy en la ...,P
6566,Yo mañana voy a trabajar.,NONE
6093,Hoy 19 de marzo los gaditanos celebramos un gr...,P


In [13]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8227


Remove tweets without polarity (polarity `NONE`):

In [14]:
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

Remove links:

In [15]:
tweets_corpus = tweets_corpus[-tweets_corpus.content.str.contains('^http.*$')]

In [16]:
import re
url_regex = re.compile('https?:\/\/t\.co\/[\w]{8,8}')
tweets_corpus_no_links = tweets_corpus
tweets_corpus_no_links['content'] = tweets_corpus_no_links['content'].map(lambda x: re.sub(url_regex, '', x))

In [17]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus_no_links))

Total corpus tweets after cleaning: 6586


In [56]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
1109,"@PabloAIglesias no se si mañana o pasado,jaja",P
2783,"#Andalucia necesita claridad en sus cuentas, t...",N
2263,El Gobierno recuerda a sindicatos y empresario...,N
6233,Según el relato de EP la única presión que se ...,N
3562,Si Chacón gana es urgente que aprenda la difer...,NEU
1284,Es lo maravilloso de este nuevo medio de expre...,P
330,Resumen de la larga y agria noche en Bruselas:...,N
4658,Bruselas será ligeramente flexible con los obj...,N
6951,Desde que en julio de 1988 empecé a trabajar e...,N
1240,Hoy en Cartaya con las personas mayores. Nos m...,P


### Tokenization and stemming

In [18]:
#download spanish stopwords
import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords
spanish_stopwords = stopwords.words('spanish')

[nltk_data] Downloading package stopwords to /Users/dass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
from string import punctuation
non_words = list(punctuation)

#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))
non_words

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '¿',
 '¡',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9']

In [20]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)

    # stem
    try:
        stems = stem_tokens(tokens, stemmer)
    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems

### Model Evaluation

In [21]:
from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline



We convert from strings to numerics the polarity values

In [22]:
tweets_corpus_no_links['polarity_bin'] = 0
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['P'])] = 1
tweets_corpus_no_links.polarity_bin[tweets_corpus_no_links.polarity.isin(['N'])] = -1
tweets_corpus_no_links.polarity_bin.value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


 1    0.483602
-1    0.394777
 0    0.121622
Name: polarity_bin, dtype: float64

Now we use SVC model with optimization via GridSearch

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
vectorizer = CountVectorizer(
                analyzer = 'word',
                tokenizer = tokenize,
                lowercase = True,
                stop_words = spanish_stopwords)

pipeline = Pipeline([
    ('vect', vectorizer),
    ('cls', OneVsRestClassifier(LinearSVC())),
])

As long as we don't have binary classification, we must binarize the polarity and use a multiclass learning algorithm

In [31]:
from sklearn.preprocessing import label_binarize
tweets_corpus_no_links.polarity_bin = label_binarize(tweets_corpus_no_links.polarity_bin, classes=[-1, 0, 1])

In [36]:
params = {
    'vect__max_df': (0.5, 1.9),
    'vect__min_df': (10, 20,50),
    'vect__max_features': (500, 1000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'cls__estimator__C': (0.2, 0.5, 0.7),
    'cls__estimator__loss': ('hinge', 'squared_hinge'),
    'cls__estimator__max_iter': (500, 1000)
      }
gs = GridSearchCV(pipeline, params, n_jobs=-1, cv=5, scoring='roc_auc')
gs.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['de', 'la'...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vect__max_df': (0.5, 1.9), 'vect__min_df': (10, 20, 50), 'vect__max_features': (500, 1000), 'vect__ngram_range': ((1, 1), (1, 2)), 'cls__estimator__C': (0.2, 0.5, 0.7), 'cls__estimator__loss': ('hinge', 'squared_hinge'), 'cls__estimator__max_iter': (500, 1000)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [37]:
gs.best_params_

{'cls__estimator__C': 0.2,
 'cls__estimator__loss': 'hinge',
 'cls__estimator__max_iter': 1000,
 'vect__max_df': 1.9,
 'vect__max_features': 1000,
 'vect__min_df': 10,
 'vect__ngram_range': (1, 1)}

We obtain that the best parameters are:

{'cls__estimator__C': 0.2,

 'cls__estimator__loss': 'hinge',
 
 'cls__estimator__max_iter': 1000,
 
 'vect__max_df': 1.9,
 
 'vect__max_features': 1000,
 
 'vect__min_df': 10,
 
 'vect__ngram_range': (1, 1)}

In [38]:
from sklearn.externals import joblib
joblib.dump(gs, 'grid_search.pkl')

['grid_search.pkl']

In [39]:
model = LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
              random_state=None,
              penalty='l2'
)

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    stop_words = spanish_stopwords,
    min_df = 10,
    max_df = 1.9,
    ngram_range=(1, 1),
    max_features=1000
)

corpus_data_features = vectorizer.fit_transform(tweets_corpus_no_links.content)
corpus_data_features_nd = corpus_data_features.toarray()

In [40]:
scores = cross_val_score(
    model,
    corpus_data_features_nd[0:len(tweets_corpus_no_links)],
    y=tweets_corpus_no_links.polarity_bin,
    scoring='roc_auc',
    cv=5
    )

scores.mean()

0.76988723618717669

### Polarity Prediction

In [95]:
tweets_no_label = pd.read_csv(tweets_run_file, encoding='utf-8')
print(tweets_no_label.shape)
tweets_no_label.sample(10)

(177, 2)


Unnamed: 0,id,text
75,c1cad5a6,@jotajordi13 @tonintorero Tu diciendo que no d...
157,2fb93730,@LuisOmarTapia Se le salió la barcelonitis al ...
114,88c29066,@duodezzimo @madridisme Vaya cabezazo del niño...
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...
174,f3d189cf,"Hace 17 años, Messi por primera vez se puso la..."
72,c5343fa0,@AS_Manolete Y al Atleti que miura la va a toc...
132,5f5fc452,"@diariolagrada Pues nada, adeu y barca nova. Y..."
25,e0fd7776,@marcmarquez93 @3gerardpique @SergiRoberto10 @...
148,4f6bbcb5,Aquí celebrando mi cumple con uno del atleti. ...
28,b2bde016,@javirm1993 El Espanyol ha sacado más puntos c...


### Language detection

Due to the fact that some tweets are in catalan, for language detection purposes we are only going to process about the ones in spanish for sentiment purposes.

We use three different libraries for language detection and keep those tweets on which at least two of these libraries agree on the language being Spanish.

In [96]:
import langid
from langdetect import detect
import textblob

def langid_safe(tweet):
    try:
        return langid.classify(tweet)[0]
    except Exception as e:
        pass
        
def langdetect_safe(tweet):
    try:
        return detect(tweet)
    except Exception as e:
        pass

def textblob_safe(tweet):
    try:
        return textblob.TextBlob(tweet).detect_language()
    except Exception as e:
        pass

In [97]:
tweets_no_label['lang_langid'] = tweets_no_label.text.apply(langid_safe)
tweets_no_label['lang_langdetect'] = tweets_no_label.text.apply(langdetect_safe)
tweets_no_label['lang_textblob'] = tweets_no_label.text.apply(textblob_safe)

In [98]:
tweets_no_label.to_csv('tweets_parsed2.csv', encoding='utf-8')

In [99]:
#tweets_no_label = tweets_no_label.query(''' lang_langdetect == 'es' or lang_langid == 'es' or lang_textblob == 'es'  ''')
print(tweets_no_label.shape)
tweets_no_label.sample(10)

(177, 5)


Unnamed: 0,id,text,lang_langid,lang_langdetect,lang_textblob
24,977bf140,@SEUR @LNFS89 Si jugaras en el cielo moriría p...,es,es,es
114,88c29066,@duodezzimo @madridisme Vaya cabezazo del niño...,es,es,es
56,5df2d140,Me ha gustado un vídeo de @YouTube (https://t....,gl,pt,es
6,ebf7bff7,André Gomes no está dando el nivel pero los ab...,es,es,es
65,5e516305,@NostradamusFCB @janescorrea @Cerdido_ A este ...,es,es,es
60,5d65492b,#BolitaPorfavorINFORMA: 📻💻⚽ \nFutbol Internaci...,es,es,es
157,2fb93730,@LuisOmarTapia Se le salió la barcelonitis al ...,es,es,es
2,26fe7471,Dedicado para:\n@Trigueros17 \n@FCBarcelona ht...,pt,es,es
76,dcc02374,@bet365_es No es muy difícil Barça va Madrid,ca,ca,es
108,a7ca41f5,@juliup7 Really ? El Espanyol nos ganó. 🙄🙄\nEs...,es,es,es


In [100]:
pipeline = Pipeline([
    ('vect', CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords,
            min_df = 10,
            max_df = 1.9,
            ngram_range=(1, 1),
            max_features=1000
            )),
    ('cls', OneVsRestClassifier(LinearSVC(C=.2, loss='hinge',max_iter=1000,multi_class='ovr',
             random_state=None,
             penalty='l2',
             ))),
])

In [101]:
pipeline.fit(tweets_corpus_no_links.content, tweets_corpus_no_links.polarity_bin)
tweets_no_label['polarity'] = pipeline.predict(tweets_no_label.text)

In [102]:
tweets_no_label[['text', 'polarity']].sample(30)

Unnamed: 0,text,polarity
53,Cuenta la Leyenda que Todo comenzó hace 17 Año...,0
48,@Chrisanuvis @FCBarcelona_es @FCBarcelona Tien...,0
37,@SeleccionArg \n#JavierMascherano sobre #Messi...,0
114,@duodezzimo @madridisme Vaya cabezazo del niño...,1
35,@MiriamDakirFCB @Tocapilotes La Rata De PSG Se...,0
126,Pavor tengo que ahora los del #PSG miren hacia...,0
81,Madrid: Cuatro manifestaciones por el 8-M... y...,0
38,@LuisOmarTapia Tanto lo alababan que fue el cr...,0
65,@NostradamusFCB @janescorrea @Cerdido_ A este ...,1
11,@peperezp @sport MESSI tira del carro del Barç...,1


In [103]:
tweets = tweets_no_label.copy()
tweets['polarity_bin'] = 'Neutral'
tweets.polarity_bin[tweets.polarity.isin([1])] = 'Positive'
tweets.polarity_bin[tweets.polarity.isin([-1])] = 'Negative'
tweets.polarity_bin.value_counts(normalize=True)
tweets[['text', 'polarity_bin']].sample(30)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,text,polarity_bin
67,@profenfurecido9 @Berlustinho Ése es el proble...,Positive
41,"Hace 17 años, Messi por primera vez se puso la...",Neutral
173,@ATLASNU8E Tendrías que haberte hecho del Barç...,Neutral
21,"@elchiringuitotv Este es un cagon, la verdad a...",Neutral
62,@skooldaze_ entonces por qué lo llamas realmad...,Neutral
139,"Arthur prácticamente hecho, todo está firmado ...",Neutral
163,@emedemamon los que se clasifiquen ya son los ...,Neutral
74,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,Neutral
169,VIDEO: La brutal exhibición de Koke en el entr...,Neutral
95,@SimonLoveRM @Mariofelipe98 @juancarcalles Jaj...,Neutral


In [104]:
tweets.drop(['lang_langid', 'lang_langdetect','lang_textblob','polarity'], axis=1, inplace=True)

In [105]:
tweets.sample(10)

Unnamed: 0,id,text,polarity_bin
129,7fa82da8,@Blogcelonista Entonces Arthur ya esta práctic...,Positive
106,020ee260,Más miedo al Barça que al PSG? Has mencionado ...,Neutral
55,9046f222,ULTIMA HORA: EL PRESIDENTE DEL PSG LE OFRECE A...,Neutral
107,ddc5f491,@mundodeportivo El problema del PSG es su prop...,Positive
152,c8cda282,@quimdomenech Igual que los del Barça hacerse ...,Neutral
153,162b825f,@elfenomenor9r9 @diaz25643422 @marianot19 @Cri...,Neutral
104,0dc07c46,Hoy es el cumpleaños de Francisco Hervás Tirad...,Neutral
115,ea75493e,@diablillocule14 @InvictosSomos ????? A ver su...,Positive
38,fffdbad5,@LuisOmarTapia Tanto lo alababan que fue el cr...,Neutral
74,12d82762,EL DIRECTO EN EL QUE DjMaRiiO USÓ LA CAMISETA ...,Neutral


In [106]:
tweets = tweets.rename(columns={'polarity_bin': 'polarity'})

In [107]:
tweets[['id', 'polarity']].to_csv('tweets_polarity_bin.csv', encoding='utf-8', index=False)