## Tweet classification

File names:

In [1]:
# To read
test_tweets_raw = 'datasets/test_nolabel.csv'
train_tweets_raw = 'datasets/train.csv'
corpus_tweets_2012_xml = 'general-train-tagged-3l.xml'
corpus_tweets_2017_xml = 'intertass-train-tagged.xml'
emojis_csv = 'emojis.csv'

# To generate
corpus_tweets_2012_csv = 'general-train-tagged-3l.csv'
corpus_tweets_2017_csv = 'intertass-train-tagged.csv'
corpus_tweets_csv = 'corpus_tweets_preprocessed.csv'

Import Pandas and Numpy:

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [3]:
tweets_test = pd.read_csv(test_tweets_raw, encoding='utf-8')
tweets_train = pd.read_csv(train_tweets_raw, encoding='utf-8')

print('Total tweets to evaluate: %d' % len(tweets_test))
print('Evaluated tweets so far: %d' % len(tweets_train))

Total tweets to evaluate: 177
Evaluated tweets so far: 411


### POS Tagging

Import libraries to read XML:

In [4]:
from lxml import objectify

Import/read most recent corpus (2017):

In [15]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2017_xml))
root = xml.getroot()
general_tweets_corpus_train_2017 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiment.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2017 = general_tweets_corpus_train_2017.append(row_s)

In [13]:
general_tweets_corpus_train_2017.polarity.value_counts(normalize=False)

N       418
P       318
NONE    139
NEU     133
Name: polarity, dtype: int64

In [18]:
# Number of tweets to remove
positive_to_remove = 318-133
negative_to_remove = 418-133
positive_removed = 0
negative_removed = 0

# Init balanced version
general_tweets_corpus_train_2017_balanced = pd.DataFrame(columns=('content', 'polarity'))

# We balance the dataset, to have the same number of positive, negative and neutral polarity
for i, row in general_tweets_corpus_train_2017.iterrows():
    if(row.polarity) == 'P':
        positive_removed = positive_removed + 1
        if(positive_removed > positive_to_remove):
            row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
            row_s = pd.Series(row_p)
            row_s.name = i
            general_tweets_corpus_train_2017_balanced = general_tweets_corpus_train_2017_balanced.append(row_s)
    if(row.polarity) == 'N':
        negative_removed = negative_removed + 1
        if(negative_removed > negative_to_remove):
            row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
            row_s = pd.Series(row_p)
            row_s.name = i
            general_tweets_corpus_train_2017_balanced = general_tweets_corpus_train_2017_balanced.append(row_s)
    if(row.polarity) == 'NEU':
        row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
        row_s = pd.Series(row_p)
        row_s.name = i
        general_tweets_corpus_train_2017_balanced = general_tweets_corpus_train_2017_balanced.append(row_s)

In [19]:
general_tweets_corpus_train_2017_balanced.polarity.value_counts(normalize=False)

P      133
NEU    133
N      133
Name: polarity, dtype: int64

Import/read biggest corpus (2012), to concatenate it with the previous one:

In [20]:
# 4 values of sentiment: N, P, NONE, NEU
xml = objectify.parse(open(corpus_tweets_2012_xml))
root = xml.getroot()
general_tweets_corpus_train_2012 = pd.DataFrame(columns=('content', 'polarity'))
tweets = root.getchildren()
for i in range(0, len(tweets)):
    tweet = tweets[i]
    row = dict(zip(['content', 'polarity'], [tweet.content.text, tweet.sentiments.polarity.value.text]))
    row_s = pd.Series(row)
    row_s.name = i
    general_tweets_corpus_train_2012 = general_tweets_corpus_train_2012.append(row_s)

In [25]:
general_tweets_corpus_train_2012.polarity.value_counts(normalize=False)

P       2884
N       2182
NONE    1483
NEU      670
Name: polarity, dtype: int64

In [24]:
# Number of tweets to remove
positive_to_remove = 2884-670
negative_to_remove = 2182-670
positive_removed = 0
negative_removed = 0

# Init balanced version
general_tweets_corpus_train_2012_balanced = pd.DataFrame(columns=('content', 'polarity'))

# We balance the dataset, to have the same number of positive, negative and neutral polarity
for i, row in general_tweets_corpus_train_2012.iterrows():
    if(row.polarity) == 'P':
        positive_removed = positive_removed + 1
        if(positive_removed > positive_to_remove):
            row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
            row_s = pd.Series(row_p)
            row_s.name = i
            general_tweets_corpus_train_2012_balanced = general_tweets_corpus_train_2012_balanced.append(row_s)
    if(row.polarity) == 'N':
        negative_removed = negative_removed + 1
        if(negative_removed > negative_to_remove):
            row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
            row_s = pd.Series(row_p)
            row_s.name = i
            general_tweets_corpus_train_2012_balanced = general_tweets_corpus_train_2012_balanced.append(row_s)
    if(row.polarity) == 'NEU':
        row_p = dict(zip(['content', 'polarity'], [row.content, row.polarity]))
        row_s = pd.Series(row_p)
        row_s.name = i
        general_tweets_corpus_train_2012_balanced = general_tweets_corpus_train_2012_balanced.append(row_s)

In [26]:
general_tweets_corpus_train_2012_balanced.polarity.value_counts(normalize=False)

P      670
NEU    670
N      670
Name: polarity, dtype: int64

Import/read emoji sentiment dataset, to concatenate with the previous ones. Build column `polarity` according to the following criteria:
- If sentiment score is between -1 and -0.2, consider it a **negative** sentiment (`N`).
- If sentiment score is between -0.2 and 0.2, consider it a **neutral** sentiment (`NEU`).
- If sentiment score is between 0.2 and 1, consider it a **positive** sentiment (`P`).

In [28]:
# Read emojis CSV
emoji_dataset = pd.read_csv(emojis_csv, encoding='utf-8')

# Init dataframe to append to corpus
emoji_corpus = pd.DataFrame(columns=('content', 'polarity'))

# Build column 'polarity
emoji_dataset['polarity'] = 'NEU'
emoji_dataset['polarity'][emoji_dataset.sentiment < 0] = 'N'
emoji_dataset['polarity'][emoji_dataset.sentiment > 0.2] = 'P'

for i, row in emoji_dataset.iterrows():
    new_row = dict(zip(['content', 'polarity'], [chr(int(row.emoji, 16)), row.polarity]))
    row_s = pd.Series(new_row)
    row_s.name = i
    emoji_corpus = emoji_corpus.append(row_s)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [30]:
emoji_corpus.polarity.value_counts(normalize=False)

P      482
NEU    170
N       99
Name: polarity, dtype: int64

Concatenate general corpus dataset with 2017 one, to have a better result:

In [31]:
tweets_corpus = pd.concat([
        general_tweets_corpus_train_2012,
        general_tweets_corpus_train_2017,
        emoji_corpus
    ])

In [32]:
# Remove tweets with polarity 'NONE'
tweets_corpus = tweets_corpus.query('polarity != "NONE"')

In [33]:
tweets_corpus.polarity.value_counts(normalize=True)

P      0.406646
NEU    0.307911
N      0.285443
Name: polarity, dtype: float64

In [34]:
# Import regex tools
import re

# Build emoji regex
emoji_string = '|'.join(emoji_corpus['content'])
emoji_regex = re.compile(r'(%s)' % emoji_string)

In [10]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
621,🐐,P
6023,Como ir en avión... http://t.co/AZ9z4uFM,NONE
1690,"""@MartaG_novo: A las 13:00, los portavoces mun...",NONE
818,"Despacio, en paz http://t.co/AQKTSxq4 via @el...",NONE
11,☺,P
4051,¿Y debería ser compatible la tarjeta dorada y...,N
156,Fin a la impunidad en #Internet http://t.co/j7...,P
5477,Si Aguirre y Gallardón se ponen de acuerdo en ...,N
563,▂,NEU
373,🐨,P


In [11]:
print('Total corpus tweets: %d' % len(tweets_corpus))

Total corpus tweets: 8978


### Data cleaning

In [13]:
from cleaner import clean_tweets

Now, we clean the train and test data with the previous function.

In [14]:
tweets_corpus = clean_tweets(tweets_corpus, 'content')

In [15]:
print('Total corpus tweets after cleaning: %d' % len(tweets_corpus))

Total corpus tweets after cleaning: 7356


In [17]:
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
192,"pero nos hemos desmovilizado, Concha. Nosotro...",N
5323,Sí. Grabando el pgrama para el invierno más SE...,N
4168,A las 17.45 viene al estudio de #JELO Alicia S...,P
308,🐻,P
5791,"El lunes 26, estreno de El Número Uno vía",P
192,👸,P
5435,"""Batasuna toma el Parlamento Vasco"" via",N
2171,aquí podéis encontrar las fotos de mi segundo ...,P
3956,Detalle inquietante de la conformación Multidi...,NEU
5347,A visitar Andiex . Empresa antequerana de vent...,P


Export corpus tweets as CSV:

In [56]:
tweets_corpus.to_csv(corpus_tweets_csv, encoding='utf-8', index=False)