## Tweet classification

File names

In [120]:
# Folder that holds every input file and receives every generated one.
base_location = '../../ml-football-tweets/'

# Inputs read by the notebook: the two task exports and the two tagged XML corpora.
(tweets_raw_file,
 tweets_run_file,
 corpus_tweets_2012_xml,
 corpus_tweets_2017_xml) = (
    base_location + file_name
    for file_name in (
        'footballsentiment_task.csv',
        'footballsentiment_task_run.csv',
        'general-train-tagged-3l.xml',
        'intertass-train-tagged.xml',
    )
)

# Outputs written by the notebook: CSV versions of the corpora.
(corpus_tweets_2012_csv,
 corpus_tweets_2017_csv,
 corpus_tweets_csv) = (
    base_location + file_name
    for file_name in (
        'general-train-tagged-3l.csv',
        'intertass-train-tagged.csv',
        'corpus_tweets.csv',
    )
)

Import libraries

In [2]:
import pandas as pd
import numpy as np

### Load datasets

In [90]:
# Load both exports as UTF-8 CSVs: the task list and the evaluation runs.
tweets_raw, tweets_run = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (tweets_raw_file, tweets_run_file)
)

# Report how many rows each file contributed.
n_tweets, n_evaluations = len(tweets_raw), len(tweets_run)
print('Total tweets: %d' % n_tweets)
print('Evaluated tweets so far: %d' % n_evaluations)

Total tweets: 801
Evaluated tweets so far: 1696


### Create train and test data

Auxiliary function to convert evaluations to numeric values, according to the rule:
- `Negativo`: -1.
- `Positivo`: 1.
- `Neutro`: 0.
- Anything else: 2.

In [93]:
def convert_to_numeric(evaluation):
    """Translate an evaluation label into its numeric score.

    Args:
        evaluation: The evaluation label to convert.

    Returns:
        1 for 'Positivo', 0 for 'Neutro', -1 for 'Negativo',
        and 2 for any other value.
    """
    # Labels are compared in the same order as the rule list: first match wins.
    known_scores = (('Positivo', 1), ('Neutro', 0), ('Negativo', -1))
    for label, score in known_scores:
        if evaluation == label:
            return score
    return 2

Build new array of dictionaries with keys `id` (the task ID), `tweet` (the tweet string) and `score` (the tweet evaluation) by joining data from both CSV files.

In [108]:
# Build dictionary of tweets where key is the task__id.
# Zipping the two columns avoids materialising a Series per row with iterrows().
tweets_obj = dict(zip(tweets_raw['task__id'], tweets_raw['taskinfo__Tweet']))

# Build dictionary of tweet scores where key is the task_id.
# A task that was evaluated several times keeps its last evaluation in file order.
scores_obj = dict(zip(tweets_run['task_run__task_id'], tweets_run['task_run__info']))

# Create final tweets list: one record per evaluated task.
# tweets_obj[task_id] raises KeyError if a run refers to a task missing from the raw file.
own_tweets = [
    {
        'id': task_id,
        'tweet': tweets_obj[task_id],
        'score': convert_to_numeric(evaluation)
    }
    for task_id, evaluation in scores_obj.items()
]

print('Total different tweets evalauted so far: %d' % len(own_tweets))

Total different tweets evalauted so far: 588


### POS Tagging

Import libraries to read XML:

In [111]:
from lxml import objectify

Import/read most recent corpus (2017):

In [131]:
# Parse the 2017 tagged corpus into a (content, polarity) frame and save it as CSV.
# 4 values of sentiment: N, P, NONE, NEU
# Passing the path (instead of an open() handle) lets lxml open and close the file itself.
xml = objectify.parse(corpus_tweets_2017_xml)
root = xml.getroot()
tweets = root.getchildren()

# Collect plain records and build the frame once. The previous version appended to
# `general_tweets_corpus_train_2017` before it was ever defined (NameError on a fresh
# kernel) and grew the frame row by row with DataFrame.append, which is quadratic and
# no longer exists in pandas 2.x.
records_2017 = [
    {'content': tweet.content.text, 'polarity': tweet.sentiment.polarity.value.text}
    for tweet in tweets
]
general_tweets_corpus_train_2017 = pd.DataFrame(records_2017, columns=['content', 'polarity'])

general_tweets_corpus_train_2017.to_csv(corpus_tweets_2017_csv, index=False, encoding='utf-8')

Import/read biggest corpus (2012), to concatenate it with the previous one:

In [132]:
# Parse the 2012 tagged corpus into a (content, polarity) frame and save it as CSV.
# 4 values of sentiment: N, P, NONE, NEU
# Passing the path (instead of an open() handle) lets lxml open and close the file itself.
xml = objectify.parse(corpus_tweets_2012_xml)
root = xml.getroot()
tweets = root.getchildren()

# Collect plain records and build the frame once. The previous version appended to
# `general_tweets_corpus_train_2012` before it was ever defined (NameError on a fresh
# kernel) and grew the frame row by row with DataFrame.append, which is quadratic and
# no longer exists in pandas 2.x.
# Note: this corpus nests polarity under `sentiments` (plural).
records_2012 = [
    {'content': tweet.content.text, 'polarity': tweet.sentiments.polarity.value.text}
    for tweet in tweets
]
general_tweets_corpus_train_2012 = pd.DataFrame(records_2012, columns=['content', 'polarity'])

general_tweets_corpus_train_2012.to_csv(corpus_tweets_2012_csv, index=False, encoding='utf-8')

Concatenate general corpus dataset with 2017 one, to have a better result:

In [148]:
# Stack the 2012 corpus on top of the 2017 one; each part keeps its own row labels.
corpus_parts = [general_tweets_corpus_train_2012, general_tweets_corpus_train_2017]
tweets_corpus = pd.concat(corpus_parts)

# Show a random handful of rows as a sanity check.
tweets_corpus.sample(10)

Unnamed: 0,content,polarity
5726,Comenzamos el encuentro con autónomos de #Mala...,NONE
2056,Enhorabuena a todos lo candidatos del goya,P
4127,siguiendo a @SiempreAsiMlaga ;-))) gran lugar ...,P
2869,http://t.co/0Z79yyLR,NONE
375,@virginiadlp ahora es cuando tomo mis notas y ...,NONE
5424,Deudas de los ayunt. Álvaro habla de la deuda ...,N
5242,Asi andamos. Este es el ambiente. http://t.co/...,NONE
2052,"Revista de Prensa (ABC Punto Radio, 10 de ener...",NONE
3628,"Portada de 'Público', domingo. Rubalcaba gana ...",P
2070,No me parece criticable que la primera entrevi...,P


Remove tweets without polarity (polarity `NONE`):

In [150]:
# Keep only the rows whose polarity label is something other than NONE.
has_polarity = 'polarity != "NONE"'
tweets_corpus = tweets_corpus.query(has_polarity)

Remove links:

In [153]:
# Drop tweets whose content starts with a link.
# na=False treats a missing content value as "not a link", so the mask stays strictly
# boolean; `~` is the inversion operator for boolean masks (unary `-` is not).
is_link = tweets_corpus.content.str.contains('^http.*$', na=False)
tweets_corpus = tweets_corpus[~is_link]

In [154]:
# Number of corpus tweets left after the polarity and link filters.
corpus_size = len(tweets_corpus)
print('Total corpus tweets: %d' % corpus_size)

Total corpus tweets: 6588


### Tokenization and stemming

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus, then read its Spanish word list.
nltk.download("stopwords")
spanish_stopwords = stopwords.words('spanish')

In [None]:
from string import punctuation

# Characters stripped from tweets before tokenizing: ASCII punctuation,
# the Spanish opening marks, and the ten decimal digits.
non_words = [
    *punctuation,
    '¿', '¡',
    *(str(digit) for digit in range(10)),
]
non_words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# Module-level Spanish Snowball stemmer; `tokenize` passes it to `stem_tokens`.
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    """Stem every token with the given stemmer.

    Args:
        tokens: Iterable of tokens to stem.
        stemmer: Object exposing a `stem(token)` method.

    Returns:
        A new list with the stemmed form of each token, in input order.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Turn a piece of text into a list of stems.

    Characters listed in the module-level `non_words` are removed, the result
    is split with `word_tokenize`, and each token is stemmed with the
    module-level `stemmer`.

    Args:
        text: The text to process.

    Returns:
        The list of stems. If stemming raises, the error and the cleaned text
        are printed and `['']` is returned instead.
    """
    # Strip every character that appears in non_words.
    cleaned = ''.join(filter(lambda char: char not in non_words, text))
    words = word_tokenize(cleaned)

    # Stemming is best-effort: a failure is reported and mapped to a single empty stem.
    try:
        return stem_tokens(words, stemmer)
    except Exception as error:
        print(error)
        print(cleaned)
        return ['']