# Election Tweets

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
from fastai.imports import *
from sklearn.model_selection import train_test_split

In [3]:
DATA_PATH=Path('../data/election_tweets')
CLAS_PATH=Path('./tweets_classifier')
TWEETSLM_PATH=Path('./LM_tweets')

## Preparing the data

In [41]:
df = pd.read_csv(DATA_PATH/'corpus_completo_5_temas_08-sep-2016-codificacion_5_temas.csv',
                usecols=['Tema_05', 'Texto'])


In [42]:
df['Tema_05'] = df['Tema_05'].replace([1, 2, 9, 10, 11], [1, 2, 3, 4, 5])

In [43]:
tweets = df['Texto'].values
labels = df['Tema_05'].values
LABELS = set(labels)

In [44]:
def flattern(A):
    'Source: https://stackoverflow.com/a/17867797/7273299'
    
    'Flattens a list of lists and strings into a list.'
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flattern(i))
        else: rt.append(i)
    return rt

def isInt(v):
    'Source: https://stackoverflow.com/a/9859202/7273299'
    
    'Checks if a string is a number.'
    try:     i = int(v)
    except:  return False
    return True

def char_count(word, chars, lbound=2):
    char_count = [word.count(char) for char in chars]
    return all(i >= lbound for i in char_count)

def replace_lol(repl_str, texts):
    for string, chars in repl_str:
        texts = [[[string, i] if char_count(i, set(chars), 2) else i for i in text.split()] for text in texts]
        texts = np.array([flattern(text) for text in texts])
        texts = np.array([' '.join(text) for text in texts])
    return texts

def preprocess_tweets(tweets):
    """
    twitter specific text processing and shuffle
    """
    # Placeholders for hyperlinks and user references
    tweets = [['hyp_link' if i.startswith('http') 
               #else 'user_ref' if i.startswith('@') 
               else i for i in tweet.split()] for tweet in tweets]
    tweets = np.array([' '.join(i) for i in tweets])

    # Prefix for Hashtags
    tweets = [[['hash_tag', i] if i.startswith('#') else i for i in tweet.split()] for tweet in tweets]
    tweets = np.array([flattern(tweet) for tweet in tweets])
    tweets = np.array([' '.join(i) for i in tweets])

    # Prefix for integers
    tweets = [[['int_string', i] if isInt(i) else i for i in tweet.split()] for tweet in tweets]
    tweets = np.array([flattern(tweet) for tweet in tweets])
    tweets = np.array([' '.join(i) for i in tweets])

    # Prefix for slang
    tweets = [[['que', 'slang_string'] if i=='q' else ['por', 'slang_string'] if i=='x' else ['de', 'slang_string'] if i=='d' else ['Que', 'slang_string'] if i=='Q' else ['Por', 'slang_string'] if i=='X' else ['De', 'slang_string'] if i=='D' else i for i in tweet.split()] for tweet in tweets]
    tweets = np.array([flattern(tweet) for tweet in tweets])
    tweets = np.array([' '.join(i) for i in tweets])

    # Lol type characters
    repl_str = [('risa_ja','ja'), ('risa_ji','ji'), ('risa_je','je'), ('risa_jo','jo'), ('risa_ju', 'ju')]

    # Adding prefix to lol type characters
    tweets = replace_lol(repl_str, tweets)

   
    
    return tweets



In [45]:
tweets = preprocess_tweets(tweets)

In [46]:
col_names = ['label', 'tweet']

In [47]:
tweets_train, tweets_test, labels_train, labels_test = train_test_split(
                                                    tweets, labels,
                                                    stratify=labels, 
                                                    test_size=0.20)

tweets_train, tweets_valid, labels_train, labels_valid = train_test_split(
                                                    tweets_train, labels_train,
                                                    stratify=labels_train, 
                                                    test_size=0.10)

print('train shapes:', tweets_train.shape, labels_train.shape)
print('valid shapes:', tweets_valid.shape, labels_valid.shape)
print('test shapes:', tweets_test.shape, labels_test.shape)

train shapes: (2242,) (2242,)
valid shapes: (250,) (250,)
test shapes: (624,) (624,)


In [48]:
df_trn = pd.DataFrame({'tweet':tweets_train, 'label':labels_train}, columns=col_names)
df_val = pd.DataFrame({'tweet':tweets_valid, 'label':labels_valid}, columns=col_names)
df_tst = pd.DataFrame({'tweet':tweets_test, 'label':labels_test}, columns=col_names)

In [49]:
print(df_trn['label'].value_counts())
print(df_val['label'].value_counts())
print(df_tst['label'].value_counts())

2    794
1    531
3    516
5    258
4    143
Name: label, dtype: int64
2    88
1    59
3    58
5    29
4    16
Name: label, dtype: int64
2    220
1    148
3    144
5     72
4     40
Name: label, dtype: int64


In [50]:
def balance_df(df):
    lst = [df]
    max_size = df['label'].value_counts().max()
    for class_index, group in df.groupby('label'):
        lst.append(group.sample(max_size-len(group), replace=True))
    df = pd.concat(lst)
    return df

df_trn = balance_df(df_trn)
df_val = balance_df(df_val)

print(df_trn['label'].value_counts())
print(df_val['label'].value_counts())
print(df_tst['label'].value_counts())

5    794
3    794
1    794
4    794
2    794
Name: label, dtype: int64
5    88
4    88
3    88
2    88
1    88
Name: label, dtype: int64
2    220
1    148
3    144
5     72
4     40
Name: label, dtype: int64


In [54]:
df_trn.to_csv(CLAS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLAS_PATH/'validation.csv', header=False, index=False)
df_tst.to_csv(CLAS_PATH/'test.csv', header=False, index=False)

(CLAS_PATH/'classes.txt').open('w').writelines(f'{o}\n' for o in LABELS)

### Create dataframes for Language Model fine-tuning

In [55]:
tweets_lm = np.concatenate([tweets_train, tweets_valid, tweets_test])
labels_lm = np.concatenate([labels_train, labels_valid, labels_test])

In [64]:
df_all = pd.DataFrame({'tweet':tweets_lm, 'label':[0]*len(tweets_lm)}, columns=col_names)

In [65]:
df_all.to_csv(TWEETSLM_PATH/'texts.csv', header=False, index=False)

In [66]:
df_all.shape

(3116, 2)

## Language model

In [4]:
bs = 64
data_lm = TextDataBunch.from_csv(TWEETSLM_PATH,
                                 'texts.csv',
                                 valid_pct=0.1,
                                 chunksize=1000,
                                 bs=bs,
                                 tokenizer=Tokenizer(lang='es'))


#data_lm.save('LM_tweets')

In [5]:
data_lm.show_batch()

text,target
"xxbos xxup rt xxunk : hash_tag # xxup pp hash_tag # xxup psoe hash_tag # xxmaj ciudadanos ... la democracia solo la xxunk en las xxunk entre amigos . hash_tag # xxup vox xxmaj por una xxmaj españa con xxmaj valores hyp_link xxup tweet xxup citado xxup por xxup el xxup retweeteado : "" xxmaj el hash_tag # xxup psoe es una fábrica de parados . xxmaj tenéis un problema",0
"xxbos xxup rt xxunk : xxmaj estaba esperando el "" eso no me lo dices en la calle "" . hash_tag # elpaisdebate hyp_link xxup tweet xxup citado xxup por xxup el xxup retweeteado : xxmaj ese "" cómo "" de xxmaj iglesias ha sido como de club a las xxunk . - ¿ xxmaj cómo ? ¿ xxmaj que xxmaj monedero qué ? - xxmaj lo que has xxunk",0
"xxbos xxup rt xxunk : xxup interés > > hash_tag # encuesta sobre el próximo gobierno de xxmaj españa y el bipartidismo hash_tag # xxup cis hash_tag # xxup 20d > > https : / / t.co / xxunk < < hyp_link xxup tweet xxup citado xxup por xxup el xxup retweeteado : xxmaj si el hash_tag # bipartidismo hash_tag # xxup ppsoe no consigue mayoría de gobierno , xxunk",0
"xxbos xxup rt xxunk : xxmaj los debates los ganan los que no van . o el xxunk está xxunk , o nuestros políticos no saben debatir . hyp_link xxup tweet xxup citado xxup por xxup el xxup retweeteado : xxmaj rajoy ganó el debate a int_string 4 al que no fue . xxmaj rivera e xxmaj iglesias han ganado el hash_tag # caraacaral6 al que no han ido .",0
xxbos xxup rt @antoniomaestre : xxmaj el tuit es falso . xxmaj pero está bien que xxunk machista un tuit que defiende lo que decía su programa electoral . hyp_link xxup tweet xxup citado xxup por xxup el xxup retweeteado : xxmaj los nervios del xxup psoe y el ' todo vale ' : xxmaj chacón saca impreso en un mitin un tuit machista falso como si fuera mío .,0


In [6]:
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3, pretrained=False)

In [7]:
learn.load_pretrained('LM_tweets/models/LM_wiki/fine_tuned.pth', 'LM_tweets/models/LM_wiki/itos.pkl')
learn.unfreeze()

In [8]:
learn.lr_find()

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


ValueError: Expected input batch_size (5504) to match target batch_size (64).

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


ValueError: Expected input batch_size (5376) to match target batch_size (64).