# LSTM Corpus Creation

In [22]:
import io
import numpy as np
import pandas as pd

In [23]:
from keras.preprocessing.text import Tokenizer

In [24]:
import pickle

In [25]:
# Module-level tokenizer shared by list2vec(); lower=False keeps the
# original capitalisation of the tweets.
tokenizer = Tokenizer(
    num_words=20000,
    lower=False,
)

In [26]:
def read_ds(ds_list):
    """Split raw corpus lines into their trailing pipe-separated fields.

    Every line is split on at most its first two ``|`` characters and the
    last two resulting pieces are kept, so any ``|`` occurring after the
    second one stays inside the final field.

    :param ds_list: iterable of strings, one corpus line each.
    :return: list holding, for each input line, a list with the last
        (up to) two fields of that line.
    """
    return [line.split('|', 2)[-2:] for line in ds_list]

def change_value(strings):
    """Encode a category label as a binary class id.

    :param strings: category label to encode.
    :return: ``1`` when the label is exactly ``'ironic'``, otherwise ``0``.
    """
    return 1 if strings == 'ironic' else 0

In [27]:
def list2vec(corpus_list):
    """Turn raw corpus lines into integer sequences plus the word index.

    The lines are parsed with ``read_ds`` into ``(cat, tweet)`` pairs, the
    category is encoded with ``change_value`` and the tweets are converted
    to integer sequences by the module-level ``tokenizer``.

    NOTE: this fits the shared module-level ``tokenizer`` as a side effect.
    Calling the function for several corpora in the same session keeps
    fitting that same tokenizer object (see the Keras ``Tokenizer``
    documentation for how repeated fits combine).

    :param corpus_list: list of raw corpus lines as accepted by ``read_ds``.
    :return: tuple ``(frame, word_index)`` where ``frame`` is a DataFrame
        with the columns ``category`` and ``tweet_sec`` and ``word_index``
        is ``tokenizer.word_index`` after fitting.
    """
    arr = np.array(read_ds(corpus_list))
    df = pd.DataFrame(data=arr, columns=['cat', 'tweet'])

    # Binary label derived from the textual category.
    df.insert(loc=2, column='category', value=df['cat'].apply(change_value))

    # Fit on this corpus, then encode every tweet as a list of word ids.
    tweets = df['tweet']
    tokenizer.fit_on_texts(tweets)
    df.insert(loc=3, column='tweet_sec',
              value=tokenizer.texts_to_sequences(tweets))

    return df[['category', 'tweet_sec']], tokenizer.word_index

# IMPORTANT CLARIFICATION!!!
To make the data set easy to save and read back, we use the *Pickle Library*; in this particular case we save and load a **Pandas DataFrame**. We consider this the best way to prepare a data set for future use in any kind of ANN model. Note, however, that the result is not meant for human reading: because of the nature of the *Pickle Library*, trying to read the resulting *txt file* directly can be impossible.  

For this reason, we also save a copy of the data set with the **Pandas** method **to_csv()**, which gives us a human-readable data set. We do not use this route for saving and loading the data sets, because we ran into a **big problem** when saving the *data frame*: **Pandas** writes the vector column (**tweet_sec**) as a string instead of a vector, and we prefer to avoid this problem so as not to increase the complexity of the project.


In [69]:
# Reload the pipe-separated, human-readable copy of the 10-90 vectors.
df = pd.read_csv(
    './data/ann/vectors_10_90_readable.txt',
    sep='|',
)

In [74]:
# Look at the first reloaded tweet vector and at its Python type.
# print(...) with a single argument behaves the same on Python 2 and 3.
thing = df.tweet_sec.iloc[0]
print(thing)
print(type(thing))

[1696, 1451, 2007, 1, 44, 779, 584, 1696, 1451, 3658, 10, 44, 5364, 2, 32, 5094, 1, 198, 4, 3]
<type 'str'>


### Corpus with 10-90 distribution

In [109]:
# Load the 10-90 corpus as a list of text lines decoded from UTF-8.
with io.open('data/corpus_10_90.txt', mode='r', encoding='utf8') as f:
    corpus_10_90 = list(f)

In [110]:
# list2vec() returns a (DataFrame, word_index) pair; unpack it so that
# vecorp_10_90 is the DataFrame itself and not the whole tuple.
vecorp_10_90, dict_10_90 = list2vec(corpus_10_90)
vecorp_10_90.count()

category     76530
tweet_sec    76530
dtype: int64

In [112]:
# Preview the first five rows of the 10-90 vectors.
vecorp_10_90.head(n=5)

Unnamed: 0,category,tweet_sec
0,0,"[1750, 1550, 2072, 1, 46, 792, 588, 1750, 1550..."
1,0,"[43, 1562, 15, 13, 438, 9, 130]"
2,1,"[309, 25, 296, 31, 310, 91, 25, 145, 294, 21, ..."
3,0,"[24, 4522, 1385, 4415, 23, 21, 25, 2850, 6146,..."
4,0,"[762, 24, 17280, 2, 645, 7, 21, 511, 40, 15364..."


In [113]:
# Pickle data is binary: open the target in "wb" mode, and use a context
# manager so the file is flushed and closed once the dump is done.
with open("./data/ann/vectors_10_90.txt", "wb") as vecfile:
    pickle.dump(vecorp_10_90, vecfile)

In [114]:
# Human-readable, pipe-separated copy of the 10-90 vectors (no index column).
vecorp_10_90.to_csv(
    './data/ann/vectors_10_90_readable.txt',
    sep='|',
    index=False,
)

### Corpus with 30-70 distribution

In [115]:
# Load the 30-70 corpus as a list of text lines decoded from UTF-8.
with io.open('data/corpus_30_70.txt', mode='r', encoding='utf8') as f:
    corpus_30_70 = list(f)

In [116]:
# list2vec() returns a (DataFrame, word_index) pair; unpack it so that
# vecorp_30_70 is the DataFrame itself and not the whole tuple.
vecorp_30_70, dict_30_70 = list2vec(corpus_30_70)
vecorp_30_70.count()

category     25510
tweet_sec    25510
dtype: int64

In [117]:
# Preview the first five rows of the 30-70 vectors.
vecorp_30_70.head(n=5)

Unnamed: 0,category,tweet_sec
0,1,"[1160, 1, 46, 2058, 414, 6, 14474, 1158, 1, 47..."
1,1,"[101, 7, 3233, 5, 6290, 7, 24, 9, 462, 244]"
2,0,"[84, 222, 7205, 95, 1, 35, 66, 14, 27, 63, 100..."
3,1,"[7135, 18, 13753, 2, 16814, 33, 2225, 33, 1924..."
4,0,"[8968, 22, 161, 1, 162, 127, 339, 15, 331, 10,..."


In [118]:
# Pickle data is binary: open the target in "wb" mode, and use a context
# manager so the file is flushed and closed once the dump is done.
with open("./data/ann/vectors_30_70.txt", "wb") as vecfile:
    pickle.dump(vecorp_30_70, vecfile)

In [119]:
# Human-readable, pipe-separated copy of the 30-70 vectors (no index column).
vecorp_30_70.to_csv(
    './data/ann/vectors_30_70_readable.txt',
    sep='|',
    index=False,
)

### Corpus with 50-50 distribution

In [31]:
# Load the 50-50 corpus as a list of text lines decoded from UTF-8.
with io.open('data/corpus_50_50.txt', mode='r', encoding='utf8') as f:
    corpus_50_50 = list(f)

In [32]:
# Vectorise the 50-50 corpus: keep both the DataFrame and the word index,
# then show how many non-null entries each column holds.
(vecorp_50_50, dict_50_50) = list2vec(corpus_50_50)
vecorp_50_50.count()

category     15306
tweet_sec    15306
dtype: int64

In [33]:
# Preview the first five rows of the 50-50 vectors.
vecorp_50_50.head(n=5)

Unnamed: 0,category,tweet_sec
0,1,"[268, 459, 146, 2, 470, 4493]"
1,1,"[2264, 7, 9784, 3050]"
2,0,"[80, 172, 35, 1, 14, 39, 2265, 7, 9785, 2266, ..."
3,1,"[107, 41, 21, 471, 9, 659, 5, 216, 300, 5, 7, ..."
4,1,"[562, 106, 29, 563, 29, 422]"


In [41]:
# Persist the 50-50 word index; the context manager guarantees the file
# is flushed and closed once the dump is done.
with open("./data/ann/dicFile_50_50.txt", "wb") as dict_file:
    pickle.dump(dict_50_50, dict_file, pickle.HIGHEST_PROTOCOL)

In [34]:
# Pickle data is binary: open the target in "wb" mode, and use a context
# manager so the file is flushed and closed once the dump is done.
with open("./data/ann/vectors_50_50.txt", "wb") as vecfile:
    pickle.dump(vecorp_50_50, vecfile)

In [35]:
# Human-readable, pipe-separated copy of the 50-50 vectors (no index column).
vecorp_50_50.to_csv(
    './data/ann/vectors_50_50_readable.txt',
    sep='|',
    index=False,
)

In [40]:
# Additionally store the 50-50 word index with NumPy's .npy writer.
np.save(file="./data/ann/dicFile_50_50.npy", arr=dict_50_50)