# LSTM Corpus Creation

In [1]:
import io
import numpy as np
import pandas as pd

In [2]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
import pickle

In [4]:
tokenizer = Tokenizer(num_words=20000,lower=False)

In [5]:
def read_ds(ds_list):
    supreme = []
    for tweet in ds_list:
        supreme.append(tweet.split('|',2)[-2:])
    return supreme

def  change_value(strings):
    if strings == 'ironic':
        return 1
    else:
        return 0

In [6]:
def saveDict(path, sequence_dict):
    return np.save(path, sequence_dict)
    

def getSequence(tweets_text):
    tokenizer.fit_on_texts(tweets_text)
    seq_dict = tokenizer.word_index
    return tokenizer.texts_to_sequences(tweets_text), seq_dict


def list2vec(corpus_list):
    arr = np.array(read_ds(corpus_list))
    df = pd.DataFrame(data=arr,columns=['cat','tweet'])
    alter = df['cat'].apply(change_value)
    df.insert(loc=2,column='category',value=alter)
    tweets = df['tweet'].iloc[:]
    tweet_sec, s_dict = getSequence(tweets)
    df.insert(loc=3,column='tweet_sec', value=tweet_sec)
    return df[['category','tweet_sec']], s_dict

# IMPORTANT CLARIFICATION!!!
In order to build a data set from easy way, for saving and reading, we used the *Pickle Library*, in this particular case we save and read a **Pandas DataFrame**. Because, we consider this is the best way for setting up a data set for future use in any kind of ANN Model, but we have to say, this is not for human purposes, because the nature of *Pickle Library*, try to read the *txt file* can result impossible.  

For this situation, we also save a data set copy by using a **Pandas Method** which is called **to_csv()**, thus we have a readable data set. We do not consider this path for saving and reading the data sets, because we found a **big trouble** when we try to save the *data frame*, **Pandas** saves a string instead of vector (**tweet_sec**), and we prefer to avoid this trouble in order to not increase the complexy of the project.


In [8]:
df = pd.read_csv('./data/ann/vectors_10_90_readable.txt',sep='|')

In [9]:
thing = df.tweet_sec.iloc[::][0] 
print thing
print type(thing)

[1750, 1550, 2072, 1, 46, 792, 588, 1750, 1550, 3982, 10, 46, 5925, 2, 32, 5505, 1, 205, 4, 3]
<type 'str'>


### Corpus with 10-90 distribution

In [109]:
with io.open('data/corpus_10_90.txt','r',encoding='utf8') as f:
    corpus_10_90 = f.readlines()

In [110]:
vecorp_10_90 = list2vec(corpus_10_90) 
vecorp_10_90.count()

category     76530
tweet_sec    76530
dtype: int64

In [112]:
vecorp_10_90.head()

Unnamed: 0,category,tweet_sec
0,0,"[1750, 1550, 2072, 1, 46, 792, 588, 1750, 1550..."
1,0,"[43, 1562, 15, 13, 438, 9, 130]"
2,1,"[309, 25, 296, 31, 310, 91, 25, 145, 294, 21, ..."
3,0,"[24, 4522, 1385, 4415, 23, 21, 25, 2850, 6146,..."
4,0,"[762, 24, 17280, 2, 645, 7, 21, 511, 40, 15364..."


In [113]:
vecfile = open("./data/ann/vectors_10_90.txt","w")
pickle.dump(vecorp_10_90, vecfile)

In [114]:
vecorp_10_90.to_csv('./data/ann/vectors_10_90_readable.txt',sep='|',index=False)

### Corpus with 30-70 distribution

In [115]:
with io.open('data/corpus_30_70.txt','r',encoding='utf8') as f:
    corpus_30_70 = f.readlines()

In [116]:
vecorp_30_70 = list2vec(corpus_30_70)
vecorp_30_70.count()

category     25510
tweet_sec    25510
dtype: int64

In [25]:
vecorp_30_70.head()

NameError: name 'vecorp_30_70' is not defined

In [118]:
vecfile = open("./data/ann/vectors_30_70.txt","w")
pickle.dump(vecorp_30_70, vecfile)

In [119]:
vecorp_30_70.to_csv('./data/ann/vectors_30_70_readable.txt',sep='|',index=False)

### Corpus with 50-50 distribution

In [7]:
with io.open('data/corpus_50_50.txt','r',encoding='utf8') as f:
    corpus_50_50 = f.readlines()

In [8]:
vecorp_50_50, dict_50_50 = list2vec(corpus_50_50)
vecorp_50_50.count()

category     15306
tweet_sec    15306
dtype: int64

In [9]:
vecorp_50_50.head()

Unnamed: 0,category,tweet_sec
0,1,"[268, 459, 146, 2, 470, 4493]"
1,1,"[2264, 7, 9784, 3050]"
2,0,"[80, 172, 35, 1, 14, 39, 2265, 7, 9785, 2266, ..."
3,1,"[107, 41, 21, 471, 9, 659, 5, 216, 300, 5, 7, ..."
4,1,"[562, 106, 29, 563, 29, 422]"


In [15]:
dict_50_50['http://link']

KeyError: '@'

In [14]:
values = dict_50_50.values()
values.sort()
print values[-1]
print len(values)

29505
29505


In [44]:
dict_file = open("./data/ann/dicFile_50_50.p","wb")
pickle.dump(dict_50_50, dict_file)

In [15]:
vecfile = open("./data/ann/vectors_50_50.txt","w")
pickle.dump(vecorp_50_50, vecfile)

In [16]:
vecorp_50_50.to_csv('./data/ann/vectors_50_50_readable.txt',sep='|',index=False)

In [17]:
saveDict("./data/ann/dicFile_50_50.npy", dict_50_50)