In [10]:
import pandas as pd
import tensorflow as tf
import os
import tensorflow_text as tf_text
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [11]:
def first_data_prep():
    data = pd.read_csv('Data/twitter_data.csv', 
                      encoding='latin-1', names=['sentiment', 'id', 'date', 'flag', 'user',
                                                'text'])
    data['sentiment'].replace(4, 1, inplace=True)
    data = data.sample(frac=1).reset_index(drop=True).copy()
    data['text'] = data['text'].str.lower()
    return data

In [12]:
def delete_nicknames(row):
    # Prepare list of words
    words = row.split()
    # Remove nicknames
    for word in words:
        if word[0] == '@':
            words.remove(word)
    # Return string 
    return ' '.join(word for word in words)

In [13]:
def stratified_split(df, size):
    # Split dataset into smaller one 
    col_list = list(df.columns)
    # Drop target column name
    col_list.pop(0)
    x_train, x_valid = train_test_split(
    df, random_state=1, stratify=df['sentiment'], test_size=size)
    # Prepare new indexes 
    x_valid.reset_index(drop=True, inplace=True)
    return x_valid 
    

In [14]:
def replace_with_space(text):
    # Replace new rows with space 
    text = text.replace('\n', " ").replace("\r", " ")
    # Create list of all not needed chars 
    punc_list = '!"@#$%^&*()+_-.<>?/:;[]{}|\~'
    # Make transformation with dict that contains punc_list chars
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    # Apply transformation
    text = text.translate(t)
    # Replace single quote with empty char
    t = text.maketrans(dict.fromkeys("'`"))
    text.translate(t)
    
    return text
    

In [15]:
def remove_stop_words(text):
    # Prepare set of stopwords
    stop_words = set(stopwords.words('english'))
    # Remove stopwords from the text
    filtered_text = [word for word in text.split() if not word in stop_words]
    
    return filtered_text

In [49]:
def tokenize(data, num_words, num_words_pad): 
    data = data.copy()
    # Apply replace func that replace chars with spaces
    data['text'] = data['text'].apply(lambda x: replace_with_space(x)).copy()
    # Apply func that removes stop words
    data['text'] = data['text'].apply(lambda x: remove_stop_words(x))
    # Initialize tokenizer
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    # Updates internal vocabulary based on a list of texts 
    tok.fit_on_texts(list(data['text']))
    # Transforms each text in texts to a sequence of integers.
    seq = tok.texts_to_sequences(list(data['text']))
    # Pad sequences to make them same lenght 
    tf_ready = tf.keras.preprocessing.sequence.pad_sequences(seq)
    
    return tf_ready, tok

In [None]:
data = first_data_prep()
split_data= stratified_split(data, 0.3)
tf_ready, tok = tokenize(split_data, 10000, 15)

In [54]:
tf_df = pd.DataFrame(tf_ready)
tf_df['sentiment'] = data['sentiment']
tf_df.to_csv('tokenized_data.csv', index=False)

In [53]:
td_df = pd.DataFrame(tf_ready)
td_df.shape

(160000, 38)

In [48]:
len(tok.word_index)

147631

In [52]:
def strip(row):
    return row.split()

data['text'].apply(lambda x: strip(x))

0          [@elleasinswell, oh,, i'll, have, to, try, it!...
1          [@cmlundy, done!!!!!, i, really, need, one, to...
2          [lost, google, notebook, ie, add, on, with, th...
3          [@natalietran, at, least, they, have, a, moral...
4          [@kirstyhilton, ive, been, trying, to, get, mi...
                                 ...                        
1599995    [ain't, watching, the, laker, game,, i, can't,...
1599996    [bummed, about, the, softball, loss, 0-1, thes...
1599997    [back, in, god's, hands,, back, in, god's, han...
1599998                                         [bbq, party]
1599999    [@iamjemzie, what, time, is, this, and, where,...
Name: text, Length: 1600000, dtype: object

In [46]:
data['text'].str.len().max()

374

In [51]:
test.strip()

'@cmlundy done!!!!! i really need one too... aritzia hasnt gotten back yet  dammit!!'