In [1]:
import pandas as pd
import tensorflow as tf
import os
import tensorflow_text as tf_text
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [13]:
def first_data_prep():
    data = pd.read_csv('Datasets/trainingandtestdata/160k_train.csv', 
                      encoding='latin-1', names=['sentiment', 'id_number', 'date', 'query',
                                                'user', 'text'])
    data['sentiment'].replace(4, 1, inplace=True)
    data = data.sample(frac=1).reset_index(drop=True).copy()
    data['text'] = data['text'].str.lower()
    return data

In [14]:
def stratified_split(df, size):
    # Split dataset into smaller one 
    col_list = list(df.columns)
    # Drop target column name
    col_list.pop(0)
    x_train, x_valid = train_test_split(
    df, random_state=1, stratify=df['sentiment'], test_size=size)
    # Prepare new indexes 
    x_valid.reset_index(drop=True, inplace=True)
    return x_valid 
    

In [15]:
data = first_data_prep()
data = stratified_split(data, 0.15)

In [16]:
def replace_with_space(text):
    # Replace new rows with space 
    text = text.replace('\n', " ").replace("\r", " ")
    # Create list of all not needed chars 
    punc_list = '!"@#$%^&*()+_-.<>?/:;[]{}|\~'
    # Make transformation with dict that contains punc_list chars
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    # Apply transformation
    text = text.translate(t)
    # Replace single quote with empty char
    t = text.maketrans(dict.fromkeys("'`"))
    text.translate(t)
    
    return text
    

In [17]:
def remove_stop_words(text):
    # Prepare set of stopwords
    stop_words = set(stopwords.words('english'))
    # Remove stopwords from the text
    filtered_text = [word for word in text.split() if not word in stop_words]
    
    return filtered_text

In [18]:
def tokenize(data, num_words, num_words_pad): 
    data = data.copy()
    # Apply replace func that replace chars with spaces
    data['text'] = data['text'].apply(lambda x: replace_with_space(x)).copy()
    # Apply func that removes stop words
    data['text'] = data['text'].apply(lambda x: remove_stop_words(x))
    # Initialize tokenizer
    tok = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
    # Updates internal vocabulary based on a list of texts 
    tok.fit_on_texts(list(data['text']))
    # Transforms each text in texts to a sequence of integers.
    seq = tok.texts_to_sequences(list(data['text']))
    # Pad sequences to make them same lenght 
    tf_ready = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=num_words_pad)
    
    return tf_ready, tok

In [19]:
tf_ready, tok = tokenize(data, 1000, 100)

In [20]:
tf_df = pd.DataFrame(tf_ready)

In [76]:
data.reset_index(drop=True, inplace=True)

In [24]:
data

Unnamed: 0,sentiment,id_number,date,query,user,text
0,1,1882464257,Fri May 22 06:55:16 PDT 2009,NO_QUERY,xBritishBluex,sweet @sarah1025 &quot;tweet tweet&quot; said...
1,0,2206890830,Wed Jun 17 07:11:49 PDT 2009,NO_QUERY,lhopsss13,ugh i really don't want to work today. someone...
2,1,2176914163,Mon Jun 15 05:02:45 PDT 2009,NO_QUERY,misssxi,missing someone
3,0,2254585170,Sat Jun 20 09:58:42 PDT 2009,NO_QUERY,ACEmusicMGMT,ehhh work.... i'm jealous @josjrosemusic i wan...
4,0,2327217890,Thu Jun 25 08:04:06 PDT 2009,NO_QUERY,CharmingJes,on the way to get maddox's shots.
...,...,...,...,...,...,...
239995,1,1469246015,Tue Apr 07 05:44:50 PDT 2009,NO_QUERY,resurigurl,"@patreng if you have time, let's go back to ol..."
239996,0,1972570190,Sat May 30 09:28:27 PDT 2009,NO_QUERY,kinderriegel,tiny little bit bored. miss my sisters and bes...
239997,1,2014156932,Wed Jun 03 00:37:50 PDT 2009,NO_QUERY,SuneEmil,@yellowteacup17 sounds lovely the strawberry ...
239998,1,1957433816,Fri May 29 00:29:26 PDT 2009,NO_QUERY,kkjordan,@cory_a goodnight cory


In [26]:
tf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,...,293,8,145,145,8,210,90,877,135,16
1,0,0,0,0,0,0,0,0,0,0,...,0,218,22,27,12,11,157,4,29,251
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,246,157
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,12,1,481,27,57,101,453
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,63,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239995,0,0,0,0,0,0,0,0,0,0,...,974,559,6,14,144,27,214,23,190,321
239996,0,0,0,0,0,0,0,0,0,0,...,58,179,34,99,178,327,99,178,333,174
239997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,212,316,451,383,290,70,4
239998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,359
