In [3]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import spacy 
nlp = spacy.load("en_core_web_sm")
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.utils import to_categorical
from keras import layers
import numpy as np

User-defined Functions

In [2]:
def preprocess(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not (token.is_punct or token.is_stop) ]
    return ' '.join(tokens)

def find_max_len_string(texts):
    max_len = len(texts.loc[0])
    max_string = ''
    i = 0
    while i < len(df1):
        s = texts.loc[i]
        # print(s)
        if len(s) < max_len:
            max_len = len(s)
            max_string = s
        i += 1
    return  max_string

def findMinMax(arrays):
    max = None
    max_len = 0
    min_len = len(arrays[0])
    min = arrays[0]
    for array in arrays:
        size = len(array)
        if size > max_len:
            max = array
        if size < min_len:
            min = array
    return max, min

Read Data

In [4]:
df = pd.read_csv('../Assignments/archive/training.1600000.processed.noemoticon.csv', encoding = "cp1252")
first_row = df.columns
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
first_row_df = pd.DataFrame(data=None, columns = df.columns)
first_row_df.loc[0] = first_row
df = pd.concat([first_row_df,df])
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Choosing a user

In [5]:
user_frequency = df.user.value_counts()
preferred_users = user_frequency[user_frequency > 200]
preferred_users = preferred_users[user_frequency < 300]
print(preferred_users.index.values)

['SallytheShizzle' 'VioletsCRUK' 'mcraddictal' 'tsarnick' 'what_bugs_u'
 'Karen230683' 'DarkPiano' 'SongoftheOss' 'Jayme1988' 'keza34'
 'ramdomthoughts' 'shanajaca' 'wowlew' 'nuttychris' 'TraceyHewins'
 'thisgoeshere' 'Spidersamm' 'StDAY']


Let's choose user DarkPiano for this experiment

In [6]:
user = 'DarkPiano'
df1 = df[df.loc[:,"user"] == user]
df1 = df1.reset_index()
df1.shape

(236, 7)

Preprocessing the tweets

In [7]:
df1.text = df1.text.apply(preprocess)

Create training data

In [14]:
seq_length = 30

# Create a sequence of tokens
sequences = []
for i in range(len(df1)):
    tweet = df1.text.loc[i]

    for i in range(seq_length, len(tweet)):
        if len(tweet) > seq_length:
            seq = tweet[i-seq_length:i+1]
            sequences.append(seq)

# Convert sequences into numerical data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

sequences = pad_sequences(sequences, maxlen=seq_length+1, padding='post')
sequences = np.array(sequences)

# Take the last integer of the arrays as y and rest as X
X = sequences[:,:-1]
y = sequences[:,-1]
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
y = to_categorical(y, num_classes=vocab_size)

1971


In [15]:
print(X.shape, y.shape)

(2363, 30) (2363, 1971)


Define model

In [16]:
model = Sequential([
    layers.Embedding(vocab_size, 50, input_length=seq_length),
    layers.LSTM(100, return_sequences=True),
    layers.Dropout(0.1),
    layers.LSTM(100),
    layers.Dense(100, activation='relu'),
    layers.Dense(vocab_size, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            98550     
                                                                 
 lstm_2 (LSTM)               (None, 30, 100)           60400     
                                                                 
 dropout_1 (Dropout)         (None, 30, 100)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               80400     
                                                                 
 dense_2 (Dense)             (None, 100)               10100     
                                                                 
 dense_3 (Dense)             (None, 1971)              199071    
                                                                 
Total params: 448,521
Trainable params: 448,521
Non-tr

Train the model

In [17]:
model.fit(X, y , batch_size=128, epochs=100)

2023-05-13 16:44:48.448311: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 18629892 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f6df5f61d20>