In [12]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path from this Colab runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Location of the tweets CSV inside the mounted Drive; this path only resolves after drive.mount has run.
csv_path = '/content/drive/My Drive/Deep Learning/million_tweets_subset.csv'

**Data**

In [14]:
import pandas as pd

# Column labels handed to read_csv through `names`.
# NOTE(review): when `names` is given without `header=0`, pandas treats the first line of the
# file as data rather than as a header — confirm the CSV has no header row.
column_names = ["Date_and_Time", "Text", "Retweet", "Like", "Account_id", "Followers_count", "Following_count", "Tweets_count"]
# Read only the first 500,000 rows of the file.
df = pd.read_csv(csv_path, nrows=500000, names=column_names)
# Preview the first ten rows.
df.head(10)


Unnamed: 0,Date_and_Time,Text,Retweet,Like,Account_id,Followers_count,Following_count,Tweets_count
0,2019-02-01 00:00:00,RT : به نظر شما وقتی #جرمی_هانت وزیر امورخارجه...,981,0,0.1,144,220,184
1,2019-02-01 00:00:00,RT : بنر تصاویر دیکتاتور توسط #کانونهای_شورشی ...,163,0,1.0,1589,981,31282
2,2019-02-01 00:00:00,RT : مشهد به واسطه وجود حرم مطهر امام رضا(ع) و...,60,0,2.0,380,418,3614
3,2019-02-01 00:00:00,عاخه تو مغزشون کردن هرکسی که مثلا از نظام انت...,0,1,3.0,466,708,5421
4,2019-02-01 00:00:00,نرم باشه لطفاً.,0,2,4.0,381,755,4084
5,2019-02-01 00:00:00,ولي پولشو ميداد راضي تر بودم 😂,0,1,5.0,23,46,406
6,2019-02-01 00:00:00,بونگ بونگ بونگ بونگ بونگ بونگ بونگ بونگ بونگ ب...,0,0,6.0,11532,5,67972
7,2019-02-01 00:00:00,:))))))))))))))))) نیست؟؟؟,0,1,7.0,130,154,843
8,2019-01-31 23:59:59,از ساعت ۱۲-۱ به بعد هم اساسی فرو میکنه 🚶🏻‍♂️🚬,0,3,8.0,353,135,292
9,2019-01-31 23:59:59,ولی خودمونیم هیچی اون قر ریزای همراه آهنگ که ا...,0,14,9.0,826,348,14370


**Data Cleaning**

In [15]:
import re

# Emoji, pictographs and decorative symbols removed by clean_tweet. Only dedicated
# symbol blocks are listed: a catch-all range spanning U+24C2..U+1F251 would also
# delete CJK, Hangul and the Arabic presentation forms (U+FB50-U+FDFF, U+FE70-U+FEFF)
# that occur in Persian/Arabic text. Compiled once rather than on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F2FF"  # mahjong/cards, enclosed alphanumerics, regional-indicator flags
    "\U00002500-\U00002BEF"  # box drawing, geometric shapes, misc symbols, dingbats, arrows
    "\U000024C2"             # circled latin capital M
    "\U0000FE0F"             # variation selector-16 left behind by emoji such as U+2764 U+FE0F
    "]+"
)

# Links: any token starting with "http", or starting with a literal "www." prefix.
_LINK_PATTERN = re.compile(r'http\S+|www\.\S+')

# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_tweet(text):
    """Return ``text`` with links and emoji removed and whitespace normalised.

    Parameters
    ----------
    text : object
        The raw tweet. Non-string values are converted with ``str``. Missing
        values (``None`` or a float NaN, which is how pandas represents an
        empty cell) yield an empty string instead of the literal word "nan".

    Returns
    -------
    str
        The cleaned tweet: links dropped, emoji/symbol characters dropped,
        every whitespace run collapsed to one space, and both ends stripped.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    #link
    text = _LINK_PATTERN.sub('', text)

    #emoji
    text = _EMOJI_PATTERN.sub('', text)

    #additional space
    return _WHITESPACE_PATTERN.sub(' ', text).strip()

In [16]:
# Work on a copy so the frame loaded into `df` is left unmodified by the cleaning step.
df_clean = df.copy()

In [17]:
# Run clean_tweet on every value of the Text column and store the result back in df_clean.
df_clean['Text'] = df_clean['Text'].apply(clean_tweet)

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

# Tunable sizes, named once instead of being repeated as literals.
VOCAB_SIZE = 10000  # `num_words` for the Tokenizer; texts_to_sequences only emits indices below this
MAX_LEN = 25        # longest n-gram kept, counting the final (target) token

texts = df_clean['Text'].dropna().tolist()
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# Width of the model's output layer: one slot per index the tokenizer can emit
# (0 is reserved for padding), capped at VOCAB_SIZE.
total_words = min(VOCAB_SIZE, len(tokenizer.word_index) + 1)

# Identify OOV and padding
oov_idx = tokenizer.word_index[tokenizer.oov_token]
pad_idx = 0
excluded_targets = {oov_idx, pad_idx}

# n-gram sequences
# Every prefix of length 2..MAX_LEN of every tweet becomes one training example whose
# last token is the word to predict. Prefixes ending in OOV/padding are skipped because
# they give the model nothing to learn. Tokenising the whole list in one call and never
# building prefixes longer than MAX_LEN yields the same examples, in the same order, as
# generating all prefixes first and filtering afterwards, with far less memory.
filtered_seqs = [
    seq[:end]
    for seq in tokenizer.texts_to_sequences(texts)
    for end in range(2, min(len(seq), MAX_LEN) + 1)
    if seq[end - 1] not in excluded_targets
]

# pre-padding
# Zeros are added on the left so the target token is always the last column.
padded = pad_sequences(filtered_seqs, maxlen=MAX_LEN, padding='pre')

# Split into X and y
X = padded[:, :-1]   # context: the first MAX_LEN - 1 columns
y = padded[:, -1]    # target: index of the next word


In [20]:
# Print the first five (context, target) pairs, decoded back into words, as a sanity check.
for idx in range(5):
    inp_seq, label = X[idx], y[idx]

    # Index 0 is padding and has no vocabulary entry, so it is left out of the decoded text.
    words = []
    for tok in inp_seq:
        if tok != 0:
            words.append(tokenizer.index_word.get(tok, '<OOV>'))
    next_word = tokenizer.index_word.get(label, '<OOV>')

    print(f"Sample {idx+1}:")
    print("  Input tokens :", inp_seq)
    print("  Input text   :", " ".join(words))
    print("  Next token   :", label, f"({next_word})\n")

Sample 1:
  Input tokens : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
  Input text   : rt
  Next token   : 4 (به)

Sample 2:
  Input tokens : [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 4]
  Input text   : rt به
  Next token   : 142 (نظر)

Sample 3:
  Input tokens : [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   3   4 142]
  Input text   : rt به نظر
  Next token   : 34 (شما)

Sample 4:
  Input tokens : [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   3   4 142  34]
  Input text   : rt به نظر شما
  Next token   : 58 (وقتی)

Sample 5:
  Input tokens : [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   3   4 142  34  58]
  Input text   : rt به نظر شما وقتی
  Next token   : 7948 (جرمی)



In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Next-word model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
# The input shape is declared with an explicit Input layer; the `input_length`
# argument of Embedding is deprecated in Keras 3.
model = Sequential([
    tf.keras.Input(shape=(MAX_LEN - 1,)),   # X has MAX_LEN - 1 columns (the last column became y)
    Embedding(total_words, 128),
    Dropout(0.2),
    LSTM(128, return_sequences=True),       # emit every time step so the second LSTM sees a sequence
    Dropout(0.2),
    LSTM(64),
    Dense(total_words, activation='softmax')
])

# y holds integer word indices, hence the sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])




In [26]:
# Neighbouring rows of X are successive prefixes of the same tweet, so a 10,000-element
# shuffle buffer on its own only mixes examples locally and batches stay highly correlated.
# Permute the whole training set once up front (fixed seed so the order is reproducible);
# the shuffle buffer then adds a fresh local reshuffle every epoch.
# Note: the fancy-indexing below makes one shuffled copy of X and y in memory.
perm = np.random.default_rng(42).permutation(len(X))
dataset = tf.data.Dataset.from_tensor_slices((X[perm], y[perm]))
dataset = dataset.shuffle(10000).batch(512).prefetch(tf.data.AUTOTUNE)

model.fit(dataset, epochs=7)


Epoch 1/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 21ms/step - accuracy: 0.0585 - loss: 7.0557
Epoch 2/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 21ms/step - accuracy: 0.1192 - loss: 6.2173
Epoch 3/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 21ms/step - accuracy: 0.1475 - loss: 5.8954
Epoch 4/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 21ms/step - accuracy: 0.1653 - loss: 5.7097
Epoch 5/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 21ms/step - accuracy: 0.1784 - loss: 5.5814
Epoch 6/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 21ms/step - accuracy: 0.1885 - loss: 5.4869
Epoch 7/7
[1m11500/11500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 21ms/step - accuracy: 0.1963 - loss: 5.4154


<keras.src.callbacks.history.History at 0x785699357e60>

In [27]:
def sample_with_temperature(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    Parameters
    ----------
    preds : array-like of float
        One-dimensional vector of probabilities (e.g. one softmax output row).
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        ``0`` selects the most probable index deterministically.

    Returns
    -------
    integer
        The sampled position in ``preds``.

    Raises
    ------
    ValueError
        If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    probs = np.asarray(preds).astype('float64')

    # Limit of the scaled distribution as temperature -> 0; also avoids dividing by zero.
    if temperature == 0:
        return np.argmax(probs)

    # The small epsilon keeps log() finite for zero-probability entries.
    logits = np.log(probs + 1e-8) / temperature
    # Shift so the largest logit is 0: the softmax is unchanged, but exp() can no longer
    # underflow every entry to 0 (which produced NaN probabilities at small temperatures).
    logits -= np.max(logits)
    weights = np.exp(logits)
    return np.random.choice(len(weights), p=weights / np.sum(weights))


In [28]:
def predict_next_word(seed_text, num_words=1, temperature=1.0):
    """Extend ``seed_text`` with up to ``num_words`` words sampled from the model.

    Each step tokenises the text generated so far, left-pads it to the model's
    input length (``MAX_LEN - 1``), samples the next word index with
    ``sample_with_temperature`` and appends the corresponding word.

    Parameters
    ----------
    seed_text : str
        Text to continue.
    num_words : int, default 1
        Maximum number of words to append.
    temperature : float, default 1.0
        Passed through to ``sample_with_temperature``.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, separated by spaces.
        Fewer than ``num_words`` words are appended if a sampled index does not
        map to a real word.
    """
    # Indices that never stand for a real word: 0 is the padding value, plus the
    # tokenizer's out-of-vocabulary placeholder when one is configured.
    blocked = [0]
    if tokenizer.oov_token is not None:
        blocked.append(tokenizer.word_index[tokenizer.oov_token])

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=MAX_LEN-1, padding='pre')
        probs = np.array(model.predict(token_list, verbose=0)[0], dtype='float64')
        # Take padding/OOV out of the candidate set so generation neither stops early on
        # index 0 nor emits a literal "<OOV>" that would then be fed back into the model.
        probs[blocked] = 0.0
        next_index = sample_with_temperature(probs, temperature)
        next_word = tokenizer.index_word.get(next_index, '')
        # A blocked index can still be drawn with negligible probability if the sampler
        # smooths zero probabilities; stop rather than append a placeholder.
        if not next_word or next_index in blocked:
            break
        seed_text += " " + next_word
    return seed_text


In [29]:
# Generate one word after the seed phrase at temperature 0.5; the word is sampled at random, so it can differ between runs.
print(predict_next_word(" شاهزاده رضا", num_words=1, temperature=0.5))


 شاهزاده رضا پهلوی
