In [1]:
import numpy as np

# Read the file and split each line into words
lines = [line.split() for line in open('/kaggle/input/dataset1/hashed_passwords.txt')]

# Join words in each line into a single string and filter out empty lines
joined_lines = [' '.join(line) for line in lines if line]

# Convert to numpy array (or keep as a list if preferred)
lines_array = np.array(joined_lines)
lines_array = list(map(str, lines_array))

print(lines_array[0])

# Print the result (it will be a 1D array of strings)

8d969eef6ecad3c29a3a629280e686cf0c3f5d5a86aff3ca12020c923adc6c92


In [2]:

# Read the file and split each line into words
linesl = [linel.split() for linel in open('/kaggle/input/dataset1/10-million-password-list-top-1000000.txt')]

# Join words in each line into a single string and filter out empty lines
joined_linesl = [' '.join(linel) for linel in linesl if linel]

# Convert to numpy array (or keep as a list if preferred)
lines_arrayl = np.array(joined_linesl)
lines_arrayl = list(map(str, lines_arrayl))
print(lines_arrayl[:5])


['123456', 'password', '12345678', 'qwerty', '123456789']


In [3]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sample data (replace with your custom data)


# Split the data into training and testing sets
questions_train, questions_test, answers_train, answers_test = train_test_split(lines_array, lines_arrayl, test_size=0.2, random_state=42)

# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(questions_train + answers_train)

# Convert texts to sequences
train_questions_seq = tokenizer.texts_to_sequences(questions_train)
train_answers_seq = tokenizer.texts_to_sequences(answers_train)

# Pad sequences to ensure uniform input size
max_length = max(len(seq) for seq in train_questions_seq + train_answers_seq)
train_questions_padded = pad_sequences(train_questions_seq, maxlen=max_length, padding='post')
train_answers_padded = pad_sequences(train_answers_seq, maxlen=max_length, padding='post')

# Prepare target data by shifting answers
train_answers_padded_shifted = np.zeros_like(train_answers_padded)
train_answers_padded_shifted[:, :-1] = train_answers_padded[:, 1:]

# Define the model with reduced complexity
model = models.Sequential()
model.add(layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=max_length))
model.add(layers.LSTM(32, return_sequences=True))  # Reduced number of LSTM units
model.add(layers.Dense(len(tokenizer.word_index) + 1, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model with reduced batch size
# Assuming train_questions_padded and train_answers_padded_shifted are numpy arrays
sample_size = int(0.1 * len(train_questions_padded))  # 10% of the dataset
indices = np.random.choice(len(train_questions_padded), sample_size, replace=False)
train_questions_padded = train_questions_padded[indices]
train_answers_padded_shifted = train_answers_padded_shifted[indices]

model.fit(train_questions_padded, np.expand_dims(train_answers_padded_shifted, -1), epochs=8, batch_size=16)





Epoch 1/8




[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3101s[0m 619ms/step - accuracy: 0.9972 - loss: 2.1715
Epoch 2/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9991 - loss: 0.0148
Epoch 3/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9992 - loss: 0.0120
Epoch 4/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3094s[0m 619ms/step - accuracy: 0.9991 - loss: 0.0115
Epoch 5/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9991 - loss: 0.0101
Epoch 6/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9990 - loss: 0.0101
Epoch 7/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9991 - loss: 0.0081
Epoch 8/8
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3095s[0m 619ms/step - accuracy: 0.9990 - loss: 0.0084


<keras.src.callbacks.history.History at 0x7e7e0538b430>

In [4]:
# Function to test the model with new input data
def test_model(input_question):
    # Convert the input question to sequence and pad it
    input_seq = tokenizer.texts_to_sequences([input_question])
    input_padded = pad_sequences(input_seq, maxlen=max_length, padding='post')

    # Make a prediction
    prediction = model.predict(input_padded)

    # Get the predicted word indices and convert them back to words
    predicted_indices = np.argmax(prediction[0], axis=-1)
    predicted_words = [tokenizer.index_word[idx] for idx in predicted_indices if idx > 0]  # Filter out padding

    return ' '.join(predicted_words)

# Test the model with a new question
new_question = "ef797c8118f02dfb649607dd5d3f8c7623048c9c063d532cc95c5ed7a898a64f"
predicted_answer = test_model(new_question)
print(f"Question: {new_question}\nPredicted Answer: {predicted_answer}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
Question: ef797c8118f02dfb649607dd5d3f8c7623048c9c063d532cc95c5ed7a898a64f
Predicted Answer: 1


In [5]:
model.save('/kaggle/working/trained.h5')
