In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
import nltk

# Download the NLTK resources used for tokenising, stop-word removal and
# lemmatising (a no-op when they are already present).
# 'punkt_tab' is the tokenizer data that word_tokenize looks up on recent NLTK
# releases (3.9+); 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-04-07 22:16:26.546275: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-07 22:16:27.104320: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow wi

True

In [2]:
# The first CSV column is a saved row index; read it as the index rather than
# as a spurious "Unnamed: 0" data column.
df = pd.read_csv("pcl.csv", index_col=0)

In [3]:
# Preview a bounded number of rows instead of rendering the whole frame
df.head(10)

Unnamed: 0,Project Description,Cover Letter
0,I need a Python developer for a long-term part...,Subject: Application for Long-Term Python Deve...
1,Need to write a file handling program in Pytho...,Subject: Application for Python Programming Tu...
2,I need someone to help me with Agent to contro...,Subject: Application for Python Programming As...
3,We are seeking a skilled Python coder to assis...,Subject: Application for Python Coding Assignm...
4,"I have a scenario with a set of questions, you...",Certainly! Here's a cover letter-style introdu...
5,i have a python script. It is download a file ...,Subject: Application for Enhancement of Python...
6,I need an individual to scrape data from ali e...,Subject: Application for Web Scraping Project:...
7,\n\nJoin our dynamic team as a Junior Python D...,Subject: Application for Junior Python Develop...
8,\n\nAre you passionate about Python programmin...,Subject: Application for Junior Python Develop...
9,Need this function:\n\nUse Python to:\nMake an...,Subject: Application for Text-to-Speech Functi...


In [4]:
# Text preprocessing function
def preprocess_text(text):
    """Reduce raw text to a space-separated string of lemmatised content words.

    Steps: strip '**' markers and newlines, tokenise with NLTK, strip
    punctuation, keep purely alphabetic tokens, drop English stop words
    (case-insensitively) and lemmatise what is left.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for empty CSV cells) is
            treated as empty text.

    Returns:
        The surviving words joined by single spaces, in their original order
        and casing; ``''`` when nothing survives.
    """
    # Missing values are not strings and have no .replace(); treat them as empty
    if not isinstance(text, str):
        return ''

    # Remove '**' markers and newline characters
    text = text.replace('**', '')
    text = text.replace('\n', ' ')

    # Tokenization
    tokens = word_tokenize(text)
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [word.translate(table) for word in tokens]
    # Remove non-alphabetic tokens (this also drops tokens emptied by the step above)
    words = [word for word in stripped if word.isalpha()]
    # Remove stop words. NLTK's English list is all lowercase, so compare on
    # the lowercased token; otherwise capitalised stop words such as "I",
    # "We" or "The" slip through.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.lower() not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)


In [5]:



# Extract project descriptions and cover letters.
# Rows missing either field are dropped as a pair: empty CSV cells load as
# float NaN, which preprocess_text cannot handle, and dropping both sides
# together keeps the two lists aligned index-for-index.
text_pairs = df[['Project Description', 'Cover Letter']].dropna()
project_descriptions = text_pairs['Project Description'].tolist()
cover_letters = text_pairs['Cover Letter'].tolist()

# Preprocess the project descriptions and cover letters
preprocessed_project_descriptions = [preprocess_text(desc) for desc in project_descriptions]
preprocessed_cover_letters = [preprocess_text(letter) for letter in cover_letters]

In [6]:


# Fit one vocabulary per side: descriptions (model input) and cover letters (model output)
tokenizer_desc = Tokenizer()
tokenizer_cover = Tokenizer()
tokenizer_desc.fit_on_texts(preprocessed_project_descriptions)
tokenizer_cover.fit_on_texts(preprocessed_cover_letters)

# Map every text to its list of integer word ids
desc_sequences = tokenizer_desc.texts_to_sequences(preprocessed_project_descriptions)
cover_sequences = tokenizer_cover.texts_to_sequences(preprocessed_cover_letters)

# Both sides share one length: that of the single longest sequence on either side
longest_desc = max(map(len, desc_sequences))
longest_cover = max(map(len, cover_sequences))
max_seq_length = max(longest_desc, longest_cover)

# Right-pad every sequence up to that common length
X = pad_sequences(desc_sequences, maxlen=max_seq_length, padding='post')
y = pad_sequences(cover_sequences, maxlen=max_seq_length, padding='post')

In [7]:

# Hold out 20% of the (description, cover letter) pairs for validation;
# the fixed random_state keeps the split repeatable between runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# Model architecture
def create_model(input_vocab_size, output_vocab_size, max_seq_length, hidden_units):
    """Build and compile an LSTM encoder-decoder for sequence-to-sequence text.

    The encoder embeds the input ids and compresses them into one vector with
    an LSTM; that vector is repeated ``max_seq_length`` times and decoded by a
    second LSTM into a per-position softmax over the output vocabulary.

    Args:
        input_vocab_size: Size of the input vocabulary, counting the id 0 that
            is used for padding.
        output_vocab_size: Number of classes predicted at each output position.
        max_seq_length: Number of output positions the decoder produces.
        hidden_units: Embedding dimension and width of both LSTM layers.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss).
    """
    model = tf.keras.Sequential([
        # mask_zero=True makes the encoder LSTM skip padded (id 0) timesteps,
        # so its final state reflects the real tokens rather than whatever
        # trailing padding follows them.
        tf.keras.layers.Embedding(input_vocab_size, hidden_units, mask_zero=True),
        tf.keras.layers.LSTM(hidden_units),
        tf.keras.layers.RepeatVector(max_seq_length),
        tf.keras.layers.LSTM(hidden_units, return_sequences=True),
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(output_vocab_size, activation='softmax'))
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

In [None]:




# Model size settings; each vocabulary gets one extra slot on top of its word count
hidden_units = 256
input_vocab_size = 1 + len(tokenizer_desc.word_index)
output_vocab_size = 1 + len(tokenizer_cover.word_index)

# Build the network
model = create_model(
    input_vocab_size=input_vocab_size,
    output_vocab_size=output_vocab_size,
    max_seq_length=max_seq_length,
    hidden_units=hidden_units,
)

# Fit on the training split, evaluating on the validation split after every epoch
batch_size = 64
epochs = 5
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
)


Epoch 1/10


2024-04-07 22:16:40.213831: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT32 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


In [None]:

# Write the trained model to disk, then read it straight back from the same file
model_path = 'cover_letter_generator_model.h5'
model.save(model_path)
model = tf.keras.models.load_model(model_path)


In [None]:
# New project description
new_project_description = ["I need a python react js developer"]

# Preprocess the new project description
preprocessed_new_project_description = preprocess_text(new_project_description[0])

# Convert the preprocessed project description to sequences using the tokenizer
new_project_sequence = tokenizer_desc.texts_to_sequences([preprocessed_new_project_description])

# Pad the sequence to match the input sequence length used during training
new_project_padded_sequence = pad_sequences(new_project_sequence, maxlen=max_seq_length, padding='post')

# Predict the cover letter: one probability distribution over the
# cover-letter vocabulary for every output position
predicted_cover_letter_sequence = model.predict(new_project_padded_sequence)

# Convert the predicted sequence back to text using the tokenizer for cover letters
predicted_cover_letter_text = []
for sequence in predicted_cover_letter_sequence[0]:  # Take the first sequence (as there's only 1)
    # Work on a float64 copy and renormalise so the probabilities sum to 1
    # as closely as possible before sampling (guards against round-off in
    # the lower-precision model output).
    probabilities = np.asarray(sequence, dtype=np.float64)
    probabilities /= probabilities.sum()
    # Sample a token based on its probability distribution
    sampled_token_index = np.random.choice(len(probabilities), p=probabilities)
    # Convert the index to its corresponding word
    word = tokenizer_cover.index_word.get(int(sampled_token_index), '')
    # Index 0 is padding and has no word; skip it instead of appending an
    # empty string, which would leave runs of blanks in the joined text.
    if word:
        predicted_cover_letter_text.append(word)

# Join the words to form the predicted cover letter text
predicted_cover_letter_text = ' '.join(predicted_cover_letter_text)

# Print the generated cover letter
print("Generated Cover Letter:")
print(predicted_cover_letter_text)