In [52]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
import joblib
import os

# Download the NLTK resources used by the preprocessing step (cached after
# the first run, so re-running this cell is cheap).
# 'punkt_tab' is the tokenizer data that recent NLTK releases look up from
# word_tokenize; 'punkt' is kept for older releases that still read it.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [64]:
# Load the dataset
data = pd.read_csv("/content/roo_data.csv")  # Replace with your actual file
from tensorflow.keras.utils import to_categorical

# Columns whose values are joined (space-separated, in this order) into one
# text document per row. Keys are spelled exactly as the DataFrame expects.
text_columns = [
    'Interested Type of Books',
    'Salary Range Expected',
    'In a Realtionship?',
    'Gentle or Tuff behaviour?',
    'Management or Technical',
]

# Stringify every cell, but turn missing cells into empty strings: a plain
# astype(str) renders NaN as the literal word "nan", which is alphabetic and
# would therefore survive preprocessing and end up in the vocabulary.
text_parts = data[text_columns].astype(str).where(data[text_columns].notna(), '')

# Concatenate text columns
data['text'] = text_parts[text_columns[0]]
for column in text_columns[1:]:
    data['text'] = data['text'] + ' ' + text_parts[column]

# Define preprocessing functions
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests

def preprocess_text(text):
    """Normalise raw text into a single space-joined string of stemmed tokens.

    The input is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped.
    Each remaining token is lemmatised and then stemmed.

    Args:
        text: The string to clean (must support ``.lower()``).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filter.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers and stop words before any normalisation.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(stemmer.stem(lemmatizer.lemmatize(token)))
    return ' '.join(kept)

# Run every concatenated row through the cleaning pipeline
data['text'] = data['text'].map(preprocess_text)

# Swap the job-role strings for integer class ids; the encoder keeps the
# id -> role-name lookup in classes_
label_encoder = LabelEncoder()
label_encoder.fit(data['Suggested Job Role'])
data['Suggested Job Role'] = label_encoder.transform(data['Suggested Job Role'])
job_role_labels = label_encoder.classes_

# Features are the cleaned strings; labels are one-hot rows (one column per class)
X = data['text'].values
y = to_categorical(data['Suggested Job Role'].values)

# Fit the vocabulary on the cleaned text, then map each row to a list of
# word ids and right-pad all rows to the length of the longest one
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, padding='post')

# Persist the fitted tokenizer and label encoder for use at inference time
joblib.dump(tokenizer, 'tokenizer.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')


['label_encoder.joblib']

In [65]:
# Number of label rows (one per training sample)
y.shape[0]

20000

In [70]:
# Width of the one-hot label matrix, i.e. the number of classes
y.shape[-1]

34

In [73]:
import pickle
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from google.colab import files

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1  # +1 because word ids start at 1; 0 is the padding id
embedding_dim = 100
max_length = X_padded.shape[1]  # padded sequence length; reused when padding inference inputs
rnn_units = 64
num_classes = y.shape[1]  # derived from the one-hot labels rather than hard-coded

# Build the RNN model
model = Sequential([
    # mask_zero=True makes the RNN skip the zero padding appended by
    # pad_sequences(padding='post'); without it the final hidden state is
    # computed after a run of padding steps for every short sequence.
    # The deprecated `input_length` argument is dropped: the sequence length
    # is inferred from the data passed to fit().
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
    SimpleRNN(rnn_units, return_sequences=False),
    Dense(num_classes, activation='softmax')
])

# One-hot labels + softmax output -> categorical cross-entropy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model and capture the training history
history = model.fit(X_padded, y, epochs=5, batch_size=32, validation_split=0.2)



Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.0480 - loss: 3.5278 - val_accuracy: 0.0610 - val_loss: 3.5187
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0537 - loss: 3.5133 - val_accuracy: 0.0585 - val_loss: 3.5184
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0583 - loss: 3.5072 - val_accuracy: 0.0595 - val_loss: 3.5221
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0538 - loss: 3.5069 - val_accuracy: 0.0553 - val_loss: 3.5252
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0565 - loss: 3.5035 - val_accuracy: 0.0587 - val_loss: 3.5231


In [77]:
# Save the tokenizer
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save the label encoder
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# Save the padded sequences
with open('X_padded.pkl', 'wb') as f:
    pickle.dump(X_padded, f)

# Save the trained model (this pickles `model` itself, not the fit history)
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Also write the model with Keras' own saver, which does not depend on
# pickle compatibility between library versions
model.save('model.keras')

# Save the training history (per-epoch metrics recorded by model.fit)
with open('history.pkl', 'wb') as f:
    pickle.dump(history.history, f)

In [74]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_job_role(text_input, tokenizer, label_encoder, model, max_length,
                     preprocess=None):
    """Predict the job-role label for a single piece of text.

    The text is cleaned with the same preprocessing applied to the training
    data before it is tokenized. Without that step the raw words are not
    stop-word filtered, lemmatized or stemmed, so most of them do not match
    the tokenizer's vocabulary and are silently dropped.

    Args:
        text_input: Raw input string.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        model: Trained model exposing ``predict``.
        max_length: Sequence length the input is padded/truncated to.
        preprocess: Optional callable ``str -> str`` used to clean the text.
            Defaults to the notebook's ``preprocess_text``. Pass
            ``lambda s: s`` to feed the text through unchanged.

    Returns:
        The decoded label of the highest-scoring class.
    """
    if preprocess is None:
        # Looked up at call time so the function can be defined before or
        # after the preprocessing cell has been run.
        preprocess = preprocess_text
    cleaned_text = preprocess(text_input)

    # Tokenize and pad the input text (a batch of one)
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequences, maxlen=max_length, padding='post')

    # Predict the job role: argmax over the class axis gives the class index
    prediction = model.predict(padded_sequence)
    predicted_class = np.argmax(prediction, axis=1)

    # Convert the predicted class index to the actual job role label
    predicted_job_role = label_encoder.inverse_transform(predicted_class)

    return predicted_job_role[0]

# Example usage:
# text_input = "Your text input here"
# predicted_job_role = predict_job_role(text_input, tokenizer, label_encoder, model, max_length)
# print(f"Predicted Job Role: {predicted_job_role}")


In [75]:
# Run one free-text query through the trained model and show the decoded label
text_input = (
    "I have strong skills in programming, software engineering, and data analysis."
)
predicted_job_role = predict_job_role(
    text_input,
    tokenizer,
    label_encoder,
    model,
    max_length,
)
print(f"Predicted Job Role: {predicted_job_role}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 585ms/step
Predicted Job Role: Network Security Administrator
