In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel

def load_data(train_file, test_file):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_file: Location of the training CSV, passed to ``pd.read_csv``.
        test_file: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        A ``(train_frame, test_frame)`` tuple, in that order.
    """
    # Training file is read first, then the test file
    train_frame, test_frame = (pd.read_csv(source) for source in (train_file, test_file))
    return train_frame, test_frame

def preprocess_data(train_data, test_data):
    """Keep only the modelling columns and drop rows with missing values.

    Args:
        train_data: DataFrame containing 'id', 'Job Description' and 'skills'.
        test_data: DataFrame containing 'id' and 'Job Description'.

    Returns:
        ``(train, test)``: the column subsets with every row holding a
        missing value removed.
    """
    feature_columns = ['id', 'Job Description']
    # The training set additionally carries the target column
    train_subset = train_data[feature_columns + ['skills']]
    test_subset = test_data[feature_columns]
    return train_subset.dropna(), test_subset.dropna()

def get_bert_embeddings(text):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``model``.

    The text is tokenized (truncated to at most 128 tokens), run through the
    model, and the model's ``pooler_output`` is returned as a NumPy array with
    all size-1 dimensions squeezed out.

    Relies on the module-level names ``tokenizer`` and ``model`` being
    assigned before the first call.

    Args:
        text: The text to embed.

    Returns:
        numpy.ndarray holding the squeezed ``pooler_output``.
    """
    import torch  # local import: this cell's import block does not bring torch into scope

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built
    # (and kept alive) for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.squeeze().detach().numpy()

def train_model(X_train, y_train, params=None):
    """Fit a ``RandomForestClassifier`` on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        params: Optional dict of ``RandomForestClassifier`` keyword arguments.
            Its entries override the defaults ``n_estimators=100`` and
            ``random_state=42``; keys it does not mention keep the default.

    Returns:
        The fitted classifier.
    """
    # Start from the defaults so that a partial override -- such as
    # GridSearchCV's best_params_, which only holds the tuned keys -- does not
    # silently drop the fixed seed and make the final model irreproducible.
    settings = {'n_estimators': 100, 'random_state': 42}
    if params:
        settings.update(params)
    classifier = RandomForestClassifier(**settings)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(classifier, X_val, y_val, average='micro'):
    """Score ``classifier`` on a validation set.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_val: Validation feature matrix.
        y_val: True validation labels.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``'micro'``.
            For single-label multiclass targets, micro-averaged precision,
            recall and F1 all equal accuracy, so pass ``'macro'`` or
            ``'weighted'`` to get per-class-sensitive numbers.

    Returns:
        ``(accuracy, precision, recall, f1)``.
    """
    y_val_pred = classifier.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    precision = precision_score(y_val, y_val_pred, average=average)
    recall = recall_score(y_val, y_val_pred, average=average)
    f1 = f1_score(y_val, y_val_pred, average=average)
    return accuracy, precision, recall, f1

def generate_predictions(classifier, X_test, test_data, output_path='submission.csv'):
    """Predict skills for the test set and write them to a CSV file.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix, one row per row of ``test_data``.
        test_data: DataFrame whose 'id' column labels the predictions.
        output_path: Destination CSV file. Defaults to ``'submission.csv'``.

    Returns:
        None. The file has two columns, 'id' and 'skills', and no index.
    """
    test_predictions = classifier.predict(X_test)
    submission = pd.DataFrame({
        'id': test_data['id'],
        'skills': test_predictions
    })
    submission.to_csv(output_path, index=False)
    print(f"Submission file created: {output_path}")

if __name__ == "__main__":
    # Read both CSVs, then reduce them to the complete rows of the needed columns
    train_data, test_data = load_data('train.csv', 'test.csv')
    train_data, test_data = preprocess_data(train_data, test_data)

    # Pretrained BERT pieces; get_bert_embeddings reads these two
    # module-level names, so they must keep exactly these names
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Store one embedding per job description next to its row
    for frame in (train_data, test_data):
        frame['embedding'] = frame['Job Description'].apply(get_bert_embeddings)

    # Hold out 20% of the labelled rows for validation
    train_features = np.array(list(train_data['embedding']))
    X_train, X_val, y_train, y_val = train_test_split(
        train_features,
        train_data['skills'],
        test_size=0.2,
        random_state=42,
    )

    # 3-fold grid search over the number of trees
    param_grid = {'n_estimators': [50, 100, 200]}
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # Refit on the training split with the winning parameters
    classifier = train_model(X_train, y_train, params=best_params)

    # Report validation metrics, one per line
    accuracy, precision, recall, f1 = evaluate_model(classifier, X_val, y_val)
    for metric_name, metric_value in (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1-Score', f1),
    ):
        print(f'{metric_name}: {metric_value}')

    # Predict on the test embeddings and write the submission file
    test_features = np.array(list(test_data['embedding']))
    generate_predictions(classifier, test_features, test_data)




Accuracy: 0.975609756097561
Precision: 0.975609756097561
Recall: 0.975609756097561
F1-Score: 0.975609756097561
Submission file created: submission.csv


In [None]:

#tushar
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt')

# Read the unlabelled test set and the labelled training set
test_df, train_df = pd.read_csv('test.csv'), pd.read_csv('train.csv')

# Nothing is loaded from disk here: both estimators are created untrained
# and fitted on the training data just below
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)  # Update with your vectorizer
svm_model = SVC(kernel='poly', C=1)  # Update with your best parameters

# Learn the TF-IDF vocabulary from the training descriptions, then fit the
# SVM on those vectors with the 'skills' column as the label
X_train_vec = tfidf_vectorizer.fit_transform(train_df['Job Description'])
svm_model.fit(X_train_vec, train_df['skills'])

# Vectorize the test descriptions with the fitted vocabulary and attach the
# predicted label to each test row
X_test_vec = tfidf_vectorizer.transform(test_df['Job Description'])
predicted_skills = svm_model.predict(X_test_vec)
test_df['Predicted Skills'] = predicted_skills


# Plain-text resume layout, filled in by generate_resume through str.format
# with named fields. Its character length is also the denominator of the
# "ROUGH" score computed in this cell.
# NOTE(review): "ROUGH" is presumably meant as a ROUGE-like metric; as
# computed here it is a plain length ratio -- confirm the intent.
resume_template = """
[Resume]

[Job Title]: {Job Title}

[Responsibilities]:
{Responsibilities}

[Qualifications]:
{Qualifications}

[Skills]:
{Skills}

[Additional Information]:
- Experience: {Experience}
- Location: {Location}
- Company: {Company}
ROUGH Score: {ROUGH_score:.2f}
"""

# Function to generate resumes
def generate_resume(job_desc):
    """Render the plain-text resume for one job posting.

    Args:
        job_desc: Mapping-like row (for example a DataFrame row) providing
            'Job Title', 'Responsibilities', 'Qualifications', 'Experience',
            'location', 'Company' and 'Predicted Skills'. 'Predicted Skills'
            may be a single string or a collection of skill strings.

    Returns:
        ``resume_template`` with every placeholder filled in.
    """
    predicted = job_desc['Predicted Skills']
    # A single predicted label is already the display string (the previous
    # join/split/join was a per-character round trip that returned it
    # unchanged); only a collection of skills needs joining.
    skills_formatted = predicted if isinstance(predicted, str) else ', '.join(predicted)

    # Length of the posting's content relative to the template's length.
    # The skills text is counted as-is: joining a string with ' ' would
    # insert a space between every character and nearly double its length.
    content_length = len(job_desc['Responsibilities'] + job_desc['Qualifications'] + skills_formatted)

    return resume_template.format(
        **{
            'Job Title': job_desc['Job Title'],
            'Responsibilities': job_desc['Responsibilities'],
            'Qualifications': job_desc['Qualifications'],
            'Skills': skills_formatted,
            'Experience': job_desc['Experience'],
            'Location': job_desc['location'],
            'Company': job_desc['Company'],
            'ROUGH_score': content_length / len(resume_template)
        }
    )

# Function to compute BLEU score
def compute_bleu(reference, candidate):
    """Smoothed sentence-level BLEU of ``candidate`` against one ``reference``.

    Both texts are lower-cased and tokenized with ``nltk.word_tokenize``;
    scoring uses ``SmoothingFunction().method4``.

    Args:
        reference: Reference text.
        candidate: Text being scored.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    ref_tokens, candidate_tokens = [
        nltk.word_tokenize(text.lower()) for text in (reference, candidate)
    ]
    return sentence_bleu(
        [ref_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

# Generate one resume per test row, scoring each against its job description
bleu_scores = []
rough_scores = []
resumes = []
for index, job_desc in test_df.iterrows():
    generated_resume = generate_resume(job_desc)
    bleu_score = compute_bleu(job_desc['Job Description'], generated_resume)
    bleu_scores.append(bleu_score)
    # Content length relative to the template length
    rough_score = len(job_desc['Responsibilities'] + job_desc['Qualifications'] + ''.join(job_desc['Predicted Skills'])) / len(resume_template)
    rough_scores.append(rough_score)
    resumes.append(generated_resume)

# Save the generated resumes to a file.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the source
# data are written the same way on every platform instead of depending on the
# locale's default encoding.
with open('test_resume.txt', 'w', encoding='utf-8') as f:
    for resume, bleu_score, rough_score in zip(resumes, bleu_scores, rough_scores):
        f.write(f"BLEU Score: {bleu_score}\nROUGH Score: {rough_score:.2f}\n\n{resume}\n\n")

print("Resumes generated successfully and saved to 'test_resume.txt'.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Resumes generated successfully and saved to 'test_resume.txt'.


In [None]:
# Perfect Bhavesh Resume TESTING


import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from docx import Document
from docx.shared import RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

nltk.download('punkt')

# Read the unlabelled test set and the labelled training set
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Both estimators are created untrained here (nothing is loaded from disk)
svm_model = SVC(kernel='poly', C=1)  # Update with your best parameters
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)  # Update with your vectorizer

# Build the TF-IDF vocabulary from the training descriptions and fit the SVM
# on the resulting vectors, using the 'skills' column as the label
train_descriptions = train_df['Job Description']
X_train_vec = tfidf_vectorizer.fit_transform(train_descriptions)
svm_model.fit(X_train_vec, train_df['skills'])

# Project the test descriptions onto the fitted vocabulary
test_descriptions = test_df['Job Description']
X_test_vec = tfidf_vectorizer.transform(test_descriptions)

# Attach the predicted label to every test row
predicted_skills = svm_model.predict(X_test_vec)
test_df['Predicted Skills'] = predicted_skills

# Function to generate resumes
def generate_resume(job_desc):
    """Build a Word resume for one job posting and save it as ``resume/<id>_resume.docx``.

    Args:
        job_desc: Mapping-like row providing 'id', 'Job Title',
            'Responsibilities', 'Qualifications', 'Experience', 'location',
            'Company' and 'Predicted Skills'. 'Predicted Skills' may be a
            single string or a collection of skill strings.

    Returns:
        None. The document is written to disk.
    """
    import os  # local import: this cell's import block does not bring os into scope

    # A single predicted label is already the display string (the previous
    # join/split/join returned it unchanged); only a collection needs joining.
    predicted = job_desc['Predicted Skills']
    skills_formatted = predicted if isinstance(predicted, str) else ', '.join(predicted)

    # Length of the posting's content relative to the text template's length.
    # The skills text is counted as-is; joining a string with ' ' would insert
    # a space between every character and nearly double its length.
    # NOTE(review): resume_template is not assigned anywhere in this cell, so
    # it must already exist in the kernel from another cell -- confirm which.
    rough_score = len(job_desc['Responsibilities'] + job_desc['Qualifications'] + skills_formatted) / len(resume_template)

    doc = Document()
    doc.add_heading('Resume', level=1)

    # Restyle this document's 'Heading 1' so every section label is blue and bold
    heading_style = doc.styles['Heading 1']
    heading_style.font.color.rgb = RGBColor(0, 0, 255)
    heading_style.font.bold = True

    black = RGBColor(0, 0, 0)

    # Label in the heading style, value appended as a black run on the same line
    doc.add_paragraph("Job Title: ", style='Heading 1').add_run(f"{job_desc['Job Title']}").font.color.rgb = black

    for label, value in (
        ("Responsibilities:", job_desc['Responsibilities']),
        ("Qualifications:", job_desc['Qualifications']),
        ("Skills:", skills_formatted),
    ):
        doc.add_paragraph(label, style='Heading 1')
        doc.add_paragraph(f"{value}", style='Normal')

    # Colour is a run-level property in python-docx: a paragraph has no
    # ``font`` attribute, so the text is added as a run and coloured there.
    doc.add_paragraph(style='Heading 1').add_run("Additional Information:").font.color.rgb = black

    for label, value in (
        ("Experience: ", job_desc['Experience']),
        ("Location: ", job_desc['location']),
        ("Company: ", job_desc['Company']),
        ("ROUGH Score: ", f"{rough_score:.2f}"),
    ):
        doc.add_paragraph(label, style='Heading 1')
        doc.add_paragraph(f"{value}", style='Normal')

    # Make sure the output folder exists before saving into it
    os.makedirs('resume', exist_ok=True)
    doc.save(f"resume/{job_desc['id']}_resume.docx")

# Build one Word resume per test-set row; generate_resume saves each
# document itself, so nothing is collected here
for index, job_desc in test_df.iterrows():
    generate_resume(job_desc)

print("Resumes generated successfully.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Resumes generated successfully.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
import os

nltk.download('punkt')

# Read the test set and the labelled training set
test_df, train_df = (pd.read_csv(csv_name) for csv_name in ('test.csv', 'train.csv'))

# Both estimators start untrained (nothing is loaded from disk) and are
# fitted on the training data below
svm_model = SVC(kernel='poly', C=1)  # Update with your best parameters
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)  # Update with your vectorizer

# Learn the TF-IDF vocabulary from the training descriptions, then fit the
# SVM on those vectors with the 'skills' column as the label
X_train_vec = tfidf_vectorizer.fit_transform(train_df['Job Description'])
svm_model.fit(X_train_vec, train_df['skills'])

# Markdown resume layout, filled in by generate_resume through str.format.
# The fields are positional, in this order: name, job title, responsibilities,
# qualifications, formatted skills, experience, location, company, ROUGH score
# (rendered with two decimals). The template's character length is also the
# denominator of that score.
# NOTE(review): the skills field is preceded by a literal '*' while
# generate_resume already wraps each skill in asterisks -- confirm the
# intended Markdown.
resume_template = """
# Resume for {}

**Job Title:** {}

**Responsibilities:**
{}

**Qualifications:**
{}

**Skills:**
*{}

**Additional Information:**
- Experience: {}
- Location: {}
- Company: {}
ROUGH Score: {:.2f}
"""

# Function to generate resumes
def generate_resume(job_desc, name):
    """Render the Markdown resume for one job posting.

    Args:
        job_desc: Mapping-like row providing 'Job Title', 'Responsibilities',
            'Qualifications', 'Experience', 'location', 'Company' and
            'Predicted Skills'. 'Predicted Skills' may be a single
            comma-separated string or a collection of skill strings.
        name: Name shown in the resume heading.

    Returns:
        ``resume_template`` with every placeholder filled in; each skill is
        wrapped in asterisks on its own line.
    """
    predicted = job_desc['Predicted Skills']
    if isinstance(predicted, str):
        # Split the label into individual skills. Iterating the string
        # directly would yield single characters, one per output line.
        # NOTE(review): assumes skills are comma-separated -- confirm against
        # the training data.
        skills_list = [skill.strip() for skill in predicted.split(',') if skill.strip()]
    else:
        skills_list = list(predicted)
    skills_formatted = '*{}*'.format('*\n*'.join(skills_list))

    # Length of the posting's content relative to the template's length
    rough_score = len(job_desc['Responsibilities'] + job_desc['Qualifications'] + ''.join(predicted)) / len(resume_template)

    return resume_template.format(
        name,
        job_desc['Job Title'],
        job_desc['Responsibilities'],
        job_desc['Qualifications'],
        skills_formatted,
        job_desc['Experience'],
        job_desc['location'],
        job_desc['Company'],
        rough_score
    )

# Function to compute BLEU score
def compute_bleu(reference, candidate):
    """Score ``candidate`` against ``reference`` with smoothed sentence BLEU.

    Each text is lower-cased and split with ``nltk.word_tokenize`` before
    scoring; smoothing is ``SmoothingFunction().method4``.

    Args:
        reference: Reference text.
        candidate: Text being scored.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    def tokens_of(text):
        # Case-insensitive comparison: normalise before tokenizing
        return nltk.word_tokenize(text.lower())

    reference_tokens = tokens_of(reference)
    hypothesis_tokens = tokens_of(candidate)
    method = SmoothingFunction().method4
    score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=method)
    return score

# Function to generate resume based on user input
def generate_resume_from_input():
    """Prompt for resume details, predict skills, and save a Markdown resume.

    Reads name, job title, responsibilities, qualifications, experience,
    location and company from ``input()``, predicts skills from the
    responsibilities text with the module-level ``tfidf_vectorizer`` and
    ``svm_model``, renders the resume with ``generate_resume`` and writes it
    to a file name that is also read from ``input()``.

    Returns:
        None.
    """
    name = input("Enter Your Name: ")
    job_title = input("Enter Job Title: ")
    responsibilities = input("Enter Responsibilities: ")
    qualifications = input("Enter Qualifications: ")
    experience = input("Enter Experience: ")
    location = input("Enter Location: ")
    company = input("Enter Company: ")

    # The responsibilities text stands in for a job description when predicting
    X_test_vec = tfidf_vectorizer.transform([responsibilities])
    predicted_skills = svm_model.predict(X_test_vec)

    job_desc = pd.Series({
        'Job Title': job_title,
        'Responsibilities': responsibilities,
        'Qualifications': qualifications,
        'Experience': experience,
        'location': location,
        'Company': company,
        'Predicted Skills': predicted_skills[0],
    })

    generated_resume = generate_resume(job_desc, name)

    filename = input("Enter the filename to save the resume (include .md extension): ")
    # The rendered resume is passed `name` for its own heading, so no extra
    # "# Resume for ..." line is written here (that would duplicate the title).
    # UTF-8 is pinned so the file does not depend on the locale's encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(generated_resume)

    print(f"Resume generated successfully and saved to '{filename}'.")

# Run the interactive flow once when the cell executes; the call blocks on
# input() prompts until every field and the output filename are entered
generate_resume_from_input()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: Interrupted by user

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from docx import Document
from docx.shared import RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

nltk.download('punkt')

# Read the test set and the labelled training set
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

# Estimators are constructed untrained here; no saved model is loaded
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)  # Update with your vectorizer
svm_model = SVC(kernel='poly', C=1)  # Update with your best parameters

# Fit the TF-IDF vocabulary on the training descriptions, then train the SVM
# on the resulting vectors using the 'skills' column as the label
train_descriptions = train_df['Job Description']
X_train_vec = tfidf_vectorizer.fit_transform(train_descriptions)
svm_model.fit(X_train_vec, train_df['skills'])

# Function to generate skills based on job title
def generate_skills(job_title):
    """Predict the skills label for a job title.

    The title is vectorized on its own, as a one-document batch, with the
    module-level ``tfidf_vectorizer`` and classified by ``svm_model``.

    Args:
        job_title: Job title text.

    Returns:
        Whatever ``svm_model.predict`` returns for that single document
        (one prediction).
    """
    X_test_vec = tfidf_vectorizer.transform([job_title])
    return svm_model.predict(X_test_vec)

# Function to generate resume
def generate_resume(name, job_title, responsibilities, qualifications, experience, location, company, coding_profile_link):
    """Build a Word resume from the given details and save it as ``<file name>.docx``.

    Skills are predicted from ``job_title`` via ``generate_skills`` and listed
    one bullet per skill. The output file name is read from ``input()``.

    Args:
        name: Person's name.
        job_title: Job title; also the text skills are predicted from.
        responsibilities: Responsibilities text.
        qualifications: Qualifications text.
        experience: Experience text.
        location: Location text.
        company: Company text.
        coding_profile_link: Link shown under "Coding Profile Link".

    Returns:
        None. The document is written to disk.
    """
    # Each predicted label can list several skills; split them so every skill
    # gets its own bullet instead of one bullet holding the whole label.
    # NOTE(review): assumes skills are comma-separated -- confirm against the
    # training data.
    skills = [
        skill.strip()
        for label in generate_skills(job_title)
        for skill in str(label).split(',')
        if skill.strip()
    ]

    doc = Document()
    doc.add_heading('Resume', level=1)

    # Restyle this document's 'Heading 1' so every section label is blue and bold
    heading_style = doc.styles['Heading 1']
    heading_style.font.color.rgb = RGBColor(0, 0, 255)
    heading_style.font.bold = True

    def add_labelled_line(label, value):
        # Label in the heading style, value appended as a black run on the same line
        doc.add_paragraph(label, style='Heading 1').add_run(f"{value}").font.color.rgb = RGBColor(0, 0, 0)

    def add_section(label, body):
        # Heading-styled label followed by the body as its own paragraph
        doc.add_paragraph(label, style='Heading 1')
        doc.add_paragraph(f"{body}", style='Normal')

    add_labelled_line("Name: ", name)
    add_labelled_line("Job Title: ", job_title)
    add_section("Responsibilities:", responsibilities)
    add_section("Qualifications:", qualifications)
    add_labelled_line("Experience: ", experience)
    add_labelled_line("Location: ", location)
    add_labelled_line("Company: ", company)
    add_labelled_line("Coding Profile Link: ", coding_profile_link)

    doc.add_paragraph("Skills:", style='Heading 1')
    skills_paragraph = doc.add_paragraph(style='Normal')
    for skill in skills:
        skills_paragraph.add_run(f"• {skill}\n")

    # Ask user for the file name to save
    file_name = input("Enter the file name to save the resume: ")

    doc.save(f"{file_name}.docx")
    print("Resume generated successfully.")

# Collect the resume fields interactively; each input() call blocks until
# the user answers, and the prompts run in the order listed
name = input("Enter your name: ")
job_title = input("Enter your job title: ")
responsibilities = input("Enter your responsibilities: ")
qualifications = input("Enter your qualifications: ")
experience = input("Enter your experience: ")
location = input("Enter your location: ")
company = input("Enter your company: ")
coding_profile_link = input("Enter your coding profile link: ")

# Build and save the Word resume; generate_resume itself prompts once more
# for the output file name
generate_resume(name, job_title, responsibilities, qualifications, experience, location, company, coding_profile_link)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Enter your name: asdf
Enter your job title: Ui ux designer
Enter your responsibilities: good knowledge of desing, design prinicple, component making, 
Enter your qualifications: btech
Enter your experience: 0
Enter your location: pune
Enter your company: uber
Enter your coding profile link: www.bhavesh.com
Enter the file name to save the resume: bhavesh2
Resume generated successfully.


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertTokenizer, BertModel
import joblib

def load_data(train_file, test_file):
    """Load the training and test sets from CSV.

    Args:
        train_file: Location of the training CSV, passed to ``pd.read_csv``.
        test_file: Location of the test CSV, passed to ``pd.read_csv``.

    Returns:
        Tuple ``(train_data, test_data)`` of DataFrames.
    """
    # Tuple elements are evaluated left to right: training file first
    return pd.read_csv(train_file), pd.read_csv(test_file)

def preprocess_data(train_data, test_data):
    """Select the modelling columns and discard incomplete rows.

    Args:
        train_data: DataFrame containing 'id', 'Job Description' and 'skills'.
        test_data: DataFrame containing 'id' and 'Job Description'.

    Returns:
        ``(train, test)`` tuple of the reduced frames, with every row that
        has a missing value removed.
    """
    selections = (
        (train_data, ['id', 'Job Description', 'skills']),
        (test_data, ['id', 'Job Description']),
    )
    cleaned = []
    for frame, columns in selections:
        cleaned.append(frame[columns].dropna())
    return tuple(cleaned)

def get_bert_embeddings(text, tokenizer, model):
    """Embed ``text`` with the given tokenizer and model.

    The text is tokenized (truncated to at most 128 tokens), run through the
    model, and the model's ``pooler_output`` is returned as a NumPy array with
    all size-1 dimensions squeezed out.

    Args:
        text: The text to embed.
        tokenizer: Callable tokenizer returning keyword inputs for ``model``.
        model: Callable model whose output exposes ``pooler_output``.

    Returns:
        numpy.ndarray holding the squeezed ``pooler_output``.
    """
    import torch  # local import: this cell's import block does not bring torch into scope

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    # Inference only: disable autograd so no computation graph is built
    # (and kept alive) for every row that gets embedded.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.pooler_output.squeeze().detach().numpy()

def train_model(X_train, y_train, params=None):
    """Fit a ``RandomForestClassifier`` on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        params: Optional dict of ``RandomForestClassifier`` keyword arguments.
            Its entries override the defaults ``n_estimators=100`` and
            ``random_state=42``; keys it does not mention keep the default.

    Returns:
        The fitted classifier.
    """
    # Layer the caller's overrides on top of the defaults. Passing only the
    # tuned keys (as GridSearchCV's best_params_ does) must not discard the
    # fixed seed, otherwise the final model differs from run to run.
    settings = {'n_estimators': 100, 'random_state': 42, **(params or {})}
    classifier = RandomForestClassifier(**settings)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(classifier, X_val, y_val, average='micro'):
    """Score ``classifier`` on a validation set.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        X_val: Validation feature matrix.
        y_val: True validation labels.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``'micro'``.
            For single-label multiclass targets, micro-averaged precision,
            recall and F1 all equal accuracy, so pass ``'macro'`` or
            ``'weighted'`` to get per-class-sensitive numbers.

    Returns:
        ``(accuracy, precision, recall, f1)``.
    """
    y_val_pred = classifier.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    # The three averaged metrics share one call shape
    precision, recall, f1 = (
        scorer(y_val, y_val_pred, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

def save_models(classifier, tokenizer, model):
    """Persist the classifier, tokenizer and BERT model under ``model/``.

    Each object is written with ``joblib.dump`` to a fixed file name inside
    the ``model`` directory, which is created if it does not exist yet.

    Args:
        classifier: Trained classifier to save.
        tokenizer: Tokenizer to save.
        model: BERT model to save.

    Returns:
        None.
    """
    import os  # local import: this cell's import block does not bring os into scope

    # joblib.dump does not create missing parent directories
    os.makedirs('model', exist_ok=True)

    # NOTE(review): these are pickle-based files, so they should only ever be
    # loaded from trusted sources. The tokenizer and model presumably also
    # support the library's own save_pretrained format -- consider it if the
    # consumers of these files allow.
    joblib.dump(classifier, 'model/random_forest_model.pkl')
    joblib.dump(tokenizer, 'model/bert_tokenizer.pkl')
    joblib.dump(model, 'model/bert_model.pkl')

if __name__ == "__main__":
    # Read both CSVs, then reduce them to the complete rows of the needed columns
    train_data, test_data = load_data('train.csv', 'test.csv')
    train_data, test_data = preprocess_data(train_data, test_data)

    # Pretrained BERT tokenizer and encoder used for the embeddings
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def embed(text):
        # Bind the tokenizer/model pair so .apply only has to supply the text
        return get_bert_embeddings(text, tokenizer, model)

    # Store one embedding per job description next to its row
    train_data['embedding'] = train_data['Job Description'].apply(embed)
    test_data['embedding'] = test_data['Job Description'].apply(embed)

    # Hold out 20% of the labelled rows for validation
    train_features = np.array(list(train_data['embedding']))
    X_train, X_val, y_train, y_val = train_test_split(
        train_features,
        train_data['skills'],
        test_size=0.2,
        random_state=42,
    )

    # 3-fold grid search over the number of trees
    param_grid = {'n_estimators': [50, 100, 200]}
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_

    # Refit on the training split with the winning parameters
    classifier = train_model(X_train, y_train, params=best_params)

    # Report validation metrics, one per line
    accuracy, precision, recall, f1 = evaluate_model(classifier, X_val, y_val)
    for metric_name, metric_value in (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1-Score', f1),
    ):
        print(f'{metric_name}: {metric_value}')

    # Persist the classifier together with the tokenizer and encoder
    save_models(classifier, tokenizer, model)




Accuracy: 0.975609756097561
Precision: 0.975609756097561
Recall: 0.975609756097561
F1-Score: 0.975609756097561
