In [299]:
# Import required libraries
import getpass
import os
import pickle
import ssl

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Download required NLTK resources.
# nltk.download() returns a status flag; the last call's value is what the
# cell displays as its output.
# NOTE(review): `stopwords` and `word_tokenize` are imported above, but no cell
# visible in this notebook calls them — confirm these downloads are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [300]:
# Read the candidate records that will be scored from the Excel workbook
predict_df = pd.read_excel(io='prediction_data.xlsx')

# Preview the top of the frame; the bare last expression renders it as a table
print("Initial Dataset:\n")
predict_df.head(5)


Initial Dataset:



Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,ID,Name,Role,Transcript,Resume,Reason for decision,Job Description,num_words_in_transcript
0,514,537,rivash0038,lahar singh,software engineer,**lahar singh: software engineer candidate int...,**lahar singh**\n**software engineer candidate...,"expected_experience : 9+ years, domains: e-com...","communicated ideas clearly and effectively., h...",956
1,214,225,benjry660,benjamin ryan,data engineer,interview transcript: data engineer position\n...,here's a sample resume for benjamin ryan apply...,cultural fit,we are looking for a skilled data engineer wit...,551
2,1408,1467,rivash0968,amisha bedi,data scientist,"**interview transcript: amisha bedi, data scie...",**candidate profile: amisha bedi**\n\n**role:*...,"expected_experience : 6-8 years, domains: heal...","lacked key technical skills for the role., nee...",612
3,1071,1122,rivash0623,kairav mishra,product manager,**interview transcript: product manager positi...,**kairav mishra: product manager**\n\nas a sea...,"expected_experience : 6-8 years, domains: tech...","had impressive experience and qualifications.,...",793
4,390,410,bradgr792,bradley gross,product manager,product manager interview transcript\n\ninterv...,here's a sample resume for bradley gross apply...,cultural fit,we are looking for a skilled product manager w...,665


In [301]:
# Free-text columns that feed the TF-IDF features
text_columns = ['Transcript', 'Resume', 'Job Description', 'Reason for decision']

# Normalise case so that TF-IDF treats "Python" and "python" as the same token;
# columns that are absent from this dataset are skipped
for col in text_columns:
    if col not in predict_df.columns:
        continue
    lowered = predict_df[col].str.lower()
    predict_df[col] = lowered


In [302]:
# Remove exact duplicate rows, then replace every remaining missing value with
# the placeholder string 'Not Specified'
predict_df = predict_df.drop_duplicates().fillna('Not Specified')

In [303]:
# Sanity check: per-column count of missing values after the fill step
print("Null values in combined dataset:")
null_counts = predict_df.isna().sum()
null_counts


Null values in combined dataset:


Unnamed: 0,0
Unnamed: 0.1,0
Unnamed: 0,0
ID,0
Name,0
Role,0
Transcript,0
Resume,0
Reason for decision,0
Job Description,0
num_words_in_transcript,0


In [304]:
# Distinct roles present in the dataset.
# Uses `predict_df` (the frame loaded above); the previous name `prediction_df`
# is never defined in this notebook and raises NameError on a fresh kernel.
predict_df['Role'].unique()

array(['software engineer', 'data engineer', 'data scientist',
       'product manager', 'data analyst', 'ui designer', 'ui engineer'],
      dtype=object)

In [305]:
# Number of candidates per role (count of non-null IDs in each Role group).
# Uses `predict_df`; the previous name `prediction_df` is never defined in this
# notebook and raises NameError on a fresh kernel.
unique_count = predict_df.groupby('Role')['ID'].count()
unique_count

Unnamed: 0_level_0,ID
Role,Unnamed: 1_level_1
data analyst,9
data engineer,14
data scientist,20
product manager,21
software engineer,22
ui designer,9
ui engineer,5


In [306]:
# Fit a single TF-IDF vocabulary on the text of every available text column so
# that transcripts, resumes and job descriptions share one feature space
tfidf_vectorizer = TfidfVectorizer()
text_series = []
for col in text_columns:
    if col in predict_df.columns:
        text_series.append(predict_df[col])
all_text = pd.concat(text_series)
tfidf_vectorizer.fit(all_text)

In [307]:
# Project each text column into the shared TF-IDF space (one sparse row per candidate)
tfidf_transcript, tfidf_resume, tfidf_job_desc = (
    tfidf_vectorizer.transform(predict_df[column_name])
    for column_name in ('Transcript', 'Resume', 'Job Description')
)

In [308]:
# Row-wise cosine similarity between each candidate document and the job
# description on the same row; values are assigned positionally to the frame
n_rows = len(predict_df)
for feature_name, doc_matrix in (
    ('resume_job_similarity', tfidf_resume),
    ('transcript_job_similarity', tfidf_transcript),
):
    predict_df[feature_name] = [
        cosine_similarity(doc_matrix[row], tfidf_job_desc[row])[0][0]
        for row in range(n_rows)
    ]

In [309]:
# Prepare features and target variable
features = ['resume_job_similarity', 'transcript_job_similarity']

# Use the ground-truth 'decision' column when the dataset provides one.
# Only when it is missing, fall back to placeholder labels. They are drawn from
# a seeded generator so that Restart & Run All reproduces the same split, model
# and predictions.
# WARNING: placeholder labels are random noise. A model trained on them cannot
# learn anything meaningful; replace them with real labels before relying on
# the predictions.
if 'decision' not in predict_df.columns:
    rng = np.random.default_rng(42)
    predict_df['decision'] = rng.choice(['selected', 'rejected'], size=len(predict_df))

In [310]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X, y = predict_df[features], predict_df['decision']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [311]:
# Build a 100-tree random forest with a fixed seed and fit it on the training split
random_forest = RandomForestClassifier(random_state=42, n_estimators=100)
random_forest.fit(X_train, y_train)

In [312]:
# Save the fitted vectorizer and classifier so they can be reloaded later.
# `pickle` comes from the notebook's import cell; it was previously used here
# without ever being imported, which raises NameError on a fresh kernel.
# Security note: only unpickle files you created yourself — loading a pickle
# from an untrusted source can execute arbitrary code.
for artifact_path, artifact in (
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('random_forest_model.pkl', random_forest),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

In [313]:
# Score every row of the dataset with the trained forest and store the labels.
# Note that this includes the rows the model was trained on.
X_new = predict_df.loc[:, features]
predictions = random_forest.predict(X_new)
predict_df = predict_df.assign(predicted_decision=predictions)

In [314]:
# Show each candidate ID next to its predicted decision
prediction_view = predict_df.loc[:, ['ID', 'predicted_decision']]
prediction_view

Unnamed: 0,ID,predicted_decision
0,rivash0038,rejected
1,benjry660,rejected
2,rivash0968,rejected
3,rivash0623,selected
4,bradgr792,selected
...,...,...
95,rivash0939,rejected
96,rivash0073,selected
97,rivash0509,selected
98,rivash0096,rejected


In [315]:
# Tally the predictions once. Series.get(..., 0) avoids a KeyError when the
# model happens to predict only one of the two classes.
decision_counts = predict_df['predicted_decision'].value_counts()
selected_count = decision_counts.get('selected', 0)
rejected_count = decision_counts.get('rejected', 0)

print(f"Selected: {selected_count}")
print(f"Rejected: {rejected_count}")


Selected: 60
Rejected: 40


In [317]:
# Write the scored frame (all columns, without the index) to an Excel workbook
predict_df.to_excel(excel_writer='predictions_output.xlsx', index=False)

In [318]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import os

class EmailSender:
    """Send plain-text e-mails, optionally with one file attachment, via SMTP with STARTTLS.

    Attributes:
        providers: Mapping of provider name ('gmail', 'outlook', 'yahoo') to a
            dict holding that provider's 'smtp_server' host and 'port'.
        provider: Name of the provider whose settings ``send_email`` uses.
    """

    def __init__(self, provider='gmail'):
        """Store the provider table and remember which provider to use.

        Args:
            provider: Key into ``self.providers``. It is validated when an
                e-mail is sent, not here.
        """
        self.providers = {
            'gmail': {
                'smtp_server': 'smtp.gmail.com',
                'port': 587
            },
            'outlook': {
                'smtp_server': 'smtp-mail.outlook.com',
                'port': 587
            },
            'yahoo': {
                'smtp_server': 'smtp.mail.yahoo.com',
                'port': 587
            }
        }
        self.provider = provider

    def send_email(self, sender_email, sender_password, to_email, subject, body, file_path=None):
        """Build and send one e-mail through the configured provider.

        Args:
            sender_email: Address used for the From header and the SMTP login.
            sender_password: Password (or app password) for the SMTP login.
            to_email: Address used for the To header.
            subject: Subject line.
            body: Plain-text message body.
            file_path: Optional path of a file to attach. If the path does not
                exist, a warning is printed and the e-mail is sent without it.

        Returns:
            True if the message was handed to the SMTP server, False if any
            step failed. Failures are printed rather than raised.
        """
        try:
            # Fail with a readable message instead of a bare KeyError
            if self.provider not in self.providers:
                raise ValueError(
                    f"Unknown email provider {self.provider!r}; "
                    f"expected one of {sorted(self.providers)}"
                )

            # Configure SMTP server details
            smtp_server = self.providers[self.provider]['smtp_server']
            port = self.providers[self.provider]['port']

            # Create message
            message = MIMEMultipart()
            message["From"] = sender_email
            message["To"] = to_email
            message["Subject"] = subject
            message.attach(MIMEText(body, "plain"))

            # Attach file if provided
            if file_path:
                if os.path.exists(file_path):
                    with open(file_path, "rb") as attachment:
                        part = MIMEBase("application", "octet-stream")
                        part.set_payload(attachment.read())

                    encoders.encode_base64(part)
                    # Passing the filename as a header parameter lets the email
                    # package quote/encode it, so names containing spaces or
                    # special characters survive intact.
                    part.add_header(
                        "Content-Disposition",
                        "attachment",
                        filename=os.path.basename(file_path),
                    )
                    message.attach(part)
                else:
                    # Do not drop the attachment silently
                    print(f"Warning: attachment not found, sending without it: {file_path}")

            # Send email
            with smtplib.SMTP(smtp_server, port) as server:
                # An explicit default context turns on certificate and hostname
                # verification; starttls() without a context does not verify.
                server.starttls(context=ssl.create_default_context())
                server.login(sender_email, sender_password)
                server.send_message(message)
                print("Email sent successfully!")
                return True

        except Exception as e:
            # Deliberately broad: callers rely on the boolean result, not on exceptions
            print(f"Email sending failed: {e}")
            return False

# Example usage
if __name__ == "__main__":
    # Never commit credentials to a notebook. The password is read from the
    # EMAIL_APP_PASSWORD environment variable, with an interactive prompt as a
    # fallback. Any app password that was previously stored in this cell
    # should be revoked.
    sender_password = os.environ.get("EMAIL_APP_PASSWORD") or getpass.getpass("Email app password: ")

    email_sender = EmailSender(provider='gmail')  # Can change to 'outlook' or 'yahoo'
    email_sender.send_email(
        # Addresses can be overridden through the environment without editing the cell
        sender_email=os.environ.get("EMAIL_SENDER", "uppariupendra11@gmail.com"),
        sender_password=sender_password,
        to_email=os.environ.get("EMAIL_RECIPIENT", "21r21a66k0@mlrinstitutions.ac.in"),
        subject="Test Email",
        body="This is a test email.",
        # Same relative path the predictions were written to, instead of an
        # absolute path that only exists on one machine
        file_path="predictions_output.xlsx"
    )

Email sent successfully!
