<a href="https://colab.research.google.com/github/aplha27/Infosys_internship/blob/main/infosys_assignment_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install openpyxl



In [3]:
!pip install scikit-learn==1.2.2




In [4]:
!pip install xgboost



In [None]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load dataset
# The path is relative, so the workbook must be in the notebook's working
# directory before this cell runs.
data_path = 'cleaned_data.xlsx'
# Columns read from df further down in this cell: 'Cleaned_Resume',
# 'Cleaned_Transcript', 'Cleaned_Job_Description', 'Role',
# 'Reason for decision' and 'decision'.
df = pd.read_excel(data_path)

# Feature extraction functions
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Null input (``None`` / NaN) yields 0.
    """
    if pd.isnull(text):
        return 0
    tokens = text.split()
    return len(tokens)

def count_characters(text):
    """Return the length of ``text`` in characters, or 0 for null input."""
    if pd.isnull(text):
        return 0
    return len(text)

def avg_word_length(text):
    """Return the mean length of the whitespace-separated tokens in ``text``.

    Returns 0 when ``text`` is null or contains no tokens.
    """
    if pd.isnull(text):
        return 0
    tokens = text.split()
    if not tokens:
        return 0
    total_chars = sum(map(len, tokens))
    return total_chars / len(tokens)

def count_sentences(text):
    """Estimate the number of sentences in ``text``.

    The text is split on runs of sentence terminators (``.``, ``!``, ``?``)
    and every non-blank fragment counts as one sentence. Compared with
    counting terminator characters, this means:

    * a run such as ``...`` or ``?!`` ends a single sentence, not several;
    * a final sentence without closing punctuation is still counted.

    This is a heuristic: periods inside numbers or abbreviations still split.

    Returns 0 for null or empty input.
    """
    if pd.isnull(text):
        return 0
    fragments = re.split(r'[.!?]+', text)
    # Blank fragments come from leading/trailing terminators; they are not sentences.
    return sum(1 for fragment in fragments if fragment.strip())

def count_uppercase_ratio(text):
    """Return the fraction of characters in ``text`` that are uppercase.

    Returns 0 when ``text`` is null or empty.
    """
    if pd.isnull(text) or len(text) == 0:
        return 0
    uppercase_total = len([char for char in text if char.isupper()])
    return uppercase_total / len(text)

def keyword_count(text, keywords):
    """Count occurrences of ``keywords`` in ``text``, case-insensitively.

    ``text`` is lower-cased and split on whitespace. Each keyword is treated
    as a phrase of one or more whitespace-separated tokens and matched against
    consecutive tokens of the text, so multi-word entries such as
    ``'machine learning'`` are counted. Comparing single tokens against the
    keyword collection can never match those. Single-word keywords are
    counted once per matching token.

    Args:
        text: The document to scan; null (``None`` / NaN) yields 0.
        keywords: Iterable of keyword strings.

    Returns:
        Total number of keyword/phrase occurrences as an int.
    """
    if pd.isnull(text):
        return 0
    tokens = text.lower().split()
    # Normalise keywords to token tuples; the set removes case-only duplicates.
    phrases = {tuple(keyword.lower().split()) for keyword in keywords}
    phrases.discard(())  # a blank keyword has no tokens to match
    widths = {len(phrase) for phrase in phrases}
    # Slide a window of each phrase width over the tokens.
    return sum(
        1
        for width in widths
        for start in range(len(tokens) - width + 1)
        if tuple(tokens[start:start + width]) in phrases
    )

def unique_word_ratio(text):
    """Return distinct tokens divided by total tokens for ``text``.

    Tokens are whitespace-separated and compared case-sensitively.
    Returns 0 when ``text`` is null or has no tokens.
    """
    if pd.isnull(text):
        return 0
    tokens = text.split()
    if not tokens:
        return 0
    distinct_tokens = set(tokens)
    return len(distinct_tokens) / len(tokens)

def check_role_in_resume(resume, role):
    """Return 1 if ``role`` occurs in ``resume`` (case-insensitive substring), else 0.

    Either argument may be null (``None`` / NaN); in that case the result is 0.
    Without the null check on ``role``, a missing role cell would raise
    ``AttributeError`` on ``.lower()``.

    Args:
        resume: Resume text.
        role: Role title to look for.

    Returns:
        1 when the role is found in the resume, otherwise 0.
    """
    if pd.isnull(resume) or pd.isnull(role):
        return 0
    # str() keeps non-string cells (e.g. numeric values read from Excel) from raising.
    return 1 if str(role).lower() in str(resume).lower() else 0

def keyword_overlap(text1, text2):
    """Return how many distinct whitespace-separated tokens the two texts share.

    Tokens are compared exactly (case-sensitive). If either text is null the
    overlap is 0.
    """
    if pd.isnull(text1) or pd.isnull(text2):
        return 0
    shared_tokens = set(text1.split()).intersection(text2.split())
    return len(shared_tokens)

# Predefined keyword sets (Python sets of lower-case strings, passed to keyword_count).
# Note: 'machine learning' is a two-word entry; it is only counted if
# keyword_count matches phrases rather than single whitespace-split tokens.
technical_keywords = {'python', 'java', 'sql', 'machine learning', 'cloud', 'design', 'analysis', 'management'}
positive_keywords = {'excellent', 'success', 'outstanding', 'achievement', 'skilled'}
negative_keywords = {'poor', 'inadequate', 'lacking', 'failure', 'weak'}

# Feature engineering
# The resume and the transcript receive the same per-document statistics, added
# in the same column order; only the resume is scored against technical_keywords.
for prefix, text_column in (('resume', 'Cleaned_Resume'), ('transcript', 'Cleaned_Transcript')):
    for stat_name, stat_func in (
        ('word_count', count_words),
        ('char_count', count_characters),
        ('avg_word_length', avg_word_length),
        ('sentence_count', count_sentences),
        ('uppercase_ratio', count_uppercase_ratio),
    ):
        df[f'{prefix}_{stat_name}'] = df[text_column].apply(stat_func)

    keyword_sets = [('positive', positive_keywords), ('negative', negative_keywords)]
    if prefix == 'resume':
        keyword_sets.insert(0, ('technical', technical_keywords))
    for label, vocabulary in keyword_sets:
        # apply() runs immediately, so the lambda sees the current vocabulary.
        df[f'{prefix}_{label}_keyword_count'] = df[text_column].apply(
            lambda text: keyword_count(text, vocabulary)
        )

    df[f'{prefix}_unique_word_ratio'] = df[text_column].apply(unique_word_ratio)

# 1 when the role title appears inside the resume text, else 0.
df['job_role_in_resume'] = df.apply(
    lambda row: check_role_in_resume(row['Cleaned_Resume'], row['Role']),
    axis=1,
)

# Number of distinct tokens each document shares with the job description.
for prefix, text_column in (('resume', 'Cleaned_Resume'), ('transcript', 'Cleaned_Transcript')):
    df[f'{prefix}_job_keyword_overlap'] = df.apply(
        lambda row: keyword_overlap(row[text_column], row['Cleaned_Job_Description']),
        axis=1,
    )

# Role popularity (frequency encoding): each row gets the number of rows with the same Role.
role_counts = df['Role'].value_counts()
df['role_popularity'] = df['Role'].map(role_counts)

# Decision reason encoding: integer category codes for the free-text reason.
# NOTE(review): 'Reason for decision' looks like information recorded once the
# outcome is known; if so, using it as a model feature leaks the label — confirm.
reason_as_category = df['Reason for decision'].astype('category')
df['decision_reason_encoded'] = reason_as_category.cat.codes

# Embedding extraction
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
pretrained_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_checkpoint)
model = DistilBertModel.from_pretrained(pretrained_checkpoint)

# Function to compute embeddings in batches
def compute_embeddings(texts, batch_size=32):
    """Embed each text as the mean of its DistilBERT token vectors.

    The texts are encoded ``batch_size`` at a time, so peak memory depends on
    the batch size rather than on the size of the whole corpus. Pooling is a
    mean over the token axis weighted by the tokenizer's ``attention_mask``,
    so padding positions do not dilute the embedding of shorter texts and the
    result does not depend on which other texts share the batch.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: A pandas Series (or any iterable) of strings. Inputs longer
            than 512 tokens are truncated by the tokenizer.
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        A 2-D numpy array with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    sentences = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    if not sentences:
        # Nothing to encode: return an empty matrix with the encoder's width.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Zero out padding positions before averaging over the token axis (dim=1).
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_totals = (hidden * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        pooled_batches.append((token_totals / token_counts).cpu().numpy())
    return np.vstack(pooled_batches)

# Extract embeddings for each text column (missing cells are embedded as "").
resume_embeddings, transcript_embeddings, job_description_embeddings = (
    compute_embeddings(df[text_column].fillna(""))
    for text_column in ('Cleaned_Resume', 'Cleaned_Transcript', 'Cleaned_Job_Description')
)

# Combine embeddings side by side: one row per record.
embedding_features = np.concatenate(
    (resume_embeddings, transcript_embeddings, job_description_embeddings),
    axis=1,
)

# Additional hand-engineered features appended to the embeddings.
# NOTE(review): 'decision_reason_encoded' is derived from 'Reason for decision';
# if that column is only known after the outcome, it leaks the label — confirm.
engineered_features = [
    'transcript_positive_keyword_count',
    'resume_positive_keyword_count',
    'transcript_avg_word_length',
    'decision_reason_encoded',
    'transcript_char_count',
    'transcript_job_keyword_overlap',
    'resume_negative_keyword_count',
    'resume_job_keyword_overlap',
    'resume_char_count',
    'transcript_unique_word_ratio',
]
additional_features = df[engineered_features].to_numpy()

# Combine all features; the target is 1 only for the exact string 'select'.
X = np.concatenate((embedding_features, additional_features), axis=1)
y = np.array([1 if label == 'select' else 0 for label in df['decision']], dtype='int64')

# Split the data: 80/20 hold-out with a fixed seed.
# NOTE(review): the split is not stratified on y — consider it if classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train XGBoost model
xgb_params = dict(
    colsample_bytree=0.6,
    learning_rate=0.05,
    max_depth=5,
    n_estimators=300,
    subsample=1.0,
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Train Neural Network
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42).fit(X_train, y_train)

# Ensemble the models with soft voting.
# NOTE(review): VotingClassifier.fit trains its own clones of the listed
# estimators, so the standalone fits above are not reused by the ensemble.
voting_members = [('xgb', xgb_model), ('mlp', mlp)]
ensemble_model = VotingClassifier(estimators=voting_members, voting='soft').fit(X_train, y_train)

# Evaluate the ensemble model on the held-out split.
y_pred = ensemble_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble Model Accuracy: {accuracy}')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
