# Job Fit Classification Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training frame from a local pickle.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load pickles that come from a trusted source.
TRAIN_PATH = 'train.pkl'
train_df = pd.read_pickle(TRAIN_PATH)

# The held-out test set is expected to be loaded the same way once available:
# test_df = pd.read_pickle('test.pkl')

## Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of samples per class, most frequent class first.
classes_by_frequency = train_df['label'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='label', data=train_df, order=classes_by_frequency, ax=ax)
ax.set_title('Class Distribution')
plt.show()

In [None]:
# Feature distributions: one box plot per feature, split by class.
features = ['jaccard_skills', 'jaccard_adj', 'jaccard_adv']

# The grid is sized from the feature list so adding or removing a feature
# does not require touching the plotting code. squeeze=False keeps `axes`
# two-dimensional even when there is a single feature.
fig, axes = plt.subplots(len(features), 1, figsize=(15, 10), squeeze=False)
for ax, feat in zip(axes[:, 0], features):
    sns.boxplot(x='label', y=feat, data=train_df, ax=ax)
    ax.set_title(f'Distribution of {feat} by Class')
fig.tight_layout()
plt.show()

In [None]:
# Correlation analysis between the similarity features and the target.
# At this point in the notebook `label` has not been encoded yet. If it is a
# non-numeric column, DataFrame.corr() either raises (pandas >= 2.0) or drops
# it silently (older pandas), so it is converted to integer codes on a copy
# first. The raw training frame is left untouched.
corr_frame = train_df[features].copy()
label_values = train_df['label']
if not pd.api.types.is_numeric_dtype(label_values):
    # Integer codes in sorted-category order.
    # NOTE(review): correlation against these codes is only meaningful if the
    # classes have a natural order - confirm before interpreting the last row.
    label_values = label_values.astype('category').cat.codes
corr_frame['label'] = label_values

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_frame.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

## Data Preprocessing

In [None]:
# Encode labels into integer class ids.
# The encoded target lives in its own Series instead of overwriting
# train_df['label']. That keeps this cell idempotent: re-running it always
# fits the encoder on the raw labels, so le.classes_ keeps the original class
# names that the evaluation code uses as display names.
le = LabelEncoder()
y = pd.Series(
    le.fit_transform(train_df['label']),
    index=train_df.index,
    name='label',
)

# Stratified 80/20 train/validation split with a fixed seed.
X = train_df[features]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## Handling Class Imbalance

In [None]:
# Inverse-frequency class weights: each class id maps to 1 / (its share of
# the training labels), so rarer classes get larger weights.
train_class_share = y_train.value_counts(normalize=True)
class_weights = dict(1 / train_class_share)

# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training split is resampled; the validation split is untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_res, y_res = resampled

## Model Training & Evaluation

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print and plot validation metrics for a fitted classifier.

    Works for both binary and multiclass targets. The report, confusion
    matrix, ROC AUC and ROC curves are all derived from a single
    ``predict`` / ``predict_proba`` pass.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
        If it has a ``classes_`` attribute, that is used to map probability
        columns to classes.
    X_test : feature matrix to evaluate on.
    y_test : true encoded class ids for ``X_test``.

    Relies on the notebook-level ``le`` (the fitted LabelEncoder) for the
    human-readable class names.
    """
    y_pred = model.predict(X_test)
    y_proba = np.asarray(model.predict_proba(X_test))
    y_true = np.asarray(y_test)

    # Pin the label set explicitly so names, rows and columns stay aligned
    # even if a class happens to be missing from y_test or y_pred.
    class_ids = np.arange(len(le.classes_))
    class_names = [str(name) for name in le.classes_]

    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=class_ids,
                                target_names=class_names))

    print("Confusion Matrix:")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred, labels=class_ids),
                annot=True, fmt='d',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    plt.show()

    n_proba_cols = y_proba.shape[1]
    if n_proba_cols == 2:
        # Binary targets need the positive-class scores only; passing the
        # two-column matrix to roc_auc_score raises.
        auc = roc_auc_score(y_test, y_proba[:, 1])
    else:
        auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
    print("ROC AUC Score:", auc)

    # RocCurveDisplay.from_estimator only supports binary classifiers, so the
    # curves are drawn one-vs-rest from the probabilities already computed.
    model_classes = np.asarray(getattr(model, 'classes_', class_ids))
    columns_to_plot = [1] if n_proba_cols == 2 else range(n_proba_cols)
    fig, ax = plt.subplots(figsize=(8, 6))
    for col in columns_to_plot:
        class_id = model_classes[col]
        # Assumes the model was trained on LabelEncoder ids, so class_id
        # indexes into class_names; falls back to the raw id otherwise.
        try:
            curve_name = f"{class_names[int(class_id)]} vs rest"
        except (IndexError, TypeError, ValueError):
            curve_name = f"{class_id} vs rest"
        RocCurveDisplay.from_predictions(
            (y_true == class_id).astype(int), y_proba[:, col],
            name=curve_name, ax=ax,
        )
    ax.set_title('ROC Curves')
    plt.show()

In [None]:
# Random Forest tuned with SMOTE applied inside each cross-validation fold.
# Resampling before GridSearchCV puts synthetic points, interpolated from
# training rows, into the validation folds and inflates the CV scores. Wrapping
# SMOTE and the classifier in an imblearn Pipeline resamples only the training
# part of every fold; samplers are skipped at predict time.
# class_weight='balanced' is dropped because the classes are already balanced
# after SMOTE, so it would have no effect.
rf = RandomForestClassifier(random_state=42)
rf_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', rf),
])
# Parameter names carry the 'clf__' prefix so they reach the forest step.
rf_params = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5],
    'clf__max_features': ['sqrt', 'log2']
}

rf_grid = GridSearchCV(rf_pipeline, rf_params, cv=StratifiedKFold(3),
                       scoring='f1_weighted', n_jobs=-1)
rf_grid.fit(X_train, y_train)
# best_rf is the refit pipeline (SMOTE + forest); it exposes predict and
# predict_proba like a plain classifier.
best_rf = rf_grid.best_estimator_
print("Best Random Forest Parameters:", rf_grid.best_params_)
evaluate_model(best_rf, X_val, y_val)

In [None]:
# XGBoost tuned with SMOTE applied inside each cross-validation fold.
# scale_pos_weight is a single number meant for binary problems; passing the
# per-class weight dict to it is invalid, so it is removed. Class imbalance is
# handled by SMOTE inside an imblearn Pipeline, which resamples only the
# training part of every CV fold and so avoids leaking synthetic samples into
# the validation folds.
xgb = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb),
])
# Parameter names carry the 'clf__' prefix so they reach the XGBoost step.
xgb_params = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 6],
    'clf__learning_rate': [0.01, 0.1],
    'clf__subsample': [0.8, 1.0]
}

xgb_grid = GridSearchCV(xgb_pipeline, xgb_params, cv=StratifiedKFold(3),
                        scoring='f1_weighted', n_jobs=-1)
xgb_grid.fit(X_train, y_train)
# best_xgb is the refit pipeline (SMOTE + XGBoost); it exposes predict and
# predict_proba like a plain classifier.
best_xgb = xgb_grid.best_estimator_
print("Best XGBoost Parameters:", xgb_grid.best_params_)
evaluate_model(best_xgb, X_val, y_val)

## Final Model Selection & Testing

In [None]:
# Compare models and select the best performer.
# Selection uses each grid search's best mean cross-validated score (weighted
# F1, the metric both searches optimise) rather than a hard-coded choice.
# XGBoost is listed first so it wins exact ties.
candidates = [
    ('XGBoost', xgb_grid.best_score_, best_xgb),
    ('Random Forest', rf_grid.best_score_, best_rf),
]
final_name, final_cv_score, final_model = max(candidates, key=lambda entry: entry[1])
print(f"Selected final model: {final_name} (CV weighted F1 = {final_cv_score:.4f})")

# For final testing (when test data is available):
# final_model.fit(X, y)
# evaluate_model(final_model, X_test, y_test)