# **JOB RECOMMENDATION SYSTEM**

GEREKLİLİKLERİ YAZ

In [62]:
#libs

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [63]:
#dataset
df = pd.read_csv('job_recommendation_dataset.csv')
df.head()

Unnamed: 0,Job Title,Company,Location,Experience Level,Salary,Industry,Required Skills
0,Early years teacher,Richardson Ltd,Sydney,Senior Level,87000.0,Healthcare,Pharmaceuticals
1,Counselling psychologist,"Ramos, Santiago and Stewart",San Francisco,Mid Level,50000.0,Marketing,"Google Ads, SEO, Content Writing"
2,Radio broadcast assistant,Franco Group,New York,Mid Level,77000.0,Healthcare,"Patient Care, Nursing, Medical Research, Pharm..."
3,"Designer, exhibition/display",Collins Inc,Berlin,Senior Level,90000.0,Software,Machine Learning
4,"Psychotherapist, dance movement",Barker Group,Sydney,Entry Level,112000.0,Healthcare,"Nursing, Medical Research, Pharmaceuticals"


In [64]:
# Check unique values in dataset for filtering
print("Unique Locations:")
print(df['Location'].unique()[:20])
print(f"\nUnique Experience Levels:")
print(df['Experience Level'].unique())

Unique Locations:
['Sydney' 'San Francisco' 'New York' 'Berlin' 'London' 'Bangalore'
 'Toronto']

Unique Experience Levels:
['Senior Level' 'Mid Level' 'Entry Level']


In [65]:
print("Dataset loaded successfully\n")
print(f"Number of records: {len(df)}")
print(f"Columns: {df.columns.tolist()}\n")
print(f"Dataset general structures and summaries: {df.info()}\n")
print(f"Statistical summary of numerical columns:\n{df.describe()}\n")

Dataset loaded successfully

Number of records: 50000
Columns: ['Job Title', 'Company', 'Location', 'Experience Level', 'Salary', 'Industry', 'Required Skills']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Job Title         50000 non-null  object 
 1   Company           50000 non-null  object 
 2   Location          50000 non-null  object 
 3   Experience Level  50000 non-null  object 
 4   Salary            50000 non-null  float64
 5   Industry          50000 non-null  object 
 6   Required Skills   50000 non-null  object 
dtypes: float64(1), object(6)
memory usage: 2.7+ MB
Dataset general structures and summaries: None

Statistical summary of numerical columns:
              Salary
count   50000.000000
mean    95145.100000
std     31782.635648
min     40000.000000
25%     68000.000000
50%     95000.000000
75%    123000.0000

In [66]:
#missing values etc...
print("Checking for missing values:\n", df.isnull().sum())

Checking for missing values:
 Job Title           0
Company             0
Location            0
Experience Level    0
Salary              0
Industry            0
Required Skills     0
dtype: int64


In [67]:
#no missing values found
#features:

X = df[['Required Skills', 'Experience Level','Location', 'Salary']]
y = df['Industry']#classification target


In [68]:
#preprocessing and model pipeline
#we will use TfidfVectorizer for text data, OneHotEncoder for categorical data, and MinMaxScaler for numerical data.
preprocessor = ColumnTransformer(
    transformers=[
        ('skills_tdfidf', TfidfVectorizer(), 'Required Skills'),
        ('experience_ohe', OneHotEncoder(handle_unknown='ignore'), ['Experience Level']),
        ('location_ohe', OneHotEncoder(handle_unknown='ignore'), ['Location']),
        ('salary_scaler', MinMaxScaler(), ['Salary'])
    ])

In [69]:
#random forest classifier pipeline
model = Pipeline(
    
    steps=[
        ('preprocessing', preprocessor),
        ('classifier', RandomForestClassifier(
            
            n_estimators=200,
            max_depth=None,
            random_state=42
            
        ))
    ]
)

In [70]:
#train test ayrımı/split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
#stratifiy=y to maintain class distribution in train and test sets

In [71]:
#model training
model.fit(X_train, y_train)
print("Model training completed.\n")

Model training completed.



In [72]:
#performanse and metriks:

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score (macro):", f1_score(y_test, y_pred, average="macro"))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9811
F1 Score (macro): 0.9811280195846142

Classification Report:
               precision    recall  f1-score   support

    Education       1.00      1.00      1.00      1429
      Finance       0.94      0.93      0.93      1403
   Healthcare       1.00      1.00      1.00      1421
Manufacturing       1.00      1.00      1.00      1434
    Marketing       1.00      1.00      1.00      1432
       Retail       1.00      1.00      1.00      1421
     Software       0.93      0.94      0.94      1460

     accuracy                           0.98     10000
    macro avg       0.98      0.98      0.98     10000
 weighted avg       0.98      0.98      0.98     10000


Confusion Matrix:
[[1429    0    0    0    0    0    0]
 [   0 1303    0    0    0    0  100]
 [   0    0 1421    0    0    0    0]
 [   0    0    0 1434    0    0    0]
 [   0    0    0    0 1432    0    0]
 [   0    0    0    0    0 1421    0]
 [   0   89    0    0    0    0 1371]]
               precision    

In [73]:
# Baseline comparison - majority class accuracy
majority_acc = (y_test.value_counts().iloc[0] / len(y_test))
print(f"\n{'='*50}")
print(f"Majority-class baseline accuracy: {majority_acc:.3f}")
print(f"Model accuracy improvement: {(accuracy_score(y_test, y_pred) - majority_acc):.3f}")

# Cross-validation for robustness check
print(f"\n{'='*50}")
print("5-Fold Stratified Cross-Validation:")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy", n_jobs=-1)
print(f"CV Accuracy: mean={cv_scores.mean():.3f}, std={cv_scores.std():.3f}")
print(f"CV Scores per fold: {[f'{s:.3f}' for s in cv_scores]}")

# Class distribution analysis
print(f"\n{'='*50}")
print("Class distribution in test set:")
print(y_test.value_counts(normalize=True).sort_index())

#değerlerimiz muhteeseme yakın, cv sonuçlarımızla da uyumlu görünüyor.



Majority-class baseline accuracy: 0.146
Model accuracy improvement: 0.835

5-Fold Stratified Cross-Validation:
CV Accuracy: mean=0.983, std=0.001
CV Scores per fold: ['0.982', '0.983', '0.982', '0.985', '0.983']

Class distribution in test set:
Industry
Education        0.1429
Finance          0.1403
Healthcare       0.1421
Manufacturing    0.1434
Marketing        0.1432
Retail           0.1421
Software         0.1460
Name: proportion, dtype: float64
CV Accuracy: mean=0.983, std=0.001
CV Scores per fold: ['0.982', '0.983', '0.982', '0.985', '0.983']

Class distribution in test set:
Industry
Education        0.1429
Finance          0.1403
Healthcare       0.1421
Manufacturing    0.1434
Marketing        0.1432
Retail           0.1421
Software         0.1460
Name: proportion, dtype: float64


In [74]:
#skill basedf tf-idf vectorization for matching

from sklearn.metrics.pairwise import cosine_similarity
tfidf = TfidfVectorizer()
skill_matrix = tfidf.fit_transform(df['Required Skills'])

In [80]:
def recommend_jobs(user_profile, top_n=5):
    # Vectorize user skills
    user_skills_text = user_profile["skills"]
    user_vec = tfidf.transform([user_skills_text])
    
    # Calculate cosine similarity
    similarities = cosine_similarity(user_vec, skill_matrix)[0]
    
    # Apply filters
    filtered_df = df.copy()
    
    # Location filter
    if user_profile.get('preferred_location'):
        filtered_df = filtered_df[filtered_df['Location'] == user_profile['preferred_location']]
    
    # Experience level filter
    if user_profile.get('experience_level'):
        filtered_df = filtered_df[filtered_df['Experience Level'] == user_profile['experience_level']]
    
    # Get filtered indices and sort by similarity
    filtered_indices = filtered_df.index.tolist()
    filtered_scores = [(idx, similarities[idx]) for idx in filtered_indices]
    filtered_scores = sorted(filtered_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N results
    top_results = filtered_scores[:top_n]
    
    # Format results
    recommendations = []
    for idx, score in top_results:
        job = df.iloc[idx]
        recommendations.append((idx, score, job.to_dict()))
    
    return recommendations


def print_recommendations(recommendations):
    """
    Pretty prints job recommendations.
    
    Parameters:
    -----------
    recommendations : list
        Output from recommend_jobs function
    """
    print(f"{'='*60}")
    print(f"Top {len(recommendations)} Job Recommendations:")
    print(f"{'='*60}\n")
    
    for i, (idx, score, job) in enumerate(recommendations, 1):
        print(f"#{i} | Similarity Score: {score:.3f}")
        print(f"   Job Title: {job['Job Title']}")
        print(f"   Company: {job['Company']}")
        print(f"   Location: {job['Location']}")
        print(f"   Experience Level: {job['Experience Level']}")
        print(f"   Industry: {job['Industry']}")
        print(f"   Salary: ${job['Salary']:,}")
        print(f"   Required Skills: {job['Required Skills']}")
        print(f"{'-'*60}\n")

In [None]:
# Example user profiles for testing
user_profile_1 = {
    'skills': 'Python, Machine Learning, Data Analysis',
    'preferred_location': 'New York',
    'experience_level': 'Mid Level',
}

user_profile_2 = {
    'skills': 'JavaScript, React, Node.js',
    'preferred_location': 'San Francisco',
    'experience_level': 'Senior Level',
}

user_profile_3 = {
    'skills': 'SQL, Data Visualization, Tableau',
    'preferred_location': None,  # No location preference
    'experience_level': 'Entry Level',
}

In [77]:
# Test with user profile 1
recommendations = recommend_jobs(user_profile_1, top_n=5)
print_recommendations(recommendations)

Top 5 Job Recommendations:

#1 | Similarity Score: 0.869
   Job Title: Rural practice surveyor
   Company: Cortez Ltd
   Location: New York
   Experience Level: Mid Level
   Industry: Software
   Salary: $120,000.0
   Required Skills: Machine Learning, Python
------------------------------------------------------------

#2 | Similarity Score: 0.869
   Job Title: Radio producer
   Company: Jones Group
   Location: New York
   Experience Level: Mid Level
   Industry: Software
   Salary: $85,000.0
   Required Skills: Python, Machine Learning
------------------------------------------------------------

#3 | Similarity Score: 0.869
   Job Title: Clinical scientist, histocompatibility and immunogenetics
   Company: Sparks PLC
   Location: New York
   Experience Level: Mid Level
   Industry: Software
   Salary: $63,000.0
   Required Skills: Machine Learning, Python
------------------------------------------------------------

#4 | Similarity Score: 0.869
   Job Title: Chief Operating Officer

In [78]:
# Test with user profile 3 (no location filter)
recommendations_3 = recommend_jobs(user_profile_3, top_n=5)
print_recommendations(recommendations_3)

Top 5 Job Recommendations:

#1 | Similarity Score: 1.000
   Job Title: Naval architect
   Company: Lamb-Hill
   Location: New York
   Experience Level: Entry Level
   Industry: Finance
   Salary: $81,000.0
   Required Skills: SQL
------------------------------------------------------------

#2 | Similarity Score: 1.000
   Job Title: Lighting technician, broadcasting/film/video
   Company: Hunter LLC
   Location: Toronto
   Experience Level: Entry Level
   Industry: Software
   Salary: $86,000.0
   Required Skills: SQL
------------------------------------------------------------

#3 | Similarity Score: 1.000
   Job Title: Optometrist
   Company: Gregory, Ramirez and Gonzalez
   Location: London
   Experience Level: Entry Level
   Industry: Software
   Salary: $129,000.0
   Required Skills: C++, SQL
------------------------------------------------------------

#4 | Similarity Score: 1.000
   Job Title: Research officer, political party
   Company: Anderson-Myers
   Location: Sydney
   Ex

In [None]:
#user 2
recommendations_2 = recommend_jobs(user_profile_2, top_n=3)
print_recommendations(recommendations_2)

Top 3 Job Recommendations:

#1 | Similarity Score: 1.000
   Job Title: Restaurant manager
   Company: Webb, Reese and Esparza
   Location: San Francisco
   Experience Level: Senior Level
   Industry: Software
   Salary: $75,000.0
   Required Skills: React
------------------------------------------------------------

#2 | Similarity Score: 1.000
   Job Title: Chartered management accountant
   Company: Krause-Sparks
   Location: San Francisco
   Experience Level: Senior Level
   Industry: Software
   Salary: $77,000.0
   Required Skills: React
------------------------------------------------------------

#3 | Similarity Score: 1.000
   Job Title: Dance movement psychotherapist
   Company: Park-Stephens
   Location: San Francisco
   Experience Level: Senior Level
   Industry: Software
   Salary: $65,000.0
   Required Skills: React
------------------------------------------------------------

