<a href="https://colab.research.google.com/github/ashwinsabu/Ashwin-Sabu/blob/master/project_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd

# Columns that are dropped before the cleaned dataset is saved
columns_to_remove = ['Job Id', 'location', 'Country', 'latitude', 'longitude',
                     'Work Type', 'Company Size', 'Job Posting Date', 'Preference',
                     'Contact Person', 'Contact', 'Role', 'Job Portal',
                     'Benefits', 'Responsibilities', 'Company']

# Read the raw postings and clean them in a single pass:
#   1. discard exact duplicate rows and rows containing any missing value,
#   2. keep at most the first 10,000 surviving rows,
#   3. drop the unwanted columns (names absent from the file are skipped).
df = (
    pd.read_csv('job_descriptions.csv')
    .drop_duplicates()
    .dropna()
    .head(10000)
    .drop(columns=columns_to_remove, errors='ignore')
)

# Preview the first rows of the cleaned data
print(df.head())

# Persist the cleaned data so the following cells can reload it
df.to_csv('job_descriptions_cleaned.csv', index=False)


      Experience Qualifications Salary Range                     Job Title  \
0  5 to 15 Years         M.Tech    $59K-$99K  Digital Marketing Specialist   
1  2 to 12 Years            BCA   $56K-$116K                 Web Developer   
2  0 to 12 Years            PhD   $61K-$104K            Operations Manager   
3  4 to 11 Years            PhD    $65K-$91K              Network Engineer   
4  1 to 12 Years            MBA    $64K-$87K                 Event Manager   

                                     Job Description  \
0  Social Media Managers oversee an organizations...   
1  Frontend Web Developers design and implement u...   
2  Quality Control Managers establish and enforce...   
3  Wireless Network Engineers design, implement, ...   
4  A Conference Manager coordinates and manages c...   

                                              skills  \
0  Social media platforms (e.g., Facebook, Twitte...   
1  HTML, CSS, JavaScript Frontend frameworks (e.g...   
2  Quality control process

In [3]:
import pandas as pd

# Tally how often each job title occurs in the cleaned dataset
df_cleaned = pd.read_csv('job_descriptions_cleaned.csv')
title_counts = df_cleaned['Job Title'].value_counts()

# Turn the tally into a two-column table: one row per title plus its frequency
df_title_counts = title_counts.rename_axis('Job Title').reset_index(name='Count')

# Write the table out as its own CSV file
df_title_counts.to_csv('job_title_counts.csv', index=False)


In [4]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet') # Download wordnet for lemmatization

# Load the cleaned CSV file into a DataFrame
df = pd.read_csv('job_descriptions_cleaned.csv')

from functools import lru_cache


@lru_cache(maxsize=None)
def _text_tools():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


# Function to clean and process text
def clean_text(text):
    """Normalise a free-text string into space-separated lemmatized tokens.

    Steps, in order: strip digit runs, strip every character that is neither a
    word character nor whitespace, lowercase, tokenize, drop English stop
    words, lemmatize each remaining token.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a NaN
            produced by a missing CSV field) is treated as empty text.

    Returns:
        The processed tokens joined by single spaces; ``''`` when nothing
        remains or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; treat missing text as empty.
        return ''
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    # The stop-word set and lemmatizer are shared across calls instead of
    # being rebuilt for every row passed through this function.
    stop_words, lemmatizer = _text_tools()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Normalise every skills entry with clean_text and keep the result in a helper column
df['cleaned_skills'] = [clean_text(entry) for entry in df['skills']]

# Learn a TF-IDF vocabulary from the normalised text;
# X is the resulting document-term matrix and features lists its terms
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_skills'])
features = vectorizer.get_feature_names_out()

# Extract keywords for each job description
def extract_keywords(text, vectorizer):
    """Return the vocabulary terms of ``vectorizer`` that occur in ``text``.

    Args:
        text: A single document (string) to look up.
        vectorizer: A fitted vectorizer exposing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        The terms with a non-zero weight in ``text``, joined by single spaces;
        an empty string when no term has a non-zero weight.
    """
    tfidf_matrix = vectorizer.transform([text])
    # Column indices of the non-zero entries of the one-row matrix
    indices = tfidf_matrix.nonzero()[1]
    # Fetch the vocabulary once instead of once per matching index.
    feature_names = vectorizer.get_feature_names_out()
    return ' '.join(feature_names[index] for index in indices)

# Overwrite each row's skills text with the keywords found in its cleaned version
df['skills'] = [extract_keywords(cleaned, vectorizer) for cleaned in df['cleaned_skills']]

# The cleaned text was only an intermediate step; remove it before saving
df = df.drop(columns=['cleaned_skills'])

# Write the keyword-based dataset to a new CSV file
df.to_csv('job_descriptions_with_keywords.csv', index=False)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the CSV file into a DataFrame
df = pd.read_csv('job_descriptions_with_keywords.csv')

# A row whose keyword string is empty is written to CSV as an empty field,
# which read_csv parses back as NaN. The TfidfVectorizer used on 'skills'
# further down cannot process NaN documents, so restore the empty string.
df['skills'] = df['skills'].fillna('')

# Function to convert experience range to numeric average
def convert_experience_to_numeric(experience):
    """Convert an experience range such as ``'5 to 15 Years'`` to its midpoint.

    Args:
        experience: The raw cell value, expected to look like
            ``'<low> to <high> Years'``.

    Returns:
        The mean of the two bounds as a float (``'5 to 15 Years'`` -> ``10.0``).
        ``np.nan`` when the value is not a string or does not consist of
        exactly two integer bounds separated by ``' to '``, so the caller can
        drop such rows instead of the whole conversion failing on one bad cell.
    """
    if not isinstance(experience, str):
        return np.nan
    bounds = experience.replace('Years', '').strip().split(' to ')
    if len(bounds) != 2:
        # e.g. '5 Years' or an empty string: no usable range
        return np.nan
    try:
        low, high = (int(bound) for bound in bounds)
    except ValueError:
        # A bound that is not an integer, e.g. '5+' or 'five'
        return np.nan
    return (low + high) / 2

# Columns excluded from the modelling frame (names not present are skipped)
columns_to_remove = ['Job Id', 'location', 'Country', 'latitude', 'longitude',
                     'Work Type', 'Company Size', 'Job Posting Date', 'Preference',
                     'Contact Person', 'Contact', 'Role', 'Job Portal',
                     'Benefits', 'Responsibilities', 'Company']

# Replace each experience range string by its numeric midpoint
df['Experience'] = df['Experience'].apply(convert_experience_to_numeric)

# Discard rows without a usable experience value, then duplicate rows,
# keep at most the first 10,000 rows and remove the excluded columns
df = (
    df.dropna(subset=['Experience'])
    .drop_duplicates()
    .head(10000)
    .drop(columns=columns_to_remove, errors='ignore')
)

# Job titles that occur exactly once are filtered out
# (the train/test split below is stratified on the title)
value_counts = df['Job Title'].value_counts()
single_instance_classes = value_counts[value_counts == 1].index
df = df[~df['Job Title'].isin(single_instance_classes)]

# Features (X) and target (y), taken from the filtered frame
X = df[['Experience', 'Qualifications', 'Salary Range', 'skills']]
y = df['Job Title']

# --- Preprocessing: one transformer per kind of column ---

# Experience is numeric and gets standardised
numeric_features = ['Experience']
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Qualifications and Salary Range are one-hot encoded;
# categories not seen during fitting are ignored at prediction time
categorical_features = ['Qualifications', 'Salary Range']
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# skills is free text, vectorised with TF-IDF capped at 1000 terms
# (the column is given as a bare name rather than a list)
text_features = 'skills'
text_transformer = Pipeline([('tfidf', TfidfVectorizer(max_features=1000))])

# Route each group of columns to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
    ('text', text_transformer, text_features),
])

# --- Model: preprocessing followed by a seeded random forest ---
rf_classifier = RandomForestClassifier(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])

# --- Data split: hold out 30% for testing, stratified on the job title ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Define parameter grid for GridSearchCV
# 3 * 4 * 3 * 3 = 108 combinations, each fitted on 5 folds = 540 fits
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Implement GridSearchCV for hyperparameter tuning.
# n_jobs=-1 spreads the fits over all available cores instead of running them
# one after another; the forest's random_state is fixed, so the selected
# parameters and scores do not depend on the degree of parallelism.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Train the model using grid search (the best combination is refit on all of X_train)
grid_search.fit(X_train, y_train)

# Predict on the held-out test set with the refit best estimator
y_pred = grid_search.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Perform cross-validation to evaluate the model more robustly:
# the tuned pipeline is re-fitted from scratch on each of 5 folds of the full data
cv_scores = cross_val_score(grid_search.best_estimator_, X, y, cv=5, n_jobs=-1)
print(f'Cross-validation accuracy scores: {cv_scores}')
print(f'Average cross-validation accuracy: {np.mean(cv_scores):.2f}')




Accuracy: 0.89




Cross-validation accuracy scores: [0.93229167 0.91099476 0.91099476 0.93193717 0.93193717]
Average cross-validation accuracy: 0.92
