In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load dataset
def load_data(filepath):
    df = pd.read_csv(filepath)
    df.dropna(inplace=True)
    return df
# Vectorization
def vectorize_text(data):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    return X, vectorizer
# Train individual models
def train_random_forest(X_train, y_train):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model
def train_naive_bayes(X_train, y_train):
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model
def train_svm(X_train, y_train):
    model = SVC(kernel='linear')
    model.fit(X_train, y_train)
    return model
# Evaluate model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return accuracy, report, cm, y_pred
# Plot confusion matrix
def plot_confusion_matrix(cm, model_name):
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.show()
# Main function
def main():
    filepath = 'new_data.csv'  # Change this to your file path
    df = load_data(filepath)
 
    X, vectorizer = vectorize_text(df['Required Skills'])
    y = LabelEncoder().fit_transform(df['Job Title'])
   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    rf_model = train_random_forest(X_train, y_train)
    nb_model = train_naive_bayes(X_train, y_train)
    svm_model = train_svm(X_train, y_train)
    
    rf_acc, rf_report, rf_cm, _ = evaluate_model(rf_model, X_test, y_test)
    nb_acc, nb_report, nb_cm, _ = evaluate_model(nb_model, X_test, y_test)
    svm_acc, svm_report, svm_cm, _ = evaluate_model(svm_model, X_test, y_test)
    
    print(f'--- RandomForest ---')
    print(f'Accuracy: {rf_acc:.2f}')
    print(rf_report)
    plot_confusion_matrix(rf_cm, 'RandomForest')
    
    print(f'--- NaiveBayes ---')
    print(f'Accuracy: {nb_acc:.2f}')
    print(nb_report)
    plot_confusion_matrix(nb_cm, 'NaiveBayes')
    
    print(f'--- SVM ---')
    print(f'Accuracy: {svm_acc:.2f}')
    print(svm_report)
    plot_confusion_matrix(svm_cm, 'SVM')

if __name__ == '__main__':
    main()


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x9c in position 171: invalid start byte