## Naive Bayes

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
import numpy as np

# Read the cleaned dataset from disk
df = pd.read_csv('C:/CUB/sem2/ml/proj/CN_Dataset/cleaned_data.csv')

# The label is 'SuggestedJobRole'; every remaining column is a feature
y = df['SuggestedJobRole']
X = df.drop(columns=['SuggestedJobRole'])

# Hold out 20% of the rows as a test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [66]:
# Summarise the loaded frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   SuggestedJobRole                     10000 non-null  object
 1   keywords                             10000 non-null  object
 2   Logical quotient rating              10000 non-null  int64 
 3   hackathons                           10000 non-null  int64 
 4   coding skills rating                 10000 non-null  int64 
 5   public speaking points               10000 non-null  int64 
 6   self-learning capability?            10000 non-null  object
 7   Extra-courses did                    10000 non-null  object
 8   certifications                       10000 non-null  object
 9   workshops                            10000 non-null  object
 10  reading and writing skills           10000 non-null  object
 11  memory capability score              10000

In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, CategoricalNB
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack

# 1) Load Data
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv('C:/CUB/sem2/ml/proj/CN_Dataset/cleaned_data.csv')

# 2) Fix Non-Numeric Columns Used in "numeric_features"
# Ordinal encoding of the textual memory score. Series.map() turns every value
# that is not a key of the dict into NaN, so unmapped labels are rejected here
# with a clear message instead of leaking NaNs into the feature matrix.
score_map = {"poor": 1, "medium": 2, "excellent": 3}
memory_raw = df["memory capability score"]
memory_encoded = memory_raw.map(score_map)
unmapped_mask = memory_encoded.isna()
if unmapped_mask.any():
    unexpected = sorted(memory_raw[unmapped_mask].astype(str).unique())
    raise ValueError(
        f"Unexpected 'memory capability score' values (expected {list(score_map)}): "
        f"{unexpected}"
    )
df["memory capability score"] = memory_encoded
df["public speaking points"] = df["public speaking points"].astype(int)

# 3) Separate Target
# Encode the job-role strings as integer class ids; label_encoder is kept so
# predictions can be mapped back with label_encoder.inverse_transform().
y = df["SuggestedJobRole"]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 4) MultinomialNB
# Bag-of-keywords counts plus the numeric rating columns, fed to MultinomialNB.
#
# The custom tokenizer splits each "keywords" string on ", ".
# token_pattern=None states that the default regex is intentionally unused;
# without it CountVectorizer warns that token_pattern is ignored whenever a
# tokenizer is supplied.
# NOTE(review): the vocabulary is fit on all rows before the train/test split.
vectorizer_multinomial = CountVectorizer(
    tokenizer=lambda x: x.split(", "),
    token_pattern=None,
)
X_keywords_counts = vectorizer_multinomial.fit_transform(df["keywords"])

numeric_features = [
    "Logical quotient rating",
    "hackathons",
    "coding skills rating",
    "public speaking points",
    "memory capability score",
]
X_numeric = df[numeric_features]

# Column-wise concatenation: keyword counts first, then the numeric columns.
X_multinomial = hstack([X_keywords_counts, X_numeric])

# 70/30 split with a fixed seed.
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X_multinomial, y_encoded, test_size=0.3, random_state=42
)

mnb = MultinomialNB()
mnb.fit(X_train_multi, y_train_multi)
y_pred_multi = mnb.predict(X_test_multi)
print("=== MultinomialNB ===")
print(f"Accuracy: {accuracy_score(y_test_multi, y_pred_multi):.4f}")


# 5) BernoulliNB
# Binary keyword-presence indicators plus the yes/no columns, fed to BernoulliNB.
#
# binary=True records presence (1) rather than counts. token_pattern=None
# states that the default regex is intentionally unused; without it
# CountVectorizer warns that token_pattern is ignored when a tokenizer is given.
vectorizer_bernoulli = CountVectorizer(
    tokenizer=lambda x: x.split(", "),
    token_pattern=None,
    binary=True,
)
X_keywords_binary = vectorizer_bernoulli.fit_transform(df["keywords"])

binary_cols = [
    "self-learning capability?",
    "Extra-courses did",
    "worked in teams ever?",
    "Introvert",
]

# Series.map() yields NaN for anything that is not exactly "yes" or "no";
# fail early with the offending columns rather than passing NaNs to the model.
yes_no_map = {"yes": 1, "no": 0}
df_binary = df[binary_cols].apply(lambda column: column.map(yes_no_map))
unmapped_cols = df_binary.columns[df_binary.isna().any()].tolist()
if unmapped_cols:
    raise ValueError(
        f"Columns containing values other than 'yes'/'no': {unmapped_cols}"
    )

# Column-wise concatenation: keyword indicators first, then the yes/no flags.
X_bernoulli = hstack([X_keywords_binary, df_binary])

# 70/30 split with a fixed seed.
X_train_bern, X_test_bern, y_train_bern, y_test_bern = train_test_split(
    X_bernoulli, y_encoded, test_size=0.3, random_state=42
)

bnb = BernoulliNB()
bnb.fit(X_train_bern, y_train_bern)
y_pred_bern = bnb.predict(X_test_bern)
print("\n=== BernoulliNB ===")
print(f"Accuracy: {accuracy_score(y_test_bern, y_pred_bern):.4f}")


# 6) CategoricalNB
# Integer-coded categorical columns fed to CategoricalNB.
# NOTE(review): "interested career area" is left out of this list, presumably
# because it is absent from df.columns; confirm against the dataset.
categorical_cols = [
    "Interested subjects",
    "Type of company want to settle in?",
    "Interested Type of Books",
    "Management or Technical",
    "hard/smart worker",
    "certifications",
    "workshops",
    "reading and writing skills",
]

# Encode each column independently as integer codes 0..k-1.
df_cat = df[categorical_cols].copy()
for col in categorical_cols:
    df_cat[col] = LabelEncoder().fit_transform(df_cat[col])

# 70/30 split with a fixed seed.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    df_cat, y_encoded, test_size=0.3, random_state=42
)

# The codes were derived from all rows, but the model only sees the training
# split. If the highest code of a feature occurs only in the test split,
# predict() would index past the fitted category table. min_categories pins
# the table size to the full per-feature category count (codes are 0..k-1, so
# nunique() == k), which leaves the fit unchanged when every category is seen.
n_categories_per_feature = df_cat.nunique().to_numpy()
catnb = CategoricalNB(min_categories=n_categories_per_feature)
catnb.fit(X_train_cat.values, y_train_cat)
y_pred_cat = catnb.predict(X_test_cat.values)
print("\n=== CategoricalNB ===")
print(f"Accuracy: {accuracy_score(y_test_cat, y_pred_cat):.4f}")




=== MultinomialNB ===
Accuracy: 0.3440

=== BernoulliNB ===
Accuracy: 0.3390

=== CategoricalNB ===
Accuracy: 0.2587
