In [None]:
! pip install optuna

In [None]:
! pip install shap

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Feature Engineering
from sklearn.model_selection import train_test_split ,GridSearchCV ,learning_curve
from sklearn.preprocessing import LabelEncoder ,OrdinalEncoder
# Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna
import shap
# Load SHAP's JavaScript visualization code into the notebook.
shap.initjs()

# Global plotting / display configuration for the rest of the notebook.
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)  # no limit on displayed columns
# Unicode width options, presumably so that wide (e.g. CJK) characters stay
# aligned in printed tables.
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option("display.precision", 2)
plt.rcParams['axes.unicode_minus'] = False # render minus signs correctly

# EDA

In [None]:
# Load the student placement data set.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location);
# adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/Student Placement.csv')

# df # 707 rows × 13 columns

In [None]:
# Overview of the columns of the loaded frame.
df.info() # No missing values

In [None]:
# Print the distinct values of every object-dtype (string) column.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"Unique values for column '{col}': {unique_values}")

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Summary statistics of the object-dtype (string) columns
df.describe(include=['object'])

In [None]:
# Histogram (with KDE overlay) of every numeric column on a two-row grid.
numerical_columns = df.select_dtypes(include='number').columns

# Round the number of grid columns UP: floor division would leave the last
# plot without a slot whenever the number of numeric columns is odd (and
# would produce a 2x0 grid for a single column).
n_rows = 2
n_cols = (len(numerical_columns) + n_rows - 1) // n_rows

plt.figure(figsize=(12, 6))
for i, column in enumerate(numerical_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(df[column], bins=20, kde=True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# One box plot per numeric column, each on its own figure.
numerical_columns = df.select_dtypes(include='number').columns

# No figure is created outside the loop: every iteration opens its own
# figure, so an extra one up front would only render as an empty
# "Figure ... with 0 Axes" output.
for column in numerical_columns:
    plt.figure(figsize=(8, 4))
    sns.boxplot(data=df[column])
    plt.title(f'Boxplot of {column}')
    plt.xlabel(column)
    plt.ylabel('Values')
    plt.show()

In [None]:
# Distribution of the target column: absolute counts (left) and shares (right).
profile_counts = df['Profile'].value_counts()

plt.figure(figsize=(10, 6))

# Left panel: number of rows per profile.
plt.subplot(1, 2, 1)
sns.countplot(data=df, x='Profile')
plt.xticks(rotation=45)
plt.title('Profile Count')

# Right panel: the same counts as percentages.
plt.subplot(1, 2, 2)
plt.pie(profile_counts, labels=profile_counts.index, autopct='%1.1f%%')
plt.title('Profile Percentage')

plt.tight_layout()
plt.show()

print(profile_counts)  # every profile occurs 101 times, so the shares are equal too

# Feature Engineering

In [None]:
# use_columns = ['Profile']
# df_copy = df.copy()
# label_encoder = LabelEncoder()
# for col in use_columns:
#     df_copy[col + '_encoded'] = label_encoder.fit_transform(df[col])
#     original_values = df_copy[col].unique()
#     encoded_values = df_copy[col + '_encoded'].unique()
#     print(f"Original values for column '{col}': {original_values}")
#     print(f"Encoded values for column '{col}': {encoded_values}")
#     print()

In [None]:
# Replace the target column with integer codes, in place; the fitted encoder
# stays available as lb_encoder.
lb_encoder = LabelEncoder()
df['Profile'] = lb_encoder.fit_transform(df['Profile'])
# Original labels as noted by the author, followed by their codes in the same order:
# ['UI/UX' 'Web Developer' 'DATA Scientist' 'Software Engineer' 'Database Administrator' 'Network Engineer' 'Tech Support']
# [5 6 0 3 1 2 4]

# Replace the two skill columns with ordinal codes, in place.
or_encoder = OrdinalEncoder()
use_columns = ['Skill 1', 'Skill 2']
df[use_columns] = or_encoder.fit_transform(df[use_columns])
df.head()

# Heatmap

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap.
corr = df.corr()
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(data=corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# The ten columns with the largest absolute correlation to 'Profile'
# (the list includes 'Profile' itself).
correlation_matrix = df.corr().abs()
profile_correlations = correlation_matrix['Profile']
top_correlations = profile_correlations.sort_values(ascending=False).iloc[:10]
print(top_correlations)

In [None]:
# Summary statistics of the frame as it stands at this point in the notebook.
df.describe()

# Train Test Split

In [None]:
# Feature matrix / target vector and a stratified 90/10 train-test split.
feature_columns = ['Mathmetics', 'Aptitute', 'Problem Solving', 'CN', 'Creative']
X = df[feature_columns]
y = df['Profile']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

# Shapes of the four resulting pieces, in the order X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# Number of samples per class, first for the training labels, then for the test labels.
for split_labels in (y_train, y_test):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(dict(zip(unique, counts)))

In [None]:
# Class proportions, printed in this order:
#   1. the full data set
#   2. the training labels
#   3. the test labels
for target_labels in (y, y_train, y_test):
    print(pd.Series(target_labels).value_counts(normalize=True))

# Model
使用不同的model來訓練，GridSearchCV來找尋參數、產生評估指標、繪製學習曲線、儲存模型、解釋模型

# DecisionTreeClassifier

In [None]:
# Hyper-parameter grid for the decision tree:
# 2 * 5 * 5 * 5 * 4 = 1000 candidates, each evaluated with 5-fold CV.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 12, 15, 17, 20],
    'min_samples_split': [10, 12, 15, 17, 20],
    'max_leaf_nodes': [20, 25, 30, 35, 40],
    'min_samples_leaf': [2, 5, 7, 10],
}

dtree_clf = DecisionTreeClassifier(random_state=42)
# n_jobs=-1 spreads the 5000 fits over all available cores (the same setting
# the learning_curve calls in this notebook use); the selected parameters are
# unaffected because the tree's random_state is fixed.
grid_search = GridSearchCV(
    estimator=dtree_clf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
# Keep the best estimator found by the grid search and predict the test set with it.
best_dtree_clf = grid_search.best_estimator_
# NOTE(review): y_pred is not referenced again in the visible cells.
y_pred = best_dtree_clf.predict(X_test)

In [None]:
def evaluate_model_performance(model, X_train, y_train, X_test, y_test):
    """Print accuracy, confusion matrix and classification report of a fitted model.

    The report is printed first for the training data and then for the test
    data; nothing is returned.

    Parameters
    ----------
    model : object with a ``predict`` method (already fitted).
    X_train, y_train : training features and labels.
    X_test, y_test : test features and labels.
    """

    def _report(header, features, labels):
        # Compute all three metrics before printing anything.
        predictions = model.predict(features)
        accuracy = accuracy_score(labels, predictions)
        conf_matrix = confusion_matrix(labels, predictions)
        report = classification_report(labels, predictions)
        print(header)
        print("Accuracy:", accuracy)
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(report)

    # Scores on the training set
    _report("Training Set Evaluation:", X_train, y_train)
    print("\n")

    # Scores on the test set
    _report("Testing Set Evaluation:", X_test, y_test)

evaluate_model_performance(best_dtree_clf, X_train, y_train, X_test, y_test)

In [None]:
# Learning curve of the tuned decision tree. The tuned estimator
# (best_dtree_clf) is passed instead of the untuned dtree_clf so the curve
# describes the same model whose metrics were reported.
train_sizes, train_scores, test_scores = learning_curve(best_dtree_clf, X_train, y_train, cv=5, train_sizes=np.linspace(0.1, 1.0, 10), n_jobs=-1)

# Calculate the mean and standard deviation of training and test scores
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot the learning curve
plt.figure(figsize=(10, 6))
plt.title("Learning Curve")
plt.xlabel("Training Examples")
plt.ylabel("Score")
# Request the grid explicitly: a bare plt.grid() TOGGLES visibility, which
# switches the grid off under the seaborn "whitegrid" style set earlier.
plt.grid(True)

# Shaded bands: mean +/- one standard deviation across the CV folds.
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")

plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation Score")

plt.legend(loc="best")
plt.show()

# RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters; only the seed is fixed.
rf_clf = RandomForestClassifier(random_state=42)

rf_clf.fit(X_train, y_train)

In [None]:
# Train/test metrics of the random forest.
evaluate_model_performance(rf_clf, X_train, y_train, X_test, y_test)

In [None]:
def plot_learning_curve(estimator, X, y, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)):
    """Plot training and cross-validation scores against the training-set size.

    Parameters
    ----------
    estimator : estimator handed to ``learning_curve``.
    X, y : features and labels used for the curve.
    cv : cross-validation setting passed to ``learning_curve`` (default 5).
    train_sizes : training-set sizes passed to ``learning_curve``
        (default: ten evenly spaced fractions from 0.1 to 1.0).

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, train_sizes=train_sizes, n_jobs=-1)
    # Calculate the mean and standard deviation of training and test scores
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Plot the learning curve
    plt.figure(figsize=(10, 6))
    plt.title("Learning Curve")
    plt.xlabel("Training Examples")
    plt.ylabel("Score")
    # Request the grid explicitly: a bare plt.grid() TOGGLES visibility, which
    # switches the grid off when the active style already enables it
    # (e.g. seaborn "whitegrid").
    plt.grid(True)

    # Shaded bands: mean +/- one standard deviation across the CV folds.
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")

    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training Score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation Score")

    plt.legend(loc="best")
    plt.show()
plot_learning_curve(rf_clf, X_train, y_train)

# XGBClassifier

In [None]:
# XGBoost classifier with default hyper-parameters; only the seed is fixed.
xgb_clf = XGBClassifier(random_state=42)
xgb_clf.fit(X_train, y_train)

In [None]:
# Train/test metrics of the XGBoost classifier.
evaluate_model_performance(xgb_clf, X_train, y_train, X_test, y_test)

In [None]:
# Learning curve of the XGBoost classifier on the training split.
plot_learning_curve(xgb_clf, X_train, y_train)

# SVC

In [None]:
# Support vector classifier with default hyper-parameters; only the seed is fixed.
# NOTE(review): the features are passed without scaling — confirm this is intended.
svc_model = SVC(random_state=42)
svc_model.fit(X_train, y_train)

In [None]:
# Train/test metrics of the support vector classifier.
evaluate_model_performance(svc_model, X_train, y_train, X_test, y_test)

In [None]:
# Learning curve of the support vector classifier on the training split.
plot_learning_curve(svc_model, X_train, y_train)

# Save Model

In [None]:
# NOTE(review): `load` is imported but not used in the visible cells.
from joblib import dump, load

# Persist the four fitted models to .joblib files in the working directory.
dump(best_dtree_clf ,'decision_tree_model.joblib')
dump(rf_clf ,'random_forest_model.joblib')
dump(xgb_clf ,'xgboost_model.joblib')
dump(svc_model ,'svc_model.joblib')

# Shap
使用best_dtree_clf來做模型解釋

In [None]:
# Use Tree SHAP to explain the whole data set (X, not only the test split)
# with the tuned decision tree.
explainer = shap.TreeExplainer(best_dtree_clf)
shap_values = explainer(X) # author's note: .shap_values

In [None]:
# Display the object returned by the explainer for inspection.
shap_values

In [None]:
# Compare the shape of the feature matrix with the shape of the SHAP values.
print("X shape:", X.shape)
print("SHAP values shape:", shap_values.shape) # (707, 5, 7): 707 rows, 5 feature columns, 7 label classes

In [None]:
# SHAP summary plot for each label class (the classes are on the third axis).
n_classes = shap_values.shape[2]
for class_index in range(n_classes):
    print("Summary plot for class", class_index)
    class_shap_values = shap_values[:, :, class_index]
    shap.summary_plot(class_shap_values, X)

In [None]:
# Bar chart of mean importance for each label class (the classes are on the third axis).
n_classes = shap_values.shape[2]
for class_index in range(n_classes):
    print("Summary plot for class", class_index)
    class_shap_values = shap_values[:, :, class_index]
    shap.summary_plot(class_shap_values, X, plot_type="bar")

In [None]:
# Global bar plot for each label class (the classes are on the third axis).
for class_index in range(shap_values.shape[2]):
    # The label names the chart that is actually drawn here (shap.plots.bar),
    # not the summary plot of the earlier cells.
    print("Global bar plot for class", class_index)
    shap.plots.bar(shap_values[:, :, class_index])

Class 0 :DATA Scientist
*   Mathmetics :+0.23
*   CN :+0.21

Class 1 :Database Administrator
*   Mathmetics :+0.22
*   CN :+0.22
*   Creative :+0.07

Class 2 :Network Engineer
*   Mathmetics :+0.15
*   CN :+0.13
*   Aptitute :+0.11
*   Problem Solving :+0.02

Class 3 :Software Engineer
*   Aptitute :+0.11
*   Mathmetics :+0.11
*   CN :+0.08
*   Problem Solving :+0.02

Class 4 :Tech Support
*   Creative :+0.15
*   CN :+0.14
*   Mathmetics :+0.1

Class 5 :UI/UX
*   Creative :+0.13
*   Mathmetics :+0.09
*   CN :+0.07
*   Aptitute :+0.06
*   Problem Solving :+0.01

Class 6 :Web Developer
*   Mathmetics :+0.1
*   Creative :+0.09
*   CN :+0.09
*   Aptitute :+0.06
*   Problem Solving :+0.01


# Reference
https://www.kaggle.com/datasets/yuvjeetarora/student-job-profile/data

https://shap.readthedocs.io/en/latest/example_notebooks/tabular_examples/tree_based_models/Census%20income%20classification%20with%20XGBoost.html

https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/plots/bar.html