In [53]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
# Load the labelled training set once; later cells work on copies of `data`.
train_csv_path = './HW3 附件/HW2_hr-analytics_train.csv'
data = pd.read_csv(train_csv_path)

In [112]:
# Work on a (deep) copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)
def check_nan(df):
    """
    Report whether the DataFrame holds at least one missing value.

    Parameters:
    df (DataFrame): The DataFrame to inspect.

    Returns:
    bool-like: Truthy if any cell is missing, falsy otherwise.
    """
    # First reduce each column to "has a missing value", then reduce across columns.
    missing_per_column = df.isna().any()
    return missing_per_column.any()

def onehot_encoding(df, column_name, show=True):
    """
    One-hot encode a single column and return the resulting new DataFrame.

    Parameters:
    df (DataFrame): The DataFrame to encode; it is not modified.
    column_name (str): The column to replace with indicator columns.
        The new columns are prefixed with the original column name.
    show (bool): If True, print the shape before and after encoding.

    Returns:
    DataFrame: A copy of ``df`` with ``column_name`` expanded into dummies.
    """
    encoded = pd.get_dummies(df, columns=[column_name], prefix=[column_name])
    if not show:
        return encoded

    shape_before, shape_after = df.shape, encoded.shape
    print(f"After operating ['{column_name}']. {shape_before} -> {shape_after}")
    return encoded

def pprint(output='\n', show_time=False):
    """
    Print ``output`` to stdout and append the same text to a log file.

    The log path is read from the module-level variable ``filename``,
    which must be defined before the first call.

    Parameters:
    output (object): Value to print; its ``str()`` form is written to the log.
    show_time (bool): If True, prefix the log line (not the console output)
        with a ``[YYYY-mm-dd HH:MM:SS] `` timestamp.
    """
    print(output)
    with open(filename, 'a') as log_file:
        if show_time:
            timestamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S] ")
            log_file.write(timestamp)
        log_file.write(str(output) + '\n')
        
def check_non_numeric_values(df, column_name):
    """
    Tell whether a column contains any value that is neither int nor float.

    Each element is tested individually with ``isinstance``; because ``bool``
    is a subclass of ``int`` and NaN is a ``float``, both count as numeric.

    Parameters:
    df (DataFrame): The DataFrame to check.
    column_name (str): The name of the column to check.

    Returns:
    bool: True if at least one non-numeric value is found, False otherwise.
    """
    def _is_plain_number(value):
        return isinstance(value, (int, float))

    values = df[column_name]
    numeric_mask = values.apply(_is_plain_number)

    # Keep only the offending entries; an empty selection means "all numeric".
    offenders = values[~numeric_mask]
    return not offenders.empty

def count_unique_values(df, column_name):
    """
    Return how many distinct values a column holds.

    Relies on ``Series.nunique`` with its default settings, so missing
    values are not counted.

    Parameters:
    df (DataFrame): The DataFrame to check.
    column_name (str): The name of the column to count unique values for.

    Returns:
    int: The count of unique values.
    """
    return df[column_name].nunique()

def metrics(y_test, y_pred, y_score=None):
    """
    Print accuracy, precision, recall, F1 and ROC AUC for binary predictions.

    Parameters:
    y_test (array-like): Ground-truth labels.
    y_pred (array-like): Predicted hard class labels.
    y_score (array-like, optional): Continuous scores or positive-class
        probabilities, e.g. ``model.predict_proba(X)[:, 1]``. When given,
        ROC AUC is computed from these scores. When omitted, ROC AUC falls
        back to ``y_pred`` (the previous behaviour); with hard labels the
        curve has a single operating point, so the value is only a coarse
        summary of ranking quality.

    Returns:
    None: The scores are printed, not returned.
    """
    # Metrics that are defined on hard labels, printed in a fixed order.
    label_based_scores = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),      # a.k.a. sensitivity
        ("F1 Score", f1_score),        # harmonic mean of precision and recall
    )
    for label, scorer in label_based_scores:
        print(f"{label}: {scorer(y_test, y_pred):.4f}")

    # ROC AUC is a ranking metric: prefer continuous scores when available.
    auc_input = y_pred if y_score is None else y_score
    roc_auc = roc_auc_score(y_test, auc_input)
    print(f"ROC AUC: {roc_auc:.4f}\n")
    



In [65]:
# One-hot encode every column that holds non-numeric values.
col_not_num = [col for col in df.columns if check_non_numeric_values(df, col)]

print(f"Doing onehot encoding on col: {col_not_num}")
for col in col_not_num:
    df = onehot_encoding(df, col)

target_column = 'left'
test_size = 0.2
random_state = 42

X = df.drop(columns=[target_column])
y = df[target_column]

# Split the data into training, validation, and test sets (80% / 10% / 10%).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_state)

trans = True
if trans: # 956 -> 957 (original author's note; presumably a score change — unverified)
    # Fit the scaler on the training split ONLY, then apply the same
    # transformation to the validation and test splits. Fitting on the full
    # frame before splitting would leak held-out statistics into training.
    scaled_column = 'average_montly_hours'
    scaler = StandardScaler()
    X_train = X_train.copy()
    X_val = X_val.copy()
    X_test = X_test.copy()
    X_train[scaled_column] = scaler.fit_transform(X_train[[scaled_column]]).ravel()
    X_val[scaled_column] = scaler.transform(X_val[[scaled_column]]).ravel()
    X_test[scaled_column] = scaler.transform(X_test[[scaled_column]]).ravel()

# Create a Decision Tree classifier. Seeding it makes the fitted tree (and
# therefore every reported metric) reproducible across runs.
decision_tree_classifier = DecisionTreeClassifier(random_state=random_state)

# Fit the Decision Tree classifier to the training data
decision_tree_classifier.fit(X_train, y_train)





Doing onehot encoding on col: []
With Train Data, the metrics of model:
Confusion matrix:
[[6090    0]
 [   0 1910]]
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

With Val Data, the metrics of model:
Confusion matrix:
[[747  12]
 [ 13 228]]
Accuracy: 0.9750
Precision: 0.9500
Recall: 0.9461
F1 Score: 0.9480
ROC AUC: 0.9651

With Val Data, the metrics of model:
Confusion matrix:
[[743  16]
 [ 12 229]]
Accuracy: 0.9720
Precision: 0.9347
Recall: 0.9502
F1 Score: 0.9424
ROC AUC: 0.9646



In [99]:
# Evaluate the fitted tree on the training, validation and testing splits:
# for each split, predict, then print the confusion matrix and the metrics.
evaluation_splits = (
    ("Train", X_train, y_train),
    ("Val", X_val, y_val),
    ("Test", X_test, y_test),
)
for split_name, X_split, y_split in evaluation_splits:
    print(f"With {split_name} Data, the metrics of model:")
    y_pred = decision_tree_classifier.predict(X_split)
    accuracy = accuracy_score(y_split, y_pred)
    confusion = confusion_matrix(y_split, y_pred)
    print(f"Confusion matrix:\n{confusion}")
    metrics(y_split, y_pred)

With Train Data, the metrics of model:
Confusion matrix:
[[6090    0]
 [   0 1910]]
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
ROC AUC: 1.0000

With Val Data, the metrics of model:
Confusion matrix:
[[747  12]
 [ 13 228]]
Accuracy: 0.9750
Precision: 0.9500
Recall: 0.9461
F1 Score: 0.9480
ROC AUC: 0.9651

With Test Data, the metrics of model:
Confusion matrix:
[[743  16]
 [ 12 229]]
Accuracy: 0.9720
Precision: 0.9347
Recall: 0.9502
F1 Score: 0.9424
ROC AUC: 0.9646



In [139]:
def test_importance(df):
    """
    Train a decision tree on ``df`` and return its validation accuracy.

    The frame is one-hot encoded, split 80/10/10 into train/validation/test
    with a fixed seed, and a seeded decision tree is fitted on the training
    split. Callers drop one feature from ``df`` before calling, so comparing
    the returned scores shows how much each feature contributes.

    Parameters:
    df (DataFrame): Data containing the target column 'left' plus features.
        The frame passed in is not modified.

    Returns:
    float: Accuracy of the fitted tree on the validation split.
    """
    target_column = 'left'
    test_size = 0.2
    random_state = 42
    scaled_column = 'average_montly_hours'

    # One-hot encode every column holding non-numeric values.
    non_numeric_cols = [col for col in df.columns if check_non_numeric_values(df, col)]
    for col in non_numeric_cols:
        df = onehot_encoding(df, col, False)

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # Same two-stage split as the main pipeline; the second split is kept so
    # that the validation rows are identical to the ones used there.
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_val, _, y_val, _ = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_state)

    # The scaled column may be the very feature that was dropped by the caller.
    if scaled_column in X_train.columns:
        # Fit on the training split only so validation statistics do not leak.
        scaler = StandardScaler()
        X_train = X_train.copy()
        X_val = X_val.copy()
        X_train[scaled_column] = scaler.fit_transform(X_train[[scaled_column]]).ravel()
        X_val[scaled_column] = scaler.transform(X_val[[scaled_column]]).ravel()

    # Seed the tree: the per-feature scores differ by only ~0.001, so an
    # unseeded tree would make the ranking change from run to run.
    decision_tree_classifier = DecisionTreeClassifier(random_state=random_state)
    decision_tree_classifier.fit(X_train, y_train)

    y_val_pred = decision_tree_classifier.predict(X_val)
    return accuracy_score(y_val, y_val_pred)

# Leave-one-feature-out ablation: drop each feature in turn, retrain, and
# record the validation accuracy obtained without it.
col_data = data.copy()
feature_names = col_data.columns.drop('left')

score = {}
for ii in feature_names:
    temp = data.drop(columns=[ii])  # drop() returns a new frame; `data` is untouched
    acc = test_importance(temp)
    score[ii] = acc
    print(f"The accuracy of dropping ['{ii}'] = {acc}")

The accuracy of dropping ['satisfaction_level'] = 0.957
The accuracy of dropping ['last_evaluation'] = 0.975
The accuracy of dropping ['number_project'] = 0.97
The accuracy of dropping ['average_montly_hours'] = 0.969
The accuracy of dropping ['time_spend_company'] = 0.962
The accuracy of dropping ['Work_accident'] = 0.973
The accuracy of dropping ['promotion_last_5years'] = 0.974
The accuracy of dropping ['sales'] = 0.976
The accuracy of dropping ['salary'] = 0.971


In [140]:
# Rank features by the accuracy obtained without them (lowest first = biggest loss).
sorted(score.items(), key=lambda feature_and_acc: feature_and_acc[1])

[('satisfaction_level', 0.957),
 ('time_spend_company', 0.962),
 ('average_montly_hours', 0.969),
 ('number_project', 0.97),
 ('salary', 0.971),
 ('Work_accident', 0.973),
 ('promotion_last_5years', 0.974),
 ('last_evaluation', 0.975),
 ('sales', 0.976)]

# Discussion Part

### Our answer to question 5:
To find the two most important features in the data, we removed the features one at a time, retrained the decision tree, and recorded the validation accuracy after each removal.

* RESULT:
[('satisfaction_level', 0.957),
 ('time_spend_company', 0.962),
 ('average_montly_hours', 0.969),
 ('number_project', 0.97),
 ('salary', 0.971),
 ('Work_accident', 0.973),
 ('promotion_last_5years', 0.974),
 ('last_evaluation', 0.975),
 ('sales', 0.976)]

According to the result, removing **['satisfaction_level']** or **['time_spend_company']** decreases the validation accuracy the most.
Because the model loses the most accuracy without them, **['satisfaction_level']** and **['time_spend_company']** are the top 2 important features in the data.

### By creating a VotingClassifier with **('lasso2', lasso_classifier2)** **('ridge3', ridge_classifier3)**
We get in training data:
* Confusion matrix
    [[6010   80]
    [ 181 1729]]
* Precision: 0.9558
* Recall (Sensitivity): 0.9052
* F1 Score: 0.9298
* AUROC: 0.9460

We get in validating data:
* Confusion matrix
    [[743  16]
    [ 27 214]]
* Accuracy: 0.9570
* Precision: 0.9304
* Recall (Sensitivity): 0.8880
* F1 Score: 0.9087
* AUROC: 0.9334

### Comparing to decision_tree_classifier
We get in training data:
* Confusion matrix:
    [[6090    0]
    [   0 1910]]
* Accuracy: 1.0000
* Precision: 1.0000
* Recall: 1.0000
* F1 Score: 1.0000
* AUROC: 1.0000

We get in validating data:
* Confusion matrix:
    [[750   9]
    [ 13 228]]
* Accuracy: 0.9780
* Precision: 0.9620
* Recall: 0.9461
* F1 Score: 0.9540
* AUROC: 0.9671



### Our answer to question 7:
On both the training data and the validation data, the decision tree classifier outperforms the voting classifier on every metric.
Combined with the result of linear regression without any data transformation or polynomial features, this suggests that most of the relationships in the data are non-linear.
A decision tree can fit data with more complicated relationships, so it performs better on non-linear data than simple linear regression.
Therefore, in this case, the decision tree classifier is the better model for prediction.


# Output prediction of test data

In [6]:
# Load the test set from HW2_hr-analytics_test.csv and show its shape.
test_csv_path = './HW3 附件/HW2_hr-analytics_test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.shape

(5000, 9)

In [43]:
# Preprocessing: one-hot encode every non-numeric column of the test set.
test_df = test_data.copy()
col_not_num = [col for col in test_df.columns if check_non_numeric_values(test_df, col)]

print(f"Doing onehot encoding on col: {col_not_num}")
for col in col_not_num:
    test_df = onehot_encoding(test_df, col)

Doing onehot encoding on col: ['sales', 'salary']
After operating ['sales']. (5000, 9) -> (5000, 18)
After operating ['salary']. (5000, 18) -> (5000, 20)


In [46]:
# Setting up random state
random_state = 42

# Build the feature matrix on a copy so `test_df` is not modified and this
# cell gives the same result when it is re-run.
X = test_df.copy()

trans = True
if trans: # 956 -> 957 (original author's note; presumably a score change — unverified)
    # Reuse the scaler that was fitted in the training cell. Fitting a new
    # scaler on the test data would standardise with different mean/std than
    # the values the tree's split thresholds were learned on.
    X['average_montly_hours'] = scaler.transform(X[['average_montly_hours']]).ravel()

# Align the test columns with the training features (same names, same order).
# A dummy column missing from the test set is filled with 0; a column the
# model never saw during training is dropped.
X = X.reindex(columns=X_train.columns, fill_value=0)

# Predict the data using decision tree classifier.
y_real_pred = decision_tree_classifier.predict(X)
y_real_pred.shape

(5000,)

In [9]:
# Turn the flat prediction vector (n,) into a single column (n, 1).
result = y_real_pred[:, None]

# Wrap it in a one-column DataFrame named 'left' and show it.
result_df = pd.DataFrame(result, columns=['left'])
print(result_df)

# Write the final predictions to disk, without the index column.
output_csv_path = 'HW2_hr-analytics_test_sol.csv'
result_df.to_csv(output_csv_path, index=False)

      left
0        0
1        0
2        1
3        1
4        0
...    ...
4995     0
4996     0
4997     0
4998     0
4999     1

[5000 rows x 1 columns]
