In [30]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import Lasso
data = pd.read_csv('./HW2 附件/HW2_hr-analytics_train.csv')

In [31]:
df = data.copy()
# check nan
def check_nan(df):
    return df.isna().any().any()

def onehot_encoding(df, column_name):
    df_encoded = pd.get_dummies(df, columns=[column_name], prefix=[column_name])
    print(df.shape, "->", df_encoded.shape)
    return df_encoded

def pprint(output = '\n', show_time = False): # print and fprint at the same time
    global filename
    print(output)
    with open(filename, 'a') as f:
        if show_time:
            f.write(datetime.now().strftime("[%Y-%m-%d %H:%M:%S] "))

        f.write(str(output))
        f.write('\n')
        
def check_non_numeric_values(df, column_name):
    """
    Check if any value in the specified column is not int or float.

    Parameters:
    df (DataFrame): The DataFrame to check.
    column_name (str): The name of the column to check.

    Returns:
    bool: True if any non-numeric value is found, False otherwise.
    """
    # Get the specified column
    column = df[column_name]

    # Check if any value is not int or float
    non_numeric_values = column[~column.apply(lambda x: isinstance(x, (int, float)))]
    
    return not non_numeric_values.empty

def count_unique_values(df, column_name):
    """
    Count the number of unique values in the specified column.

    Parameters:
    df (DataFrame): The DataFrame to check.
    column_name (str): The name of the column to count unique values for.

    Returns:
    int: The count of unique values.
    """
    unique_values = df[column_name].nunique()
    return unique_values

def metrics(y_test, y_pred):
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Calculate precision
    precision = precision_score(y_test, y_pred)
    print("Precision:", precision)

    # Calculate recall (sensitivity)
    recall = recall_score(y_test, y_pred)
    print("Recall (Sensitivity):", recall)

    # Calculate F1 score (harmonic mean of precision and recall)
    f1 = f1_score(y_test, y_pred)
    print("F1 Score:", f1)
    # Calculate ROC AUC (Receiver Operating Characteristic Area Under the Curve)
    roc_auc = roc_auc_score(y_test, y_pred)
    print("ROC AUC:", roc_auc)
    
col_not_num = []
for col in df.columns:
    if check_non_numeric_values(df, col):
        col_not_num.append(col)

print(f"Doing onehot encoding on col: {col_not_num}")
for col in col_not_num:
    df = onehot_encoding(df, col)


Doing onehot encoding on col: ['sales', 'salary']
(10000, 10) -> (10000, 19)
(10000, 19) -> (10000, 21)


In [32]:
target_column = 'left'
test_size = 0.2
random_state = 42

trans = True
if trans: # 956 -> 957
    scaler = StandardScaler()
    df['average_montly_hours'] = scaler.fit_transform(df[['average_montly_hours']])
    # df['time_spend_company'] = scaler.fit_transform(df[['time_spend_company']])



X = df.drop(columns=[target_column])
y = df[target_column]

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=test_size, random_state=random_state)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_state)

In [33]:
poly2 = PolynomialFeatures(degree=2)
poly3 = PolynomialFeatures(degree=3)
poly4 = PolynomialFeatures(degree=4)

lasso_classifier1 = make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', C=10)) #782
lasso_classifier2 = make_pipeline(poly2, StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', C=10)) # 954
# lasso_classifier3 = make_pipeline(poly3, StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', C=10)) # also too hard.
# lasso_classifier4 = make_pipeline(poly4, StandardScaler(), LogisticRegression(penalty='l1', solver='liblinear', C=10)) # too hard!
ridge_classifier1 = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', solver='liblinear', C=10)) # 782
ridge_classifier2 = make_pipeline(poly2, StandardScaler(), LogisticRegression(penalty='l2', solver='liblinear', C=10)) # 954
ridge_classifier3 = make_pipeline(poly3, StandardScaler(), LogisticRegression(penalty='l2', solver='liblinear', C=10)) # 949
# ridge_classifier4 = make_pipeline(poly4, StandardScaler(), LogisticRegression(penalty='l2', solver='liblinear', C=10)) # too hard to calculate

poly_classifie1 = make_pipeline(LogisticRegression()) # 776
poly_classifie2 = make_pipeline(poly2, LogisticRegression()) # 772
poly_classifie3 = make_pipeline(poly3, LogisticRegression()) # 784
poly_classifie4 = make_pipeline(poly4, LogisticRegression()) # 801

# Create a voting ensemble classifier
ensemble_classifier = VotingClassifier(estimators=[
    ('lasso2', lasso_classifier2), 
    ('ridge3', ridge_classifier3), # 956
])

ensemble_classifier.fit(X_train, y_train)
y_pred = ensemble_classifier.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print("Improved Accuracy:", accuracy)
confusion = confusion_matrix(y_train, y_pred)
print(confusion)
metrics(y_train, y_pred)

y_pred = ensemble_classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Improved Accuracy:", accuracy)
confusion = confusion_matrix(y_val, y_pred)
print(confusion)
metrics(y_val, y_pred)


Improved Accuracy: 0.967375
[[6010   80]
 [ 181 1729]]
Accuracy: 0.967375
Precision: 0.9557766721945826
Recall (Sensitivity): 0.9052356020942408
F1 Score: 0.9298198440440979
ROC AUC: 0.9460496565479415
Improved Accuracy: 0.957
[[743  16]
 [ 27 214]]
Accuracy: 0.957
Precision: 0.9304347826086956
Recall (Sensitivity): 0.8879668049792531
F1 Score: 0.9087048832271761
ROC AUC: 0.9334432180363987


In [35]:
test_data = pd.read_csv('./HW2 附件/HW2_hr-analytics_test.csv')
test_data.shape

(5000, 9)

In [36]:
test_df = test_data.copy()
col_not_num = []
for col in test_df.columns:
    if check_non_numeric_values(test_df, col):
        col_not_num.append(col)

print(f"Doing onehot encoding on col: {col_not_num}")
for col in col_not_num:
    test_df = onehot_encoding(test_df, col)

Doing onehot encoding on col: ['sales', 'salary']
(5000, 9) -> (5000, 18)
(5000, 18) -> (5000, 20)


In [37]:
test_size = 0.2
random_state = 42

trans = True
if trans: # 956 -> 957
    scaler = StandardScaler()
    test_df['average_montly_hours'] = scaler.fit_transform(test_df[['average_montly_hours']])

X = test_df
y_real_pred = ensemble_classifier.predict(X)
y_real_pred.shape

(5000,)

In [38]:
result = y_real_pred.reshape(-1, 1)  # Reshape to (1000, 1)
# Create a DataFrame with one column 'left'
result_df = pd.DataFrame(data=result, columns=['left'])
print(result_df)
result_df.to_csv('HW2_hr-analytics_test_sol.csv', index=False)

      left
0        0
1        0
2        0
3        1
4        0
...    ...
4995     0
4996     1
4997     0
4998     0
4999     1

[5000 rows x 1 columns]


In [41]:
print(X['average_montly_hours'])

0      -0.827940
1      -0.927944
2       0.352100
3       1.272131
4      -1.307957
          ...   
4995    0.012088
4996   -1.487963
4997    1.132127
4998   -0.427927
4999   -0.867942
Name: average_montly_hours, Length: 5000, dtype: float64
