In [58]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Load in our dataset

In [59]:
# Remote CSV hosted in the EAFP GitHub repository.
# NOTE(review): the file name suggests the rows were already SMOTE-oversampled
# before this notebook does its train/test split; if so, synthetic rows can end
# up in the test set and inflate the reported accuracy - confirm.
DATA_URL = "https://raw.githubusercontent.com/afnanrahman/EAFP/main/data/clean_smote_data.csv"

data = pd.read_csv(DATA_URL)
data

Unnamed: 0.1,Unnamed: 0,age,business_travel,daily_rate,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,hourly_rate,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_rate,num_companies_worked,over_time,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_in_current_role,years_with_curr_manager,rate_avg,attrition
0,0.000000,41.00,2.00,1102.00,2.00,1.00,2.00,1.00,1.00,2.00,0.00,94.00,3.00,2.00,7.00,4.00,2.00,19479.00,8.00,1.00,11.00,3.00,1.00,0.00,8.00,0.00,1.00,6.00,4.00,5.00,6891.67,1.0
1,1.000000,49.00,1.00,279.00,1.00,8.00,1.00,1.00,2.00,3.00,1.00,61.00,2.00,2.00,6.00,2.00,1.00,24907.00,1.00,0.00,23.00,4.00,4.00,1.00,10.00,3.00,3.00,10.00,7.00,7.00,8415.67,0.0
2,2.000000,37.00,2.00,1373.00,1.00,2.00,2.00,4.00,4.00,4.00,1.00,92.00,2.00,1.00,2.00,3.00,2.00,2396.00,6.00,1.00,15.00,3.00,2.00,0.00,7.00,3.00,3.00,0.00,0.00,0.00,1287.00,1.0
3,3.000000,33.00,1.00,1392.00,1.00,3.00,4.00,1.00,5.00,4.00,0.00,56.00,3.00,1.00,6.00,3.00,1.00,23159.00,1.00,1.00,11.00,3.00,3.00,0.00,8.00,3.00,3.00,8.00,7.00,0.00,8202.33,0.0
4,4.000000,27.00,2.00,591.00,1.00,2.00,1.00,3.00,7.00,1.00,1.00,40.00,3.00,1.00,2.00,2.00,1.00,16632.00,9.00,0.00,12.00,3.00,4.00,1.00,6.00,3.00,3.00,2.00,2.00,2.00,5754.33,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2461,1206.818985,33.12,2.00,281.20,1.87,24.12,3.00,1.50,1692.12,2.75,1.00,84.00,3.00,1.87,6.37,3.62,2.00,13431.35,1.00,0.87,18.13,3.00,3.75,0.00,8.87,2.00,3.00,8.87,6.12,6.12,4598.85,1.0
2462,1040.748336,34.31,0.31,526.08,1.00,4.19,3.00,1.62,1469.25,1.31,0.31,64.94,2.31,1.00,2.00,1.00,2.00,17123.82,2.00,1.00,19.25,3.69,2.31,0.00,8.81,2.00,3.38,0.69,0.00,0.00,5904.94,1.0
2463,390.753668,26.13,1.04,584.73,1.00,2.92,1.04,4.83,522.94,2.96,0.96,71.44,2.96,1.00,5.83,1.00,1.96,6586.52,0.04,0.04,21.53,3.96,3.04,0.04,6.75,1.96,3.00,5.79,3.83,3.83,2414.23,1.0
2464,1271.359504,41.32,1.13,403.67,0.87,3.81,1.87,2.60,1781.54,3.13,1.00,93.50,2.73,1.00,5.33,2.13,0.13,11158.08,1.00,0.87,11.87,3.00,3.00,2.73,5.33,2.00,2.13,4.46,2.60,2.60,3885.08,1.0


## Define the group of features we're going to look at


In [60]:
# First feature group to evaluate; every name is a column of `data`.
cols = [
    'age',
    'job_satisfaction',
    'over_time',
    'percent_salary_hike',
    'stock_option_level',
    'rate_avg',
    'environment_satisfaction',
    'job_involvement',
    'job_level',
    'job_role',
    'education_field',
    'performance_rating',
    'total_working_years',
]

# Feature matrix (selected columns only) and the target column.
X = data[cols]
Y = data['attrition']

## Split the data into 70% training set and 30% testing set

In [61]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

## Build our Logistic Regression model, make predictions, and determine the model's accuracy.

In [62]:
# Fit a logistic-regression classifier on the training split.
# max_iter is set to 10000 to give the lbfgs solver a large iteration budget.
# TODO: solver='liblinear' gave slightly higher accuracy in an earlier run - look into why.
logistic_model = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic_model.fit(X_train, Y_train)

# Predicted class labels for the held-out rows (computed once; the original
# cell called predict() twice and discarded the first result).
predictions = logistic_model.predict(X_test)

# score() reports mean accuracy on the test split, i.e. the fraction of
# test rows whose predicted label equals Y_test.
acc = logistic_model.score(X_test, Y_test)
acc

0.7243243243243244

## Repeat process for another group of features.

## Does it matter more that the employee feels appreciated by the company, or that the employee appreciates the company (i.e. the company satisfies them)?

#### Category 1 (related to whether the employee is appreciated by the company)
* Daily Rate
* Job Involvement
* Percent Salary Hike
* Performance Rating
* Stock Option Level
* Training Times Last Year
* Years at Company
* Years in Current Role
* Years with Current Manager



#### Category 2 (related to whether the company is appreciated by the employee)
* Environment Satisfaction
* Job Satisfaction
* Overtime
* Work Life Balance
* Relationship Satisfaction


In [63]:
# Feature group for the "appreciation" question: the first nine columns are
# the "employee is appreciated by company" features, the last five are the
# "company is appreciated by employee" features.
# NOTE(review): both categories are fit together in a single model here, so
# this accuracy alone does not compare one category against the other.
cols = [
    # appreciated by the company
    'daily_rate', 'job_involvement', 'percent_salary_hike',
    'performance_rating', 'stock_option_level', 'training_times_last_year',
    'years_at_company', 'years_in_current_role', 'years_with_curr_manager',
    # appreciates the company
    'environment_satisfaction', 'job_satisfaction', 'over_time',
    'work_life_balance', 'relationship_satisfaction',
]

X = data[cols]
Y = data['attrition']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

logistic_model = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic_model.fit(X_train, Y_train)

# Predicted class labels for the held-out rows (computed once; the original
# cell called predict() twice and discarded the first result).
predictions = logistic_model.predict(X_test)

# Mean accuracy on the test split.
acc = logistic_model.score(X_test, Y_test)
acc

0.7405405405405405

## How might an employee’s growth at the company affect attrition?
## And how might an employee's stability in life affect attrition?

#### Category 1 (related to growth)
* Daily Rate
* Department
* Hourly Rate
* Job Involvement
* Job Level
* Job Role
* Monthly Rate
* Overtime
* Percent Salary Hike
* Performance Rating
* Total Working Years
* Training Times Last Year
* Years at Company


#### Category 2 (related to stability)
* Age
* Job Satisfaction
* Marital status
* Work Life Balance
* Years in Current Role
* Years with Current Manager


In [45]:
# Feature group for the "growth vs. stability" question: the first thirteen
# columns are the growth-related features, the last six are the
# stability-related features.
# NOTE(review): both categories are fit together in a single model here, so
# this accuracy alone does not separate the effect of growth from stability.
cols = [
    # growth
    'daily_rate', 'department', 'hourly_rate', 'job_involvement',
    'job_level', 'job_role', 'monthly_rate', 'over_time',
    'percent_salary_hike', 'performance_rating', 'total_working_years',
    'training_times_last_year', 'years_at_company',
    # stability
    'age', 'job_satisfaction', 'marital_status', 'work_life_balance',
    'years_in_current_role', 'years_with_curr_manager',
]

X = data[cols]
Y = data['attrition']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

logistic_model = LogisticRegression(solver='lbfgs', max_iter=10000)
logistic_model.fit(X_train, Y_train)

# Predicted class labels for the held-out rows (computed once; the original
# cell called predict() twice and discarded the first result).
predictions = logistic_model.predict(X_test)

# Mean accuracy on the test split, i.e. the fraction of test rows whose
# predicted label equals Y_test.
acc = logistic_model.score(X_test, Y_test)
acc

0.7243243243243244

## Experimented with other feature combinations; kept this one because it had the highest accuracy of those tried

In [57]:
# Hand-picked feature combination that gave the highest test accuracy among
# the combinations tried in this notebook.
cols = [
    'job_involvement', 'percent_salary_hike', 'stock_option_level',
    'performance_rating', 'years_at_company', 'environment_satisfaction',
    'job_satisfaction', 'relationship_satisfaction', 'work_life_balance',
    'over_time', 'years_in_current_role', 'daily_rate', 'job_level',
]

X = data[cols]
Y = data['attrition']

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# NOTE(review): this cell uses max_iter=1000 while the other model cells use
# max_iter=10000 - confirm the difference is intentional before comparing
# accuracies across cells.
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000)
logistic_model.fit(X_train, Y_train)

# Predicted class labels for the held-out rows (computed once; the original
# cell called predict() twice and discarded the first result).
predictions = logistic_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
accuracy = np.sum(predictions == Y_test) / len(predictions)
accuracy

0.7472972972972973