In [17]:
import pandas as pd
import numpy as np

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [28]:
# From the Python interpreter we can move around the operating system's directories,
# much like browsing folders in Windows Explorer or the macOS Finder.

# It is less point-and-click, though: instead of buttons we use a couple of functions
# from the standard-library `os` module. First, import it into the running session.
import os

In [29]:
# os.getcwd() shows the current working directory, so we can see where we are.
# cwd = current working directory

os.getcwd()

'/home/jovyan/demo'

In [31]:
# From here we can load the already familiar HRDataset_v14 file by referring to its path.
# NOTE(review): the original note said the file lives in a neighbouring directory "2",
# but the path below has no directory component, so it is resolved against the current
# working directory -- confirm where the CSV actually is.

df_hr = pd.read_csv('HRDataset_v14 (1).csv')

In [32]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

from matplotlib import pyplot

In [33]:
pd.options.display.max_columns = 999

In [34]:
df_hr.tail()

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,Termd,PositionID,Position,State,Zip,DOB,Sex,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,DateofHire,DateofTermination,TermReason,EmploymentStatus,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences
306,"Woodson, Jason",10135,0,0,1,1,5,3,0,65893,0,20,Production Technician II,MA,1810,05/11/85,M,Single,US Citizen,No,White,7/7/2014,,N/A-StillEmployed,Active,Production,Kissy Sullivan,20.0,LinkedIn,Fully Meets,4.07,4,0,2/28/2019,0,13
307,"Ybarra, Catherine",10301,0,0,0,5,5,1,0,48513,1,19,Production Technician I,MA,2458,05/04/82,F,Single,US Citizen,No,Asian,9/2/2008,9/29/2015,Another position,Voluntarily Terminated,Production,Brannon Miller,12.0,Google Search,PIP,3.2,2,0,9/2/2015,5,4
308,"Zamora, Jennifer",10010,0,0,0,1,3,4,0,220450,0,6,CIO,MA,2067,08/30/79,F,Single,US Citizen,No,White,4/10/2010,,N/A-StillEmployed,Active,IT/IS,Janet King,2.0,Employee Referral,Exceeds,4.6,5,6,2/21/2019,0,16
309,"Zhou, Julia",10043,0,0,0,1,3,3,0,89292,0,9,Data Analyst,MA,2148,02/24/79,F,Single,US Citizen,No,White,3/30/2015,,N/A-StillEmployed,Active,IT/IS,Simon Roup,4.0,Employee Referral,Fully Meets,5.0,3,5,2/1/2019,0,11
310,"Zima, Colleen",10271,0,4,0,1,5,3,0,45046,0,19,Production Technician I,MA,1730,08/17/78,F,Widowed,US Citizen,No,Asian,9/29/2014,,N/A-StillEmployed,Active,Production,David Stanley,14.0,LinkedIn,Fully Meets,4.5,5,0,1/30/2019,0,2


In [35]:
df_hr['State'].nunique()

28

In [36]:
# Target: salary, kept as a one-column DataFrame (note the double brackets).
y = df_hr[['Salary']]

# Feature matrix: seven numeric ID / score columns taken straight from the raw frame.
X = df_hr[[
    'MarriedID',
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'PositionID',
    'EmpSatisfaction',
    'SpecialProjectsCount',
]]

In [37]:
# В объекте X у нас оказываются признаки

X

Unnamed: 0,MarriedID,MaritalStatusID,DeptID,PerfScoreID,PositionID,EmpSatisfaction,SpecialProjectsCount
0,0,0,5,4,19,5,0
1,1,1,3,3,27,3,6
2,1,1,5,3,20,3,0
3,1,1,5,3,19,5,0
4,0,2,5,3,19,4,0
...,...,...,...,...,...,...,...
306,0,0,5,3,20,4,0
307,0,0,5,1,19,2,0
308,0,0,3,4,6,5,6
309,0,0,3,3,9,3,5


In [38]:
# в объекте 'y' у нас оказываются ответы для этих признаков

y

Unnamed: 0,Salary
0,62506
1,104437
2,64955
3,64991
4,50825
...,...
306,65893
307,48513
308,220450
309,89292


In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of every resulting piece, one shape per line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)), sep="\n")

(248, 7)
(63, 7)
(248, 1)
(63, 1)


In [41]:
Y_test

Unnamed: 0,Salary
290,88976
9,50178
57,83552
60,65729
25,103613
...,...
195,74417
210,68829
224,46799
158,66074


In [42]:
# Linear regression model for predicting Salary from the selected features.
lin_model = LinearRegression()

# fit() is the last expression of the cell, so the fitted estimator's repr is displayed.
lin_model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
# Look at the fitted coefficients, one per feature column of X_train.
# NOTE(review): the original comment reads these as "the most significant features for
# determining salary", but the features here are not scaled, so the raw coefficient
# magnitudes are not directly comparable -- treat this as a first look, not a ranking.
importance = lin_model.coef_

print(importance)

[[ 1517.74204556 -1347.40107733 -2866.59535406  2891.55636358
   -182.22564328   698.45118922  3542.27308623]]


In [44]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [45]:
# Define a function that evaluates the error of the fitted model:

def model_access(lin_model, X_train, X_test, Y_train, Y_test):
    """Print RMSE and R2 of a fitted model on the training and testing splits.

    Parameters
    ----------
    lin_model : fitted estimator exposing ``predict``.
    X_train, X_test : feature sets passed to ``lin_model.predict``.
    Y_train, Y_test : true targets the predictions are compared against.

    Returns
    -------
    None. The report is written to stdout only.
    """
    divider = "--------------------------------------"

    def _scores(features, targets):
        # Predict once, then derive both metrics from the same predictions.
        predicted = lin_model.predict(features)
        return (np.sqrt(mean_squared_error(targets, predicted)),
                r2_score(targets, predicted))

    # Training split: blank lines before and after, and a closing divider.
    train_rmse, train_r2 = _scores(X_train, Y_train)
    train_report = (
        "\n",
        "The model performance for training set",
        divider,
        'RMSE is {}'.format(train_rmse),
        'R2 score is {}'.format(train_r2),
        divider,
        "\n",
    )
    for line in train_report:
        print(line)

    # Testing split: same layout, but without the surrounding blank lines
    # and without a closing divider.
    test_rmse, test_r2 = _scores(X_test, Y_test)
    test_report = (
        "The model performance for testing set",
        divider,
        'RMSE is {}'.format(test_rmse),
        'R2 score is {}'.format(test_r2),
    )
    for line in test_report:
        print(line)

In [46]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 21191.638523978876
R2 score is 0.22959801927272339
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22109.817459972408
R2 score is 0.396060686220037


In [47]:
X_test

Unnamed: 0,MarriedID,MaritalStatusID,DeptID,PerfScoreID,PositionID,EmpSatisfaction,SpecialProjectsCount
290,0,2,5,3,17,3,0
9,0,2,3,3,14,5,6
57,1,1,3,3,9,3,6
60,0,0,6,3,21,4,0
25,0,0,3,3,30,5,7
...,...,...,...,...,...,...,...
195,0,3,5,3,20,5,0
210,0,0,6,3,3,5,0
224,0,2,5,3,19,4,0
158,0,2,5,3,20,3,0


In [48]:
# Predict the salary for one hand-written employee. The values are positional and follow
# the column order the model was fitted on: MarriedID, MaritalStatusID, DeptID,
# PerfScoreID, PositionID, EmpSatisfaction, SpecialProjectsCount.
lin_model.predict([[0,1,2,3,9,3,0]])

array([[71503.70706851]])

In [49]:
"""

У нас есть еще ряд признаков, которые интуитивно кажутся сильно связаны с зарплатой: география сотрудников
Есть как минимум Zip (почтовый индекс) и State (штат) сотрудника. 

Zip целочисленный. Можем просто добавить его в модель.

"""
# Same seven features as the first model plus 'Zip', passed to the model as a plain number.
# NOTE(review): a ZIP code is a label rather than a quantity -- confirm that a linear
# term in it is really what is intended here.
X = df_hr[['MarriedID', 'MaritalStatusID', 'DeptID', 'PerfScoreID', 
           'PositionID', 'EmpSatisfaction', 'SpecialProjectsCount', 'Zip']]

# Same split parameters as before, so the train/test rows are the same as in the first model.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Re-create and re-fit the model; fit() is the cell's last expression, so its repr is displayed.
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [50]:
importance = lin_model.coef_
print(importance)

[[ 1.65359000e+03 -1.38771388e+03 -3.21744803e+03  2.93882137e+03
  -1.06352820e+02  8.01552657e+02  3.47567478e+03  5.79854070e-02]]


In [51]:
# Inspect the model's prediction for the first row of the test split.
y_test_predict = lin_model.predict(X_test)
y_test_predict[0]

array([59638.16329937])

In [52]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 21180.617448845645
R2 score is 0.23039913243314092
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22150.65746021619
R2 score is 0.39382750075374506


In [53]:
from sklearn.preprocessing import MinMaxScaler

In [54]:
def scaling(df):
    """Min-max scale every column of ``df`` and return the result as a DataFrame.

    A fresh ``MinMaxScaler`` (default settings) is fitted on ``df`` itself and
    applied to it, so each column is rescaled using its own minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like of numeric values.

    Returns
    -------
    pandas.DataFrame
        Scaled values. When ``df`` is a DataFrame its column labels and index
        are carried over; otherwise pandas' default integer labels are used.
    """
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(df)
    # fit_transform returns a bare ndarray; put the original labels back so the
    # scaled columns stay identifiable and the rows keep the input's index.
    df_scaled = pd.DataFrame(
        data_scaled,
        index=getattr(df, "index", None),
        columns=getattr(df, "columns", None),
    )
    return df_scaled

In [55]:
# Eight-feature model (including Zip) again, this time on min-max scaled features.
X = df_hr[[
    'MarriedID',
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'PositionID',
    'EmpSatisfaction',
    'SpecialProjectsCount',
    'Zip',
]]

# NOTE(review): scaling() fits its scaler on every row of X, and the split happens
# only afterwards, so the test rows take part in computing each column's min/max.
X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[  1653.59000082  -5550.85550861 -16087.24016112   8816.46412093
   -3084.23176896   3206.21062603  27805.39820063   5626.84591458]]


In [56]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 21180.617448845645
R2 score is 0.2303991324331408
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22150.65746021613
R2 score is 0.3938275007537484


In [57]:
# Reduced feature set: MarriedID, PositionID and EmpSatisfaction are left out
# compared with the eight-column model.
X = df_hr[[
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'SpecialProjectsCount',
    'Zip',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -4871.98589343 -16094.21081832   9667.77613845  28680.33441284
    6974.11377811]]


In [58]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 21212.984182557466
R2 score is 0.22804523516318365
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22106.636387933588
R2 score is 0.3962344584605577


In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
# Take the 'State' column out into its own one-column DataFrame.
temp_df = pd.DataFrame(df_hr[['State']])

# Create a LabelEncoder instance.
labelencoder = LabelEncoder()

# Map every distinct state to an integer code and store the codes in a new column.
# This adds 'State_id' to df_hr in place.
# NOTE(review): the codes are arbitrary labels with no meaningful order, yet later cells
# feed 'State_id' to a linear model as a number; scikit-learn documents LabelEncoder as
# a target (y) encoder -- consider one-hot encoding for a nominal feature like this.
df_hr['State_id'] = labelencoder.fit_transform(temp_df['State'])

df_hr.head(10)

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,Termd,PositionID,Position,State,Zip,DOB,Sex,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,DateofHire,DateofTermination,TermReason,EmploymentStatus,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences,State_id
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,0,19,Production Technician I,MA,1960,07/10/83,M,Single,US Citizen,No,White,7/5/2011,,N/A-StillEmployed,Active,Production,Michael Albert,22.0,LinkedIn,Exceeds,4.6,5,0,1/17/2019,0,1,10
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,1,27,Sr. DBA,MA,2148,05/05/75,M,Married,US Citizen,No,White,3/30/2015,6/16/2016,career change,Voluntarily Terminated,IT/IS,Simon Roup,4.0,Indeed,Fully Meets,4.96,3,6,2/24/2016,0,17,10
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,1,20,Production Technician II,MA,1810,09/19/88,F,Married,US Citizen,No,White,7/5/2011,9/24/2012,hours,Voluntarily Terminated,Production,Kissy Sullivan,20.0,LinkedIn,Fully Meets,3.02,3,0,5/15/2012,0,3,10
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,0,19,Production Technician I,MA,1886,09/27/88,F,Married,US Citizen,No,White,1/7/2008,,N/A-StillEmployed,Active,Production,Elijiah Gray,16.0,Indeed,Fully Meets,4.84,5,0,1/3/2019,0,15,10
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,1,19,Production Technician I,MA,2169,09/08/89,F,Divorced,US Citizen,No,White,7/11/2011,9/6/2016,return to school,Voluntarily Terminated,Production,Webster Butler,39.0,Google Search,Fully Meets,5.0,4,0,2/1/2016,0,2,10
5,"Anderson, Linda",10002,0,0,0,1,5,4,0,57568,0,19,Production Technician I,MA,1844,05/22/77,F,Single,US Citizen,No,White,1/9/2012,,N/A-StillEmployed,Active,Production,Amy Dunn,11.0,LinkedIn,Exceeds,5.0,5,0,1/7/2019,0,15,10
6,"Andreola, Colby",10194,0,0,0,1,4,3,0,95660,0,24,Software Engineer,MA,2110,05/24/79,F,Single,US Citizen,No,White,11/10/2014,,N/A-StillEmployed,Active,Software Engineering,Alex Sweetwater,10.0,LinkedIn,Fully Meets,3.04,3,4,1/2/2019,0,19,10
7,"Athwal, Sam",10062,0,4,1,1,5,3,0,59365,0,19,Production Technician I,MA,2199,02/18/83,M,Widowed,US Citizen,No,White,9/30/2013,,N/A-StillEmployed,Active,Production,Ketsia Liebig,19.0,Employee Referral,Fully Meets,5.0,4,0,2/25/2019,0,19,10
8,"Bachiochi, Linda",10114,0,0,0,3,5,3,1,47837,0,19,Production Technician I,MA,1902,02/11/70,F,Single,US Citizen,No,Black or African American,7/6/2009,,N/A-StillEmployed,Active,Production,Brannon Miller,12.0,Diversity Job Fair,Fully Meets,4.46,3,0,1/25/2019,0,4,10
9,"Bacong, Alejandro",10250,0,2,1,1,3,3,0,50178,0,14,IT Support,MA,1886,01/07/88,M,Divorced,US Citizen,No,White,1/5/2015,,N/A-StillEmployed,Active,IT/IS,Peter Monroe,7.0,Indeed,Fully Meets,5.0,5,6,2/18/2019,0,16,10


In [61]:
# Run the model once more with the State_id feature
# (it takes the place of Zip in the previous five-column feature list).
X = df_hr[[
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'SpecialProjectsCount',
    'State_id',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -6083.46971713 -18526.89484848   9488.89077324  28121.72621546
   35602.76331095]]


In [62]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 20917.527709080798
R2 score is 0.24939920189023346
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22423.457175588028
R2 score is 0.3788047443334699


In [63]:
import matplotlib.pyplot as plt

In [64]:
# Histogram of the number of employees per encoded state.
# State_id holds consecutive integer codes starting at 0, so the bin edges are placed
# on the half-integers (-0.5, 0.5, ..., max + 0.5): every code gets a bin of its own.
# A plain `bins=27` spread 27 bins over 28 distinct codes, and because the last bin is
# closed on both sides the two highest codes were counted together.
plt.hist(df_hr['State_id'], color = 'blue', edgecolor = 'black',
         bins = np.arange(df_hr['State_id'].max() + 2) - 0.5)

# Add labels
plt.title('Histogram of employees across the States in our company')
plt.xlabel('State_id')
plt.ylabel('Amount of employees')

Text(0, 0.5, 'Amount of employees')

In [75]:
# State_id model extended with two attendance features: Absences and DaysLateLast30.
X = df_hr[[
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'SpecialProjectsCount',
    'State_id',
    'Absences',
    'DaysLateLast30',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -5478.29985566 -20917.77827005  19037.98544043  27344.23923201
   34999.73952768   7078.51844914  12739.82983469]]


In [76]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 20712.06475008665
R2 score is 0.264072374065648
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22156.15118436136
R2 score is 0.39352678214380865


In [77]:
# Previous feature list plus GenderID.
X = df_hr[[
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'SpecialProjectsCount',
    'State_id',
    'Absences',
    'DaysLateLast30',
    'GenderID',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -5474.36101693 -21031.76583477  18975.87526734  27195.6024111
   35046.74054762   7066.71270995  12594.02186959    548.33989308]]


In [78]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 20710.314300170838
R2 score is 0.264196760507178
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22153.395701087476
R2 score is 0.3936776226826595


In [79]:
# Take the 'MaritalDesc' column out into its own one-column DataFrame.
temp_df = pd.DataFrame(df_hr[['MaritalDesc']])

# Create a LabelEncoder instance.
labelencoder = LabelEncoder()

# Map every distinct marital description to an integer code and store the codes in a
# new column. This adds 'MaritalDesc_id' to df_hr in place.
# NOTE(review): judging by the displayed rows, MaritalStatusID already encodes the same
# categories (with different numbers), so this column looks redundant -- confirm.
# NOTE(review): the recorded output of this cell already contains a 'ManagerID_id'
# column, which is only created in a later cell; the notebook was evidently run out of
# order and should be re-checked with Restart & Run All.
df_hr['MaritalDesc_id'] = labelencoder.fit_transform(temp_df['MaritalDesc'])

df_hr.head(10)

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,Termd,PositionID,Position,State,Zip,DOB,Sex,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,DateofHire,DateofTermination,TermReason,EmploymentStatus,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences,State_id,ManagerID_id,MaritalDesc_id
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,0,19,Production Technician I,MA,1960,07/10/83,M,Single,US Citizen,No,White,7/5/2011,,N/A-StillEmployed,Active,Production,Michael Albert,17,LinkedIn,Exceeds,4.6,5,0,1/17/2019,0,1,10,17,3
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,1,27,Sr. DBA,MA,2148,05/05/75,M,Married,US Citizen,No,White,3/30/2015,6/16/2016,career change,Voluntarily Terminated,IT/IS,Simon Roup,19,Indeed,Fully Meets,4.96,3,6,2/24/2016,0,17,10,19,1
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,1,20,Production Technician II,MA,1810,09/19/88,F,Married,US Citizen,No,White,7/5/2011,9/24/2012,hours,Voluntarily Terminated,Production,Kissy Sullivan,15,LinkedIn,Fully Meets,3.02,3,0,5/15/2012,0,3,10,15,1
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,0,19,Production Technician I,MA,1886,09/27/88,F,Married,US Citizen,No,White,1/7/2008,,N/A-StillEmployed,Active,Production,Elijiah Gray,8,Indeed,Fully Meets,4.84,5,0,1/3/2019,0,15,10,8,1
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,1,19,Production Technician I,MA,2169,09/08/89,F,Divorced,US Citizen,No,White,7/11/2011,9/6/2016,return to school,Voluntarily Terminated,Production,Webster Butler,20,Google Search,Fully Meets,5.0,4,0,2/1/2016,0,2,10,20,0
5,"Anderson, Linda",10002,0,0,0,1,5,4,0,57568,0,19,Production Technician I,MA,1844,05/22/77,F,Single,US Citizen,No,White,1/9/2012,,N/A-StillEmployed,Active,Production,Amy Dunn,1,LinkedIn,Exceeds,5.0,5,0,1/7/2019,0,15,10,1,3
6,"Andreola, Colby",10194,0,0,0,1,4,3,0,95660,0,24,Software Engineer,MA,2110,05/24/79,F,Single,US Citizen,No,White,11/10/2014,,N/A-StillEmployed,Active,Software Engineering,Alex Sweetwater,0,LinkedIn,Fully Meets,3.04,3,4,1/2/2019,0,19,10,0,3
7,"Athwal, Sam",10062,0,4,1,1,5,3,0,59365,0,19,Production Technician I,MA,2199,02/18/83,M,Widowed,US Citizen,No,White,9/30/2013,,N/A-StillEmployed,Active,Production,Ketsia Liebig,14,Employee Referral,Fully Meets,5.0,4,0,2/25/2019,0,19,10,14,4
8,"Bachiochi, Linda",10114,0,0,0,3,5,3,1,47837,0,19,Production Technician I,MA,1902,02/11/70,F,Single,US Citizen,No,Black or African American,7/6/2009,,N/A-StillEmployed,Active,Production,Brannon Miller,4,Diversity Job Fair,Fully Meets,4.46,3,0,1/25/2019,0,4,10,4,3
9,"Bacong, Alejandro",10250,0,2,1,1,3,3,0,50178,0,14,IT Support,MA,1886,01/07/88,M,Divorced,US Citizen,No,White,1/5/2015,,N/A-StillEmployed,Active,IT/IS,Peter Monroe,18,Indeed,Fully Meets,5.0,5,6,2/18/2019,0,16,10,18,0


In [80]:
# Previous feature list plus the label-encoded MaritalDesc_id.
X = df_hr[[
    'MaritalStatusID',
    'DeptID',
    'PerfScoreID',
    'SpecialProjectsCount',
    'State_id',
    'Absences',
    'DaysLateLast30',
    'GenderID',
    'MaritalDesc_id',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -4544.7593338  -21346.38406656  18938.33402441  27110.18140897
   35119.95643333   7146.31210443  12603.73918964    543.18607384
    1481.9644627 ]]


In [81]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 20707.22034808237
R2 score is 0.2644165900943376
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 22129.599743586445
R2 score is 0.39497947932117294


In [82]:
# Take the 'ManagerID' column out into its own one-column DataFrame.
temp_df = pd.DataFrame(df_hr[['ManagerID']])

# Create a LabelEncoder instance.
labelencoder = LabelEncoder()

# Re-number the manager IDs as integer codes and store them in a new column.
# This adds 'ManagerID_id' to df_hr in place.
# NOTE(review): in the first displayed rows 'ManagerID' held values such as 22.0 and 4.0,
# while the output recorded below shows 17 and 19 in the same column -- it looks as if
# 'ManagerID' was overwritten by a cell that is no longer in the notebook; verify with
# Restart & Run All.
df_hr['ManagerID_id'] = labelencoder.fit_transform(temp_df['ManagerID'])

df_hr.head(10)

Unnamed: 0,Employee_Name,EmpID,MarriedID,MaritalStatusID,GenderID,EmpStatusID,DeptID,PerfScoreID,FromDiversityJobFairID,Salary,Termd,PositionID,Position,State,Zip,DOB,Sex,MaritalDesc,CitizenDesc,HispanicLatino,RaceDesc,DateofHire,DateofTermination,TermReason,EmploymentStatus,Department,ManagerName,ManagerID,RecruitmentSource,PerformanceScore,EngagementSurvey,EmpSatisfaction,SpecialProjectsCount,LastPerformanceReview_Date,DaysLateLast30,Absences,State_id,ManagerID_id,MaritalDesc_id
0,"Adinolfi, Wilson K",10026,0,0,1,1,5,4,0,62506,0,19,Production Technician I,MA,1960,07/10/83,M,Single,US Citizen,No,White,7/5/2011,,N/A-StillEmployed,Active,Production,Michael Albert,17,LinkedIn,Exceeds,4.6,5,0,1/17/2019,0,1,10,17,3
1,"Ait Sidi, Karthikeyan",10084,1,1,1,5,3,3,0,104437,1,27,Sr. DBA,MA,2148,05/05/75,M,Married,US Citizen,No,White,3/30/2015,6/16/2016,career change,Voluntarily Terminated,IT/IS,Simon Roup,19,Indeed,Fully Meets,4.96,3,6,2/24/2016,0,17,10,19,1
2,"Akinkuolie, Sarah",10196,1,1,0,5,5,3,0,64955,1,20,Production Technician II,MA,1810,09/19/88,F,Married,US Citizen,No,White,7/5/2011,9/24/2012,hours,Voluntarily Terminated,Production,Kissy Sullivan,15,LinkedIn,Fully Meets,3.02,3,0,5/15/2012,0,3,10,15,1
3,"Alagbe,Trina",10088,1,1,0,1,5,3,0,64991,0,19,Production Technician I,MA,1886,09/27/88,F,Married,US Citizen,No,White,1/7/2008,,N/A-StillEmployed,Active,Production,Elijiah Gray,8,Indeed,Fully Meets,4.84,5,0,1/3/2019,0,15,10,8,1
4,"Anderson, Carol",10069,0,2,0,5,5,3,0,50825,1,19,Production Technician I,MA,2169,09/08/89,F,Divorced,US Citizen,No,White,7/11/2011,9/6/2016,return to school,Voluntarily Terminated,Production,Webster Butler,20,Google Search,Fully Meets,5.0,4,0,2/1/2016,0,2,10,20,0
5,"Anderson, Linda",10002,0,0,0,1,5,4,0,57568,0,19,Production Technician I,MA,1844,05/22/77,F,Single,US Citizen,No,White,1/9/2012,,N/A-StillEmployed,Active,Production,Amy Dunn,1,LinkedIn,Exceeds,5.0,5,0,1/7/2019,0,15,10,1,3
6,"Andreola, Colby",10194,0,0,0,1,4,3,0,95660,0,24,Software Engineer,MA,2110,05/24/79,F,Single,US Citizen,No,White,11/10/2014,,N/A-StillEmployed,Active,Software Engineering,Alex Sweetwater,0,LinkedIn,Fully Meets,3.04,3,4,1/2/2019,0,19,10,0,3
7,"Athwal, Sam",10062,0,4,1,1,5,3,0,59365,0,19,Production Technician I,MA,2199,02/18/83,M,Widowed,US Citizen,No,White,9/30/2013,,N/A-StillEmployed,Active,Production,Ketsia Liebig,14,Employee Referral,Fully Meets,5.0,4,0,2/25/2019,0,19,10,14,4
8,"Bachiochi, Linda",10114,0,0,0,3,5,3,1,47837,0,19,Production Technician I,MA,1902,02/11/70,F,Single,US Citizen,No,Black or African American,7/6/2009,,N/A-StillEmployed,Active,Production,Brannon Miller,4,Diversity Job Fair,Fully Meets,4.46,3,0,1/25/2019,0,4,10,4,3
9,"Bacong, Alejandro",10250,0,2,1,1,3,3,0,50178,0,14,IT Support,MA,1886,01/07/88,M,Divorced,US Citizen,No,White,1/5/2015,,N/A-StillEmployed,Active,IT/IS,Peter Monroe,18,Indeed,Fully Meets,5.0,5,6,2/18/2019,0,16,10,18,0


In [83]:
# Final feature set: manager code, department, performance, satisfaction, projects,
# position, marital description, lateness and gender.
X = df_hr[[
    'ManagerID_id',
    'DeptID',
    'PerfScoreID',
    'EmpSatisfaction',
    'SpecialProjectsCount',
    'PositionID',
    'MaritalDesc_id',
    'DaysLateLast30',
    'GenderID',
]]

X_scaled = scaling(X)

X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so creation and fitting can be chained.
lin_model = LinearRegression().fit(X_train, Y_train)

importance = lin_model.coef_
print(importance)

[[ -5965.77248898 -13395.40569508  16564.78723884   3154.78898457
   29969.46054642  -6002.5361129    1866.21172469  10014.15351552
     749.27972228]]


In [84]:
model_access(lin_model, X_train, X_test, Y_train, Y_test)



The model performance for training set
--------------------------------------
RMSE is 21083.228626261603
R2 score is 0.2374601362658998
--------------------------------------


The model performance for testing set
--------------------------------------
RMSE is 21646.28239170192
R2 score is 0.4211185582900909


In [None]:
В результате анализа можно выделить следующие факторы:

ManagerID - менеджер - Можно предположить отсутствие менеджеров у некоторых сотрудников с высокой позицией

SpecialProjectsCount - большинство сотрудников не имеет проектов