In [115]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from tpot import TPOTClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score

# 数据加载
train_data = pd.read_csv('./Attrition_train.csv')
test_data = pd.read_csv('./Attrition_test.csv')
test_data_1 = pd.read_csv('./Attrition_test.csv')

# 数据处理
train_data = train_data.drop(columns=['user_id','EmployeeCount','EmployeeNumber','Over18','StandardHours'])
train_data = train_data.drop(columns=['DailyRate','HourlyRate','MonthlyRate'])
test_data = test_data.drop(columns=['user_id','EmployeeCount','EmployeeNumber','Over18','StandardHours'])
test_data = test_data.drop(columns=['DailyRate','HourlyRate','MonthlyRate'])

train_data.loc[train_data.Attrition == 'Yes','Attrition'] = 1
train_data.loc[train_data.Attrition == 'No','Attrition'] = 0

train_data['NumCompaniesWorkedPerYear'] = train_data.apply(lambda x: x['NumCompaniesWorked'] / (x['TotalWorkingYears']+1), axis=1)
train_data = train_data.drop(columns=['NumCompaniesWorked','TotalWorkingYears'])
test_data['NumCompaniesWorkedPerYear'] = test_data.apply(lambda x: x['NumCompaniesWorked'] / (x['TotalWorkingYears']+1), axis=1)
test_data = test_data.drop(columns=['NumCompaniesWorked','TotalWorkingYears'])

'''
#探索离散参数与离职的关系
for i in train_data.columns:
    if train_data[i].dtype == 'O':
        print(i + ':')
        print((train_data[train_data['Attrition'] == 1][i].value_counts()/train_data[i].value_counts()).sort_values(ascending=False))
        print('-----------------------')
'''

'''
#探索年龄与离职的关系
plt.figure(figsize=(10,5))
sns.barplot(x='Age', y='Attrition', data = train_data )
'''

#重新划分年龄标识
def resetAge(name):
    if (name >= 18) & (name < 22)  & (name == 58):
        return 1
    elif (name == 54) & (name == 57) & (name > 58) :
        return 0
    else:
        return 2
'''
#探索月薪与离职的关系
facet = sns.FacetGrid(train_data,hue = 'Attrition' ,aspect=3)
facet.map(sns.kdeplot,'MonthlyIncome',shade = True)
facet.set(xlim=(0,train_data['MonthlyIncome'].max()))
facet.add_legend()
'''
#重新划分月薪标识
def resetSalary(s):
    if s>0 & s<3725:
        return 0
    elif s>=3725 & s<8500:
        return 1
    else:
        return 2
    
'''
#探索涨薪幅度与离职的关系
plt.figure(figsize=(14,5))
sns.barplot(x='PercentSalaryHike', y='Attrition', data = train_data , palette = 'Set2')
'''
def resetPerHike(s):
    if (s >= 22 & s < 25) | (s == 11):
        return 0
    elif (s >= 12 & s < 14) | (s == 17):
        return 1
    elif (s >24):
        return 3
    else:
        return 2
'''
#探索每年就职公司数与离职的关系
facet = sns.FacetGrid(train_data,hue = 'Attrition' ,aspect=3)
facet.map(sns.kdeplot,'NumCompaniesWorkedPerYear',shade = True)
facet.set(xlim=(0,train_data['NumCompaniesWorkedPerYear'].max()))
facet.add_legend()
'''
#重新划分就职公司数标识
def resetWorkingCompanyNum(s):
    if (s > 0) & (s < 0.34):
        return 0
    elif (s >= 0.34) & (s < 1.25):
        return 1
    else:
        return 2
'''
#探索在本公司工作年限与离职的关系
plt.figure(figsize=(14,5))
sns.barplot(x='YearsAtCompany', y='Attrition', data = train_data , palette = 'Set2')
'''
def resetYearsAtCompany(s):
    if s==40 :
        return 0
    elif s==23 | (s>=31 & s<33):
        return 1
    elif s==12 | s==15 | s==16 | (s>=21 & s<23)| (s>=24 & s<31) | (s>=34 & s<38) | s>40 :
        return 2
    else:
        return 3

#重定义数据
train_data['Age'] = train_data['Age'].apply(resetAge)
train_data['PercentSalaryHike'] = train_data['PercentSalaryHike'].apply(resetPerHike)
train_data['MonthlyIncome'] = train_data['MonthlyIncome'].apply(resetSalary)
train_data['NumCompaniesWorkedPerYear'] = train_data['NumCompaniesWorkedPerYear'].apply(resetWorkingCompanyNum)
train_data['YearsAtCompany'] = train_data['YearsAtCompany'].apply(resetYearsAtCompany)
test_data['Age'] = test_data['Age'].apply(resetAge)
test_data['PercentSalaryHike'] = test_data['PercentSalaryHike'].apply(resetPerHike)
test_data['MonthlyIncome'] = test_data['MonthlyIncome'].apply(resetSalary)
test_data['NumCompaniesWorkedPerYear'] = test_data['NumCompaniesWorkedPerYear'].apply(resetWorkingCompanyNum)
test_data['YearsAtCompany'] = test_data['YearsAtCompany'].apply(resetYearsAtCompany)

#将离散数据进行one-hot编码(训练集)
cata_result = pd.DataFrame()
for i in train_data.columns:
    if train_data[i].dtype == 'O':
        cata = pd.DataFrame()
        cata = pd.get_dummies(train_data[i],prefix=i)
        cata_result = pd.concat([cata_result,cata],axis=1)

for i in train_data.columns:
    if train_data[i].dtype == 'O':
        train_data = train_data.drop(i,axis=1)

#将离散数据进行one-hot编码(测试集)
cata_result_test = pd.DataFrame()
for i in test_data.columns:
    if test_data[i].dtype == 'O':
        cata_test = pd.DataFrame()
        cata_test = pd.get_dummies(test_data[i],prefix=i)
        cata_result_test = pd.concat([cata_result_test,cata_test],axis=1)

for i in test_data.columns:
    if test_data[i].dtype == 'O':
        test_data = test_data.drop(i,axis=1)
        
train_data = pd.concat([train_data,cata_result],axis=1)
train_features = train_data.drop(columns=['Attrition'])
train_labels = train_data['Attrition']
test_data = pd.concat([test_data,cata_result_test],axis=1)

'''
#分割训练集与测试集
X_train, X_test, y_train, y_test = train_test_split(train_features.astype(np.float64),
    train_labels.astype(np.float64), train_size=0.8, test_size=0.2,random_state=42)

#利用TPOT选择模型
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
#tpot.export('tpot_Attribute_pipeline_1.py')
'''
#利用模型进行预测
model = GradientBoostingClassifier(learning_rate=1.0, max_depth=1, max_features=0.7500000000000001, min_samples_leaf=16, min_samples_split=6, n_estimators=100, subsample=1.0)

model.fit(train_features, train_labels)
pred_labels_test = model.predict_proba(test_data)
#print(pred_labels_test)

predict_y = []
for i in range(len(pred_labels_test)):
    predict_y.append(pred_labels_test[i][1]/(pred_labels_test[i][0]+pred_labels_test[i][1]))

pred_test = test_data_1['user_id'].tolist()
pred_result = pd.DataFrame({'user_id':pred_test,'Attrition':predict_y})
print(pred_result)

#pred_result.to_csv('./result_3.csv')

     user_id  Attrition
0        442   0.048477
1       1091   0.013149
2        981   0.523220
3        785   0.028260
4       1332   0.916846
..       ...        ...
289     1439   0.018231
290      481   0.010532
291      124   0.619353
292      198   0.021148
293     1229   0.040336

[294 rows x 2 columns]
