In [15]:
import pymysql 
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import jaccard_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from metric_learn import LMNN
from sklearn.metrics import roc_auc_score
from sklearn.base import clone


In [16]:
try:
    mydb=pymysql.connect(host='localhost',user='root',passwd='123456',port=3306,database='employmentprediction')
    cursor=mydb.cursor()
    print('初始化成功！连接成功！')
except:
    print('连接出错')
    exit(0)

初始化成功！连接成功！


In [69]:
class MetaCost(object):

    """A procedure for making error-based classifiers cost-sensitive

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegression
    >>> import pandas as pd
    >>> import numpy as np
    >>> S = pd.DataFrame(load_iris().data)
    >>> S['target'] = load_iris().target
    >>> LR = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    >>> C = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0]])
    >>> model = MetaCost(S, LR, C).fit('target', 3)
    >>> model.predict_proba(load_iris().data[[2]])
    >>> model.score(S[[0, 1, 2, 3]].values, S['target'])

    .. note:: The form of the cost matrix C must be as follows:
    +---------------+----------+----------+----------+
    |  actual class |          |          |          |
    +               |          |          |          |
    |   +           | y(x)=j_1 | y(x)=j_2 | y(x)=j_3 |
    |       +       |          |          |          |
    |           +   |          |          |          |
    |predicted class|          |          |          |
    +---------------+----------+----------+----------+
    |   h(x)=j_1    |    0     |    a     |     b    |
    |   h(x)=j_2    |    c     |    0     |     d    |
    |   h(x)=j_3    |    e     |    f     |     0    |
    +---------------+----------+----------+----------+
    | C = np.array([[0, a, b],[c, 0 , d],[e, f, 0]]) |
    +------------------------------------------------+
    """
    def __init__(self, S, L, C, m=50, n=1, p=True, q=True):
        """
        :param S: The training set
        :param L: A classification learning algorithm
        :param C: A cost matrix
        :param q: Is True iff all resamples are to be used  for each examples
        :param m: The number of resamples to generate
        :param n: The number of examples in each resample
        :param p: Is True iff L produces class probabilities
        """
        if not isinstance(S, pd.DataFrame):
            raise ValueError('S must be a DataFrame object')
        new_index = list(range(len(S)))
        S.index = new_index
        self.S = S
        self.L = L
        self.C = C
        self.m = m
        self.n = len(S) * n
        self.p = p
        self.q = q

    def fit(self, flag, num_class):
        """
        :param flag: The name of classification labels
        :param num_class: The number of classes
        :return: Classifier
        """
        col = [col for col in self.S.columns if col != flag]
        S_ = {}
        M = []

        for i in range(self.m):
            # Let S_[i] be a resample of S with self.n examples
            S_[i] = self.S.sample(n=self.n, replace=True)

            X = S_[i][col].values
            y = S_[i][flag].values

            # Let M[i] = model produced by applying L to S_[i]
            model = clone(self.L)
            M.append(model.fit(X, y))

        label = []
        S_array = self.S[col].values
        for i in range(len(self.S)):
            if not self.q:
                k_th = [k for k, v in S_.items() if i not in v.index]
                M_ = list(np.array(M)[k_th])
            else:
                M_ = M

            if self.p:
                P_j = [model.predict_proba(S_array[[i]]) for model in M_]
            else:
                P_j = []
                vector = [0] * num_class
                for model in M_:
                    vector[model.predict(S_array[[i]])] = 1
                    P_j.append(vector)

            # Calculate P(j|x)
            P = np.array(np.mean(P_j, 0)).T

            # Relabel
            label.append(np.argmin(self.C.dot(P)))

        # Model produced by applying L to S with relabeled y
        X_train = self.S[col].values
        y_train = np.array(label)
        model_new = clone(self.L)
        model_new.fit(X_train, y_train)

        return model_new

In [75]:
class imclassifier:
    '在不平衡数据集中进行分类预测'
    def __init__(self,data_original,feature_name,label_name,ir_threshold=4,dsc=True,dse=True,dfe=True,csl=True,el=True):
        'data_original存储原始数据'
        self.data_original=data_original
        
        'feature存储data_original中的特征列名'
        self.feature_name=feature_name
        
        'label存储data_original中的标签列名'
        self.label_name=label_name
        
        '''
        标签占比阈值,
        如果标签的占比大于阈值,
        这个标签才会被选入筛选器
        '''
        # self.target_threshold=target_threshold
        
        'IR阈值'
        self.ir_threshold=ir_threshold
        
        '变换同义的标签'
        # self.data_original[label].replace(['其他暂不就业','不就业拟升学'],value='待就业',inplace=True)

#         '对原始数据的标签进行统计'
#         self.target_counts=self.data_original[self.label_name].value_counts(normalize=True)
        
#         'target_selected是index属性的,用此来筛选数据'
#         self.target_selected=self.target_counts[self.target_counts>=self.target_threshold].index
        
#         'data_selected用于存储筛选后的数据'
#         self.data_selected=self.data_original[self.data_original[self.label_name].isin(self.target_selected)]
        
        'feature用于存储所有的特征' 
        self.feature=self.data_original.loc[:,self.feature_name]
        'target用于存储标签'
        self.target=self.data_original.loc[:,self.label_name]

        '为标签列编码'
        self.target_o_encoder=OrdinalEncoder()
        self.target_ordinal=pd.DataFrame(self.target_o_encoder.fit_transform(self.target.to_frame()),columns=[self.label_name])
        
        '编码器编码的标签名,target_name为数组属性'
        self.target_name=self.target_o_encoder.categories_[0]

        '经由编码器编码后的总标签数'
        self.target_types=len(self.target_name)
        
        '''
        DSC:Data Subset Construction
        data_subset用于存储子数据集
        data_subset_ir用于存储子数据集imbalance ratio
        '''
        self.data_subset=[]
        self.data_subset_ir=[]
        self.data_subset_value_counts=[]
        if dsc:
            self.dsc()
        else:
            self.nodsc()
        'DSE:Data Subset Extension'
        self.data_subset_extension=[]
        if dse:
            self.dse()
        else:
            self.nodse()
        'DFE:Data Feature Emphasis'
        self.data_subset_extension_emphasis=[]
        if dfe:
            self.dfe()
        # else:
        #     self.nodfe()
        
        'CSL:Cost-Sensitive Learning'
        self.data_subset_metacost=[]
        if csl:
            self.csl()
        
    'DSC:Data Subset Construction'
    def dsc(self):
        for index in range(self.target_types):
            temp_df=pd.concat([self.feature,self.target_ordinal],axis='columns',join='inner')
            temp_df[self.label_name]=np.where(temp_df[self.label_name]==index,1,0)

            self.data_subset_value_counts.append(temp_df[self.label_name].value_counts())

            self.data_subset.append(temp_df)
            self.data_subset_ir.append(self.data_subset_value_counts[index][0]/self.data_subset_value_counts[index][1])
    
    'NODSC:No Data Subset Construction'
    def nodsc(self):
        temp_df=pd.concat([self.feature,self.target_ordinal],axis='columns',join='inner')
        temp_value_counts=temp_df[self.label_name].value_counts()
        self.data_subset.append(temp_df)
        self.data_subset_ir.append(temp_value_counts[0]/temp_value_counts[1])

    'DSE:Data Subset Extension'
    def dse(self):
        for index in range(len(self.data_subset)):
            temp_data_subset_extension=[]
            if self.data_subset_ir[index]>=self.ir_threshold:
                data_subset_extension_minor_size=self.data_subset_value_counts[1]
                data_subset_extension_major_minor_ratio=round(self.data_subset_value_counts[index][0]/self.data_subset_value_counts[index][1])
                print(data_subset_extension_major_minor_ratio)
                
                minor_data=self.data_subset[index][self.data_subset[index][self.label_name]==1]
                major_data=self.data_subset[index][self.data_subset[index][self.label_name]==0]
                
                major_data_shuffled=shuffle(major_data)
                
                '进行划分,若数据不够进行一次划分则全部选用'
                major_chunks=np.array_split(major_data_shuffled, data_subset_extension_major_minor_ratio)
                for major_chunk in major_chunks:
                    temp_data_subset_extension_base=pd.concat([minor_data, major_chunk])
                    temp_data_subset_extension.append(temp_data_subset_extension_base)
            else:
                temp_data_subset_extension.append(self.data_subset[index])
                
            self.data_subset_extension.append(temp_data_subset_extension)
    
    'NODSE:No Data Subset Extension'
    def nodse(self):
        for index in range(len(self.data_subset)):
            temp_data_subset_extension=[]
            temp_data_subset_extension.append(self.data_subset[index])
            self.data_subset_extension.append(temp_data_subset_extension)
            
    'DFE:Data Feature Emphasis'
    def dfe(self):
        for index in range(len(self.data_subset_extension)):
            temp_subset_lmnn=LMNN(n_neighbors=5,learn_rate=1e-6,random_state=42)
            temp_subset_lmnn.fit(self.data_subset[index].loc[:,self.feature_name],self.data_subset[index].loc[:,self.label_name])
            
            temp_data_subset_extension_emphasis=[]
            
            for index_1 in range(len(self.data_subset_extension[index])):
                temp_data=self.data_subset_extension[index][index_1]
                temp_x=temp_data.loc[:,self.feature_name]
                temp_y=temp_data.loc[:,self.label_name]

                temp_x_lmnn=temp_subset_lmnn.transform(temp_x)
        # self.target_ordinal=pd.DataFrame(self.target_o_encoder.fit_transform(self.target.to_frame()),columns=[self.label_name])                
                temp_data_subset_extension_emphasis.append(pd.concat([pd.DataFrame(temp_x_lmnn,columns=[self.feature_name]),temp_y],axis='columns',join='inner'))
            
            self.data_subset_extension_emphasis.append(temp_data_subset_extension_emphasis)
                                 
    def nodfe(self):
        self.data_subset_extension_emphasis=self.data_subset_extension
                                 
    'CSL:Cost-Sensitive Learning'
    def csl(self):
        for index in range(len(self.data_subset_extension)):
            temp_subset_metacost=[]
            for index_1 in range(len(self.data_subset_extension[index])):
                temp_knn=KNeighborsClassifier(n_neighbors=5)
                a=1
                b=1
                temp_cost=np.array([[0, a],[b, 0]])
                temp_csl=MetaCost(self.data_subset_extension_emphasis[index][index_1],temp_knn,temp_cost).fit(self.label_name,len(self.label_name))
                temp_subset_metacost.append(temp_csl)
                
            self.subset_matacost.append(temp_subset_metacost)
            
    'EL:Ensemble Learning'
    # def el(self):
        

In [71]:
def imsplit(data_original,label_name,target_threshold=0.03):
    '对原始数据的标签进行统计'
    target_counts=data_original[label_name].value_counts(normalize=True)

    'target_selected是index属性的,用此来筛选数据'
    target_selected=target_counts[target_counts>=target_threshold].index

    'data_selected用于存储筛选后的数据'
    data_selected=data_original[data_original[label_name].isin(target_selected)]

    return data_selected

In [72]:
sql='select * from trainingdata'
data_ori=pd.read_sql(sql,mydb)

  data_ori=pd.read_sql(sql,mydb)


In [73]:
# type(data_ori)
data_ori=imsplit(data_ori,label_name='y1',target_threshold=0.03)
data_train,data_test=train_test_split(data_ori,test_size=0.3,random_state=42)

In [74]:
imc=imclassifier(data_ori,feature_name=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','x11','x12'],label_name='y1',dsc=True,dse=True)

23
11
8
7


NameError: name 'label_name' is not defined

In [56]:
imc.subset_matacost

AttributeError: 'imclassifier' object has no attribute 'subset_matacost'

In [69]:
# len(imc.data_subset_extension)
sum=0
for index in range(len(imc.data_subset_extension)):
    print("no:",index,"=",len(imc.data_subset_extension[index]))
    sum+=len(imc.data_subset_extension[index])

no: 0 = 23
no: 1 = 1
no: 2 = 1
no: 3 = 11
no: 4 = 8
no: 5 = 7


In [67]:
sum

51

In [20]:
# imc.data_subset[0]['y1'].value_counts()

for i in range(len(imc.data_subset_ir)):
    print(imc.target_name[0][i],"\t",imc.data_subset_ir[i],"\t",imc.target_counts[imc.target_name[0][i]])

三资企业 	 23.46875 	 0.03841536614645858
不就业拟升学 	 77.3 	 0.012004801920768308
其他企业 	 1.7864768683274022 	 0.34813925570228094
其他暂不就业 	 70.18181818181819 	 0.014405762304921969
升学 	 3.078125 	 0.23649459783913565
国有企业 	 11.836065573770492 	 0.07683073229291716
待就业 	 8.433734939759036 	 0.10084033613445378
机关 	 86.0 	 0.010804321728691477
科研助理 	 77.3 	 0.012004801920768308
自由职业 	 7.329787234042553 	 0.12004801920768307


In [21]:
for i in range(imc.target_types):
    print("index=",i)
    # print("index=",i,imc.data_subset[i])

index= 0
index= 1
index= 2
index= 3
index= 4
index= 5
index= 6
index= 7
index= 8
index= 9


In [22]:
cursor.close()
mydb.close()