In [17]:
from __future__ import print_function 
import sys
import os
import math
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import make_classification 
%matplotlib inline

def shuffle_data(X, y, seed=None): 
    if seed:
        np.random.seed(seed)
        
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    print(X[idx])
    print(y[idx])
    return X[idx], y[idx]

# 正规化数据集 X
def normalize(X, axis=-1, p=2):
    lp_norm = np.atleast_1d(np.linalg.norm(X, p, axis)) 
    lp_norm[lp_norm == 0] = 1
    return X / np.expand_dims(lp_norm, axis)

# 标准化数据集 X
def standardize(X):
    X_std = np.zeros(X.shape) 
    mean = X.mean(axis=0) 
    std = X.std(axis=0)
    # 分母不能等于 0 的情形
    # X_std = (X - X.mean(axis=0)) / X.std(axis=0) for col in range(np.shape(X)[1]):
    if std[col]:
        X_std[:, col] = (X_std[:, col] - mean[col]) / std[col]
    return X_std

# 划分数据集为训练集和测试集
def train_test_split(X, y, test_size=0.2, shuffle=True, seed=None):
    if shuffle:
        X, y = shuffle_data(X, y, seed)
        n_train_samples = int(X.shape[0] * (1-test_size))
        x_train, x_test = X[:n_train_samples], X[n_train_samples:] 
        y_train, y_test = y[:n_train_samples], y[n_train_samples:]
        
    return x_train, x_test, y_train, y_test

def accuracy(y, y_pred):
    y = y.reshape(y.shape[0], -1)
    y_pred = y_pred.reshape(y_pred.shape[0], -1) 
    return np.sum(y == y_pred)/len(y)

class KNN():
    """
    K 近邻分类算法. Parameters: 
    -----------
    k: int 最近邻个数.
    """

    def __init__(self, k=5): 
        self.k = k

    # 计算一个样本与训练集中所有样本的欧氏距离的平方
    def euclidean_distance(self, one_sample, X_train):
        one_sample = one_sample.reshape(1, -1)
        X_train = X_train.reshape(X_train.shape[0], -1)
        distances = np.power(np.tile(one_sample, (X_train.shape[0], 1)) - X_train,2).sum(axis=1)

        return distances

    # 获取 k 个近邻的类别标签
    def get_k_neighbor_labels(self, distances, y_train, k):
        k_neighbor_labels = []

        for distance in np.sort(distances)[:k]:
            label = y_train[distances==distance] 
            k_neighbor_labels.append(label)

            return np.array(k_neighbor_labels).reshape(-1, )

    # 进行标签统计，得票最多的标签就是该测试样本的预测标签 
    def vote(self, one_sample, X_train, y_train, k):
        distances = self.euclidean_distance(one_sample, X_train) 
        #print(distances.shape)
        y_train = y_train.reshape(y_train.shape[0], 1)
        k_neighbor_labels = self.get_k_neighbor_labels(distances, y_train, k) 
        #print(k_neighbor_labels.shape)
        find_label, find_count = 0, 0
        for label, count in Counter(k_neighbor_labels).items():
            if count > find_count: 
                find_count = count 
                find_label = label
        return find_label

    # 对测试集进行预测
    def predict(self, X_test, X_train, y_train):
        y_pred = []

        for sample in X_test:
            label = self.vote(sample, X_train, y_train, self.k) 
            y_pred.append(label)
            #print(y_pred)
        return np.array(y_pred)

def main():
    # 200条数据，每条数据四个特征，共两类
    data = make_classification(n_samples=200, n_features=4, n_informative=2,n_redundant=2, n_repeated=0, n_classes=2)
    X, y = data[0], data[1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
    clf = KNN(k=5)
    y_pred = clf.predict(X_test, X_train, y_train)
    
    accu = accuracy(y_test, y_pred) 
    print ("Accuracy:", accu)


if __name__ == "__main__": 
    main()

[[ 4.13460256e-01 -7.18420911e-02  1.24354663e-01  2.05588055e-01]
 [-5.12300155e-01  9.74331022e-02 -2.68193261e-01 -3.21440744e-01]
 [-1.02734239e+00  2.26363796e-01 -9.57782296e-01 -8.90097887e-01]
 [ 2.34236430e+00 -2.86460945e-01 -9.29779703e-01  2.09360555e-01]
 [ 7.03604314e-01  4.95919166e-02 -2.11823271e+00 -1.01210181e+00]
 [-2.43768763e+00  3.38239471e-01  4.23675693e-01 -5.35852197e-01]
 [-3.13609019e-01  1.68482597e-01 -1.63975613e+00 -1.05935114e+00]
 [ 4.49990335e-01  6.01719413e-02 -1.74050250e+00 -8.72808763e-01]
 [ 5.67156518e-01 -5.81849841e-02 -3.76643950e-01 -3.78794109e-02]
 [ 1.20823228e-01 -6.96555261e-02  6.96071020e-01  4.45736691e-01]
 [ 7.74416774e-01 -4.17980332e-02 -1.02472510e+00 -3.50110311e-01]
 [ 1.56001926e+00 -9.90871451e-02 -1.86241520e+00 -5.87290397e-01]
 [ 1.56056770e+00 -3.66802922e-01  1.76603193e+00  1.53396477e+00]
 [ 1.03789282e+00 -1.51653908e-01 -7.67827516e-02  2.88714011e-01]
 [ 1.40786595e+00 -2.73718687e-01  8.17834127e-01  9.30596122e

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#读入数据，没有列名默认为None
raw_data = pd.read_csv('/Users/dengyangjie/Desktop/ML/Lab/Lab02/adult.data', header=None)  #导入数据集
print(raw_data.shape)
data = raw_data.values
print(data)
X = data[:,:-1]
y = data[:,-1]
print(X)
print(y)
# Show the head of the table
raw_data.head(100)

(32561, 15)
[[39 ' State-gov' 77516 ... 40 ' United-States' ' <=50K']
 [50 ' Self-emp-not-inc' 83311 ... 13 ' United-States' ' <=50K']
 [38 ' Private' 215646 ... 40 ' United-States' ' <=50K']
 ...
 [58 ' Private' 151910 ... 40 ' United-States' ' <=50K']
 [22 ' Private' 201490 ... 20 ' United-States' ' <=50K']
 [52 ' Self-emp-inc' 287927 ... 40 ' United-States' ' >50K']]
[[39 ' State-gov' 77516 ... 0 40 ' United-States']
 [50 ' Self-emp-not-inc' 83311 ... 0 13 ' United-States']
 [38 ' Private' 215646 ... 0 40 ' United-States']
 ...
 [58 ' Private' 151910 ... 0 40 ' United-States']
 [22 ' Private' 201490 ... 0 20 ' United-States']
 [52 ' Self-emp-inc' 287927 ... 0 40 ' United-States']]
[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [41]:
def shuffle_data(X, y, seed=None): 
    if seed:
        np.random.seed(seed)
        
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    return X[idx], y[idx]

# 划分数据集为训练集和测试集
def train_test_split(X, y, test_size=0.2, shuffle=True, seed=None):
    if shuffle:
        X, y = shuffle_data(X, y, seed)
        n_train_samples = int(X.shape[0] * (1-test_size))
        x_train, x_test = X[:n_train_samples], X[n_train_samples:] 
        y_train, y_test = y[:n_train_samples], y[n_train_samples:]
    return x_train, x_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
X_train.shape[0],X_test.shape[0]

(21815, 10746)

In [44]:
import pandas as pd
import numpy as np
import random
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class Perceptron(object):
    """    
    eta:float    
    学习效率，处于0和1之间        
    n_iter:int    对训练数据进行学习改进次数        
    w_:一维向量    
    存储权重数值        
    error_:    存储每次迭代改进时，网络对数据进行错误判断的次数    
    """            
    def __init__(self,eta=0.01,n_iter=50):        
        self.eta = eta        
        self.n_iter = n_iter            
    def net_input(self, X):        
        return np.dot(X, self.w_[1:]) + self.w_[0]    
    def activation(self, X):        
        return self.net_input(X)       
    def predict(self, X):        
        return np.where(self.activation(X) >=0, 1, -1)        
    def fit(self, X, y):        
        """        
        X:二维数组[n_sampls, n_features]        
        n_samples 表示X中含有训练数据条目数        
        n_faetures 含有4个数据的一维向量，用于表示一条训练条目                
        y:一维向量        
        用于存储每一训练条目对应的正确分类        
        """     

        self.w_ = np.zeros(1 + X.shape[1])        
        self.cost_ = []                
        for i in range(self.n_iter):            
            output = self.net_input(X)            
            errors = (y-output)            
            self.w_[1:] += self.eta * X.T.dot(errors)            
            self.w_[0] += self.eta * errors.sum()            
            cost = (errors ** 2).sum() /2.0            
            self.cost_.append(cost)        
        return self

def shuffle_data(X, y, seed=None): 
    if seed:
        np.random.seed(seed)
        
    idx = np.arange(X.shape[0])
    np.random.shuffle(idx)
    print(X[idx])
    print(y[idx])
    return X[idx], y[idx]

# 划分数据集为训练集和测试集
def train_test_split(X, y, test_size=0.2, shuffle=True, seed=None):
    if shuffle:
        X, y = shuffle_data(X, y, seed)
        n_train_samples = int(X.shape[0] * (1-test_size))
        x_train, x_test = X[:n_train_samples], X[n_train_samples:] 
        y_train, y_test = y[:n_train_samples], y[n_train_samples:]
    return x_train, x_test, y_train, y_test
    
if __name__ == '__main__':

    print('Start read data')

    raw_data = pd.read_csv('/Users/dengyangjie/Desktop/ML/Lab/Lab02/adult.data', header=None)  #导入数据集
    data = raw_data.values
    X = data[:,:-1]
    y = data[:,-1]
    for i in range(y.shape[0]):
        if y[i] == '>50K':
            y[i] = 1
        else:
            y[i] = 0
            

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
    
    percep = Perceptron(eta=0.0001, n_iter=50)
    percep.fit(X_train,y_train)
    
    predict = percep.predict(X_test,y_test)
    print(predict)

    #print("The accruacy socre is ", score)

Start read data
[[44 ' Local-gov' 145522 ... 0 40 ' United-States']
 [53 ' Private' 142411 ... 0 40 ' United-States']
 [28 ' Private' 142712 ... 0 40 ' United-States']
 ...
 [31 ' Private' 259705 ... 0 40 ' United-States']
 [43 ' Private' 206878 ... 0 60 ' United-States']
 [51 ' Private' 145409 ... 0 40 ' United-States']]
[0 0 0 ... 0 0 0]


TypeError: can't multiply sequence by non-int of type 'float'

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

def standardize(X):
    X_std = np.zeros(X.shape)
    mean = np.mean(X, axis = 0)
    # X.mean(axis = 0)
    std = np.std(X, axis = 0)
    X_std = (X - mean)/std

    return X_std


# data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names=["age", "type_employer", "fnlwgt", "education",  "education_num",
#                               "marital", "occupation", "relationship", "race","sex", "capital_gain",
#                               "capital_loss", "hr_per_week","country", "income"])
#
# data.to_csv('adult.data', index=False)

data = pd.read_csv('adult.data')
# data = data.reindex(columns=["age", "type_employer", "fnlwgt", "education",  "education_num",
#                               "marital", "occupation", "relationship", "race","sex", "capital_gain",
#                               "capital_loss", "hr_per_week","country", "income"])
# data["education_num"]=None
# data["fnlwgt"]=None

print(data.columns)
# print(data.info())
print(data.head())
# print(data.describe())
# print(len(data[data[]]))

data.drop(['education', 'fnlwgt', 'race', 'capital_gain', 'capital_loss', 'country'], axis=1, inplace=True)

print(data.head())
print(data['income'].value_counts())
# <= 50K 24720
# > 50K 7841

# age_notnull = data[data["age"].notnull()].as_matrix()
# age_null = data[data["age"].isnull()].as_matrix()

# print(data["age"].isnull().value_counts())
print(data["type_employer"].value_counts()) #?
print(data["education_num"].value_counts())
print(data["marital"].value_counts())
print(data["occupation"].value_counts())# ?
print(data["relationship"].value_counts())
# print(data["race"].value_counts())
print(data["sex"].value_counts())
print(data["hr_per_week"].value_counts())
# print(data["country"].value_counts()) # ?

# print(data[data["type_employer"] == ' ?'].index.tolist())


# 删除缺失值记录
data.drop(data[data["type_employer"] == ' ?'].index.tolist(),axis=0, inplace=True)
data.drop(data[data["occupation"] == ' ?'].index.tolist(),axis=0, inplace=True)
print(data["type_employer"].value_counts())


print(data['income'].value_counts())
# 将离散值（类型值）改为数值

dummies_type_employer= pd.get_dummies(data["type_employer"], prefix = "type_employer")
dummies_marital = pd.get_dummies(data["marital"], prefix = "marital")
dummies_occupation = pd.get_dummies(data["occupation"], prefix = "occupation")
dummies_relationship = pd.get_dummies(data["relationship"], prefix = "relationship")
# dummies_race = pd.get_dummies(data["race"], prefix = "race")
# dummies_country = pd.get_dummies(data["country"], prefix = "country")
dummies_sex = pd.get_dummies(data["sex"], prefix = "sex")
print(dummies_sex, dummies_sex.shape)

data = pd.concat([data, dummies_type_employer, dummies_marital, dummies_occupation, dummies_relationship, dummies_sex], axis=1)
print(data.head())
data.drop(["type_employer", "marital", "occupation", "relationship", "sex"], axis=1, inplace=True)

print(data.head())

# 标准化

age_scale = standardize(np.array(data["age"]))
education_num_scale = standardize(np.array(data["education_num"]))
hr_per_week_scale = standardize(np.array(data["hr_per_week"]))
# print(data['age'].shape, age_scale.shape)

data["age_scale"] = age_scale
data["education_num_scale"] = education_num_scale
data["hr_per_week_scale"] = hr_per_week_scale


# data = pd.concat([data, pd.Series(age_scale), pd.Series(education_num_scale), pd.Series(hr_per_week_scale)], axis=1)
data.drop(["age", "education_num", "hr_per_week"], axis=1, inplace=True)

print(data.head())

data.ix[data['income'] == ' <=50K', 'income'] = -1
data.ix[data['income'] == ' >50K', 'income'] = 1

# print(data['age_scale'].shape, age_scale.shape)
# print(data['sex_ Female'].shape, dummies_sex.shape)


data.to_csv('adult_scale.data', index=False)