In [1]:
# -*- coding:utf-8 -*- #

import pandas as pd

### 6.6.1  数值型特征

In [2]:
dic = {'height': [1.6, 1.7, 1.8],
      'weight': [60, 70, 90]}
data = pd.DataFrame(dic)
data['bmi'] = data['weight'] / (data['height'] **2)
print(data)
data['overweight'] = data['bmi'] > 25
print(data)
data['overweight'] = data['overweight'].map({True:'Yes', False:'No'})
print(data)

   height  weight        bmi
0     1.6      60  23.437500
1     1.7      70  24.221453
2     1.8      90  27.777778
   height  weight        bmi  overweight
0     1.6      60  23.437500       False
1     1.7      70  24.221453       False
2     1.8      90  27.777778        True
   height  weight        bmi overweight
0     1.6      60  23.437500         No
1     1.7      70  24.221453         No
2     1.8      90  27.777778        Yes


### 6.6.2  类型特征

In [3]:
import pandas as pd
dic = {'string': ['第一组', '第二组', '第二组']}
data = pd.DataFrame(dic)
print(pd.factorize(data.string)) # 转换成数值型编码

data['num'] = pd.factorize(data['string'])[0]
df = pd.get_dummies(data['string'], prefix='组别')  # 转换成onehot类型编码
new_data = pd.concat([data, df], axis=1)
print(new_data)

(array([0, 1, 1]), Index(['第一组', '第二组'], dtype='object'))
  string  num  组别_第一组  组别_第二组
0    第一组    0       1       0
1    第二组    1       0       1
2    第二组    1       0       1


### 6.6.3  关键字特征

In [4]:
import pandas as pd
import numpy as np
from scipy import stats
import jieba
import re

def do_split(test_text):
    pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|？|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】03   |·|！| |…|（|）' 
    return re.split(pattern, test_text) 



In [5]:
def get_keywords(data, feat):
    ret = []
    data[feat] = data[feat].apply(lambda x: x.strip())
    for i in data[feat].unique():
        # 将短句作为关键字
        if len(i) <= 50 and i not in ret:
            ret.append(i)
        # 将子句作为关键字
        for sentence in do_split(i):
            if len(sentence) <= 50 and sentence not in ret:
                ret.append(sentence)
        # 将词作为关键字
        for word in jieba.lcut(i, cut_all=True):
            if len(word) > 1 and word not in ret:
                ret.append(word)
    return ret

def check_freq(data, feat, keywords, limit):
    ret = []
    for key in keywords:
        try:
            if len(data[data[feat].str.contains(key)]) > limit:
                ret.append(key)
        except:
            pass
    return ret

def do_test(data, feat, key, y, debug=False):
    arr1 = data[data[feat].str.contains(key) == True][y]
    arr2 = data[data[feat].str.contains(key) == False][y]
    ret1 = stats.ttest_ind(arr1, arr2, equal_var = False)
    ret2 = stats.levene(arr1, arr2)
    if ret1.pvalue < 0.05 or ret2.pvalue < 0.05:
        return True    
    return False

def check(data, feat, y):
    ret = []
    keywords = get_keywords(data, feat)
    arr = check_freq(data, feat, keywords, 5)
    for word in arr:
        if do_test(data, feat, word, y):
            ret.append(word)
    return ret

# 读取数据文件的前500条数据，其中第6个字段是微博内容，第5个字段为点赞次数。
data = pd.read_csv('weibo_train_data.txt', sep='\t', 
                   header=None, nrows=500)
print(check(data, 6, 5))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.465 seconds.
Prefix dict has been built succesfully.
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


['挣钱是一种能力', '抢红包拼的是技术', '一起发出的现金红包', '￣', '3￣', '╭http', 't', 'cn', '红包', '挣钱', '能力', '抢红包', '抢到', '一起', '发出', '现金', '幸福', '粉丝', 'http', '这些', '喜欢', '分享自', '羊年未到', '好运先来', '你也来试试手气', '就是', '好运', '先来', '来试', '试试', '试手', '手气', '东西', '软件', '下载', '3', 'IT', '#IT#', '新闻', '支付宝钱包', '羊年好运就此开启', '你也来试试手气吧', '╮', '支付', '支付宝', '钱包', '就此', '开启', '来自', '网易云音乐', '网易', '音乐', '已经', '一个', '16', '发表', '需要', '他们', '工程', '工程师', '开发', '不是', '代码', '理解', '进行', '创业', '快的打车', '打车', '感觉', '今天', '评论', '查看', '成为', '活动', '幸运', '一些', '我正在', '为', '正在', '支持', '小时', 'in', '手机', '自己', '2015', '信息', '我抢到了', '发出的现金红包', '阅读', '使用', '单词', '管理', '设计', '30', '项目', '15', '工具', '免费', '提供', '快来', '未来', 'on', '来', '博客', '赞', '还有', '向', 'and', '简单', '环境', 'from', '获得', 'for', '详情', 'at', 'Y', '小伙伴们', '——', '快来试试手气', '23', '形', '51', '帮助', '01', '的现金红包', '/', '我在', 'T', '「', '转', '」', 'X', 'be']
