In [101]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import OneHotEncoder
import copy


def format_data(df):
    """Clean the text columns of a dress-listing DataFrame in place.

    Every text column (object dtype or a pandas string dtype) has its
    missing values replaced by the literal ``"missing"`` and surrounding
    whitespace stripped from its string values. Afterwards ``位置``
    (location) keeps only the part before the first space -- the province,
    per the original author's note -- and ``面料`` (fabric) keeps only the
    part before the first comma.

    Args:
        df: DataFrame that contains at least the text columns ``位置`` and
            ``面料``. It is modified in place.

    Returns:
        The same ``df`` object, after cleaning.

    Raises:
        KeyError: If ``位置`` or ``面料`` is not a column of ``df``.
    """
    for column in df.columns:
        # Test the dtype itself instead of comparing it with "object", so
        # that columns stored with a dedicated pandas string dtype are
        # cleaned too.
        if not pd.api.types.is_string_dtype(df[column].dtype):
            continue
        filled = df[column].fillna("missing")
        # An object column may mix text with other values (e.g. numbers);
        # leave those untouched rather than failing on ``.strip()``.
        df[column] = filled.apply(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Reduce multi-part values to their first component.
    df["位置"] = df["位置"].apply(lambda value: value.split(" ")[0])
    df["面料"] = df["面料"].apply(lambda value: value.split(",")[0])

    return df

# Load the raw training listings.
df = pd.read_csv("D:/dress.csv")

# Columns with more than 100 missing values are considered too sparse to use.
null_counts = df.isnull().sum()
sparse_columns = null_counts[null_counts > 100].index.tolist()

# Features judged unimportant: item number, year/season, brand, sales volume.
unimportant_columns = ["货号", "年份季节", "品牌", "销量"]

# Drop both groups in a single call. An unimportant column that is also
# sparse is listed only once, so it is not deleted twice (which used to
# raise KeyError); a column that is absent from the file still raises.
columns_to_drop = sparse_columns + [
    name for name in unimportant_columns if name not in sparse_columns
]
df = df.drop(columns_to_drop, axis=1)

# Fill missing text values, strip whitespace, and simplify 位置 / 面料.
df = format_data(df)

# Turn the categorical columns into integer codes. For every object-dtype
# column from the fifth column onward, the distinct values returned by
# np.unique are numbered 0, 1, 2, ... in that order, and the label -> code
# table is kept in class_mapping under the column name.
class_mapping = {}
for column in df.columns[4:]:
    if df[column].dtype != "object":
        continue
    labels = np.unique(df[column])
    class_mapping[column] = dict(zip(labels, range(len(labels))))
    df[column] = df[column].map(class_mapping[column])


# Build the target by row position: the first half of the rows (the first
# five pages of search results, per the original author's note) are hot
# sellers (1); the remaining rows are not (0).
half = int(df.shape[0] / 2)
ones = np.ones(half, dtype=int)
zeros = np.zeros(df.shape[0] - half, dtype=int)
df["爆款"] = np.concatenate((ones, zeros))

# Split the frame into features X (every column from the fourth up to, but
# not including, the last) and target y (the last column, "爆款").
# .iloc is used for this purely positional selection: the older .ix indexer
# was deprecated in pandas 0.20 and removed in pandas 1.0.
X, y = df.iloc[:, 3:-1].values, df.iloc[:, -1].values

# Column 0 of X is excluded from one-hot encoding; columns 1.. are the
# integer-coded categorical features.
labelList = range(1, X.shape[1])

# One-hot encode the categorical columns. The fitted encoder is kept in
# oheFit so the same transformation can be applied to new data later.
# NOTE(review): `categorical_features` and `sparse` belong to the older
# scikit-learn API this file imports; newer releases removed them.
ohe = OneHotEncoder(categorical_features=labelList, sparse=False)
oheFit = ohe.fit(X)
X = oheFit.transform(X)

# 80% training set, 20% test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.20, random_state=1)

# Logistic regression, wrapped in a single-step pipeline.
pipe_lr = Pipeline([('clf', LogisticRegression(random_state=1))])

pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))

# Score new listings with the trained model.
X_test = pd.read_csv("D:/dress_test.csv")
# Keep an untouched copy for display: format_data modifies its argument in
# place, and the columns are replaced by integer codes below.
X_copy = X_test.copy()
X_test = format_data(X_test)

# Apply the label -> code tables learned from the training data.
for column in X_test.columns:
    if X_test[column].dtype == "object":
        encoded = X_test[column].map(class_mapping[column])
        # A label that never occurred in training has no code and maps to
        # NaN; report it here instead of letting the encoder fail later
        # with an unrelated "contains NaN" error.
        unseen_mask = encoded.isnull()
        if unseen_mask.any():
            unseen = sorted(set(X_test.loc[unseen_mask, column]))
            raise ValueError(
                "column %r contains labels not seen in training: %s"
                % (column, ", ".join(unseen))
            )
        X_test[column] = encoded

# Reuse the encoder fitted on the training data, then predict probabilities.
X_test_ = oheFit.transform(X_test)
predict_proba = pipe_lr.predict_proba(X_test_)

# Column 1 holds the predicted probability of class 1 (爆款, hot seller).
s = pd.Series(predict_proba[:, 1], name="爆款指数")
# Last expression of the notebook cell: the joined table is displayed.
X_copy.join(s)

Test Accuracy: 0.655


Unnamed: 0,价格,位置,厚薄,面料,中老年风格,中老年女装图案,适用年龄,服装版型,组合形式,穿着方式,衣长,裙长,领型,袖型,成分含量,爆款指数
0,88,浙江 嘉兴,适中,其他,时尚,碎花,40-49周岁,宽松,两件套,套头,中长款,中长款,圆领,常规,96%及以上,0.857847
1,99,上海,适中,聚酯,知性,花色,40-49周岁,宽松,单件,套头,中长款,中长款,圆领,衬衫袖,96%及以上,0.551912
2,148,浙江 杭州,适中,其他,休闲,纯色,40-49周岁,宽松,两件套,开衫,中长款（衣长50-70CM）,中长款,立领,常规,96%及以上,0.876058
3,129,北京,薄,雪纺,时尚,复古图案,40-49周岁,修身,假两件,套头,中长款,中长款,圆领,常规,96%及以上,0.260073
4,76,江苏 苏州,适中,其他,时尚,纯色,40-49周岁,宽松,两件套,开衫,中长款（衣长50-70CM）,中长款,立领,常规,96%及以上,0.970179
