In [1]:
import pandas as pd
df = pd.read_csv("../input/cat_train.csv")

In [3]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
df.ord_2.value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [5]:
from sklearn import preprocessing
df.loc[:, "ord_2"] = df.ord_2.fillna("NONE")
lbl_enc = preprocessing.LabelEncoder()
df.loc[:, "ord_2"] = lbl_enc.fit_transform(df.ord_2.values)
df.ord_2.value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

# 疎行列

In [6]:
import numpy as np
from scipy import sparse

# 一例
example = np.array(
    [
        [0,0,1],
        [1,0,0],
        [1,0,1]
    ]
)
print(example.nbytes)

72


In [7]:
sparse_example = sparse.csc_matrix(example)
print(sparse_example)

  (1, 0)	1
  (2, 0)	1
  (0, 2)	1
  (2, 2)	1


In [9]:
type(sparse_example)
print(sparse_example.data.nbytes)

32


## サイズの違いが大きい場合

In [10]:
n_rows = 10000
n_cols = 100000
# コインの表になる確率が 0.05 であるようなコインで1回試行をした結果、表である回数を要素として持つ行列の生成
example = np.random.binomial(1, p=0.05, size=(n_rows, n_cols))

In [12]:
# バイト数
print(f"Size of dense array: {example.nbytes}")

Size of dense array: 8000000000


In [13]:
sparse_example = sparse.csr_matrix(example)
print(f"Size of sparse array: {sparse_example.data.nbytes}")

Size of sparse array: 400050568


In [14]:
df.groupby(["ord_1", "ord_2"])["id"].count().reset_index(name="count")

Unnamed: 0,ord_1,ord_2,count
0,Contributor,0,15634
1,Contributor,1,17734
2,Contributor,2,26082
3,Contributor,3,12428
4,Contributor,4,11919
5,Contributor,5,3250
6,Contributor,6,22774
7,Expert,0,19477
8,Expert,1,22956
9,Expert,2,33249


In [15]:
# 再読み込み
df = pd.read_csv("../input/cat_train.csv")

In [16]:
df.ord_2.fillna("NONE").value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
NONE            18075
Name: ord_2, dtype: int64

# 未知のカテゴリ

In [17]:
# 評価データセットを結合してラベルエンコード
train = pd.read_csv("../input/cat_train.csv")
test = pd.read_csv("../input/cat_test.csv")
# もとに戻す用に擬似的な目的変数を追加
test.loc[:, "target"] = -1
# 結合
data = pd.concat([train, test]).reset_index(drop=True)

In [18]:
# 説明変数
features = [x for x in train.columns if x not in ["id", "target"]]
for ft in features:
    lbl_enc = preprocessing.LabelEncoder()
    temp_col = data[ft].fillna("NONE").astype(str).values
    data.loc[:, ft] = lbl_enc.fit_transform(temp_col)

# 再分割
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [19]:
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,0,0,3,5,3,6,...,27,2,0,3,3,21,57,5,5,0
1,1,1,1,0,0,2,3,4,0,5,...,2113,2,2,6,5,24,151,6,9,0
2,2,0,1,0,0,0,3,1,3,0,...,1400,2,4,2,14,16,106,4,11,0
3,3,2,0,0,0,0,3,0,3,3,...,2168,0,5,4,1,2,46,2,5,0
4,4,0,2,0,2,0,3,6,3,2,...,1748,2,2,1,8,2,51,4,3,0


In [20]:
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,600000,0,0,0,0,2,0,2,0,3,...,2197,2,5,0,6,21,147,2,11,-1
1,600001,0,0,0,0,2,3,0,4,6,...,1107,0,5,1,14,13,46,1,10,-1
2,600002,0,0,0,0,2,0,0,0,6,...,812,0,1,6,9,13,12,1,8,-1
3,600003,1,0,0,0,0,3,2,0,2,...,996,0,1,3,13,1,0,0,8,-1
4,600004,0,0,1,0,2,3,0,5,3,...,371,0,0,4,15,9,14,2,5,-1


In [22]:
type(df.ord_4.fillna("NONE").value_counts())

pandas.core.series.Series

In [23]:
df.ord_4.fillna("NONE").value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
G        3404
V        3107
J        1950
L        1657
Name: ord_4, dtype: int64

In [24]:
df.ord_4.fillna("NONE").value_counts()["N"]

39978

In [25]:
type(df.ord_4.fillna("NONE").value_counts()["N"])

numpy.int64

In [26]:
# loc の書き方が理解不足
df.ord_4.fillna("NONE").value_counts()["N"].values

AttributeError: 'numpy.int64' object has no attribute 'values'

## 特定値未満を希少値とみなす

In [27]:
df.ord_4 = df.ord_4.fillna("NONE")
df.loc[
    # loc の書き方が理解不足
    # 当該行のord_4の値（「N」など）に対するSeries オブジェクトのカウント値が、2000未満の行
    df["ord_4"].value_counts()[df["ord_4"]].values < 2000,
    "ord_4"
] = "RARE"

In [28]:
df.ord_4.value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64

# モデル作成

## 層化抽出kfold結果確認

In [1]:
import pandas as pd
df = pd.read_csv("../input/cat_train_folds.csv")
df.kfold.value_counts()

4    120000
3    120000
2    120000
1    120000
0    120000
Name: kfold, dtype: int64

In [2]:
# 層化抽出されていることを確認
for i in range(5):
    value_counts_per_fold = df[df.kfold==i].target.value_counts()
    print(f"value_counts_per_fold={value_counts_per_fold}")

value_counts_per_fold=0    97535
1    22465
Name: target, dtype: int64
value_counts_per_fold=0    97535
1    22465
Name: target, dtype: int64
value_counts_per_fold=0    97535
1    22465
Name: target, dtype: int64
value_counts_per_fold=0    97536
1    22464
Name: target, dtype: int64
value_counts_per_fold=0    97536
1    22464
Name: target, dtype: int64


In [3]:
type(str)

type

In [4]:
type(int)

type

# 米国国勢調査データ

In [2]:
import pandas as pd
df = pd.read_csv("../input/adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
df.income.value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

不均衡データのためAUCを評価指標にする

In [13]:
# kfold の確認
df = pd.read_csv("../input/adult_folds.csv")
df.groupby(["kfold", "income"]).size()

kfold  income
0      <=50K     4944
       >50K      1569
1      <=50K     4944
       >50K      1568
2      <=50K     4944
       >50K      1568
3      <=50K     4944
       >50K      1568
4      <=50K     4944
       >50K      1568
dtype: int64

In [15]:
df.head()

Unnamed: 0,workclass,education,education.num,marital.status,occupation,relationship,race,sex,native.country,income,kfold
0,Private,Some-college,10,Divorced,Sales,Unmarried,White,Male,United-States,<=50K,0
1,Private,10th,6,Divorced,Other-service,Unmarried,White,Female,United-States,<=50K,0
2,Private,HS-grad,9,Divorced,Other-service,Not-in-family,White,Male,United-States,<=50K,0
3,Private,Assoc-voc,11,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,United-States,<=50K,0
4,Private,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,United-States,<=50K,0


In [17]:
df.columns

Index(['workclass', 'education', 'education.num', 'marital.status',
       'occupation', 'relationship', 'race', 'sex', 'native.country', 'income',
       'kfold'],
      dtype='object')