In [1]:
# ライブラリ読み込み
import pandas as pd
import numpy as np
import multiprocessing
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import metrics

from time import time
import datetime
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,GroupKFold
from sklearn.metrics import roc_auc_score
sns.set()
%matplotlib inline

import os
import sys
import warnings
import gc

import feather

# Project root: the parent directory of the active virtualenv.
# Fail with a clear message when no virtualenv is active instead of an
# opaque AttributeError on None.
_virtual_env = os.environ.get('VIRTUAL_ENV')
if _virtual_env is None:
    raise RuntimeError(
        'VIRTUAL_ENV is not set; activate the project virtualenv before running this notebook.'
    )
HOME_DIR = '/'.join(_virtual_env.split('/')[:-1])

# Application constants
PYSRC_DIR = os.path.join(HOME_DIR, 'py')
DATA_DIR = os.path.join(HOME_DIR, 'data/wine_quality')
EVAL_DIR = os.path.join(HOME_DIR, 'eval')

# Put the shared-script directory on the import path
sys.path.append(PYSRC_DIR)

# Move into the data folder so that later cells can use bare file names
os.chdir(DATA_DIR)
from glob import glob
glob('./*')

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
from copy import deepcopy

# Visualization settings

plt.rcParams["patch.force_edgecolor"] = False
# NOTE(review): the font name is spelled 'Diminised' here; confirm it matches
# the installed font family, otherwise matplotlib falls back to its default.
plt.rcParams['font.family'] = 'Ricty Diminised'
sns.set(style="whitegrid", font='Ricty Diminised', palette="muted",
        color_codes=True, rc={'grid.linestyle': '--'})
red = sns.xkcd_rgb["light red"]
green = sns.xkcd_rgb["medium green"]
blue = sns.xkcd_rgb["denim blue"]

import japanize_matplotlib


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [30]:
!pwd

/Users/yuki.matsumoto/Documents/Project/infonia/Git/InfoNear/py/eda


In [2]:
def kesson_table(df):
    """Summarise missing values per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: ``'lack num'`` (count
        of null entries) and ``'%'`` (that count as a percentage of
        ``len(df)``).
    """
    missing_count = df.isnull().sum()
    missing_pct = 100 * missing_count / len(df)
    # ``keys`` labels the two concatenated Series directly.
    return pd.concat([missing_count, missing_pct], axis=1, keys=['lack num', '%'])

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Each column whose dtype is one of int16/int32/int64/float16/float32/float64
    is converted to the first (narrowest) NumPy type whose representable range
    contains the column's minimum and maximum. The bounds are inclusive, so a
    column spanning exactly [-128, 127] is stored as int8.

    Float columns are selected on range only: storing them as float16 or
    float32 can round the stored values. Columns of any other dtype are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target types, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. a column with no rows) no candidate
            # matches and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback, including for NaN/inf extremes, for
            # which both range checks below are false.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
# Load the red and white wine-quality tables; both files are ';'-separated.
winequality_red_df, winequality_white_df = (
    pd.read_csv(csv_name, sep=";")
    for csv_name in ('winequality-red.csv', 'winequality-white.csv')
)

In [5]:
# Report the dimensions of each loaded table.
red_rows, red_cols = winequality_red_df.shape
white_rows, white_cols = winequality_white_df.shape
print(f'winequality_red_df dataset has {red_rows} rows and {red_cols} columns.')
print(f'winequality_white_df dataset has {white_rows} rows and {white_cols} columns.')

winequality_red_df dataset has 1599 rows and 12 columns.
winequality_white_df dataset has 4898 rows and 12 columns.


In [6]:
# Preview the first rows of both tables (red first, then white).
display(winequality_red_df.head(), winequality_white_df.head())

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [10]:
# Distinct quality scores present in the red-wine table, in order of first appearance.
winequality_red_df.loc[:, "quality"].unique()

array([5, 6, 7, 4, 8, 3])

In [11]:
# Storage dtype of the quality column.
winequality_red_df.dtypes["quality"]

dtype('int64')

In [12]:
# Storage dtype of the alcohol column.
winequality_red_df.dtypes["alcohol"]

dtype('float64')

## knn実装

In [16]:
class KNN2d:
    """k-th-nearest-neighbour distance scoring for points in a 2-D plane."""

    def knn2d(self, x, y, k):
        """Score every point by the distance to its k-th nearest neighbour.

        For each point j, the Euclidean distances to all points (including
        itself, at distance 0) are sorted ascending and the entry at position
        ``k`` is taken. Because the self-distance occupies position 0,
        ``k=1`` selects the nearest other point.

        Parameters
        ----------
        x, y : array-like
            1-D coordinates of the points; must have the same shape. They are
            read positionally, so a pandas Series with any index is accepted.
        k : int
            Position in the sorted distance list to use as the score.

        Returns
        -------
        numpy.ndarray
            One score per point: the selected distance divided by 10 (a fixed
            scale factor kept from the original implementation).

        Raises
        ------
        ValueError
            If ``x`` and ``y`` do not have the same shape.
        IndexError
            If ``k`` is not a valid position among the distances of one point
            (e.g. ``k`` >= number of points).
        """
        # Convert once so indexing is positional, not label-based.
        xs = np.asarray(x, dtype=float)
        ys = np.asarray(y, dtype=float)
        if xs.shape != ys.shape:
            raise ValueError(
                'x and y must have the same shape, got {} and {}'.format(xs.shape, ys.shape)
            )

        num = xs.shape[0]
        kth_distances = np.empty(num, dtype=float)
        for j in range(num):
            # Distances from point j to every point, computed as one vector
            # operation instead of an inner Python loop.
            distances = ((xs - xs[j]) ** 2 + (ys - ys[j]) ** 2) ** 0.5
            distances.sort()
            kth_distances[j] = distances[k]

        return kth_distances / 10

    def abnormal_decision(self, abnormals, treshold):
        """Return the positions whose score is strictly greater than ``treshold``.

        Parameters
        ----------
        abnormals : array-like
            1-D scores, e.g. the output of :meth:`knn2d`. Read positionally.
        treshold : float
            Cut-off value; scores equal to it are not selected.

        Returns
        -------
        list of int
            Zero-based positions of the selected scores, in ascending order.
        """
        scores = np.asarray(abnormals)
        return np.flatnonzero(scores > treshold).tolist()

In [None]:
# Instantiate the 2-D kNN distance scorer defined above.
knn = KNN2d()
