In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
import matplotlib.pyplot as plt


# Load data

In [13]:
# 데이터프레임으로 데이터를 적재합니다.
data = pd.read_csv('./data/housing.csv')


In [14]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND


# pandas를 이용한 one-hot 인코딩 (get_dummies 함수)


In [15]:
data[['ocean_proximity']]

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
...,...
20635,INLAND
20636,INLAND
20637,INLAND
20638,INLAND


In [16]:
dummy

Unnamed: 0,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0,0,1,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
20635,1,0,0,0
20636,1,0,0,0
20637,1,0,0,0
20638,1,0,0,0


In [17]:
dummy = pd.get_dummies(data[['ocean_proximity']], drop_first=True)
data = pd.concat([data, dummy], axis=1) #concatenate

In [18]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,NEAR BAY,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,NEAR BAY,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,NEAR BAY,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,NEAR BAY,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,NEAR BAY,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,INLAND,1,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,INLAND,1,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,INLAND,1,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,INLAND,1,0,0,0


In [19]:
data = data.drop('ocean_proximity', axis=1)

In [20]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0


# Dealing with missing data

## isnull 메서드

In [22]:
data.isnull().sum()

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
median_house_value              0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64

## dropna 메서드

In [51]:
# missing value 포함하는 행 제거
# 유지하고 싶으면 다른 변수에 저장 해야함
data = data.dropna(axis=0)

In [52]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0


# Indexing

In [53]:
data.iloc[1]

longitude                       -122.2200
latitude                          37.8600
housing_median_age                21.0000
total_rooms                     7099.0000
total_bedrooms                  1106.0000
population                      2401.0000
households                      1138.0000
median_income                      8.3014
median_house_value            358500.0000
ocean_proximity_INLAND             0.0000
ocean_proximity_ISLAND             0.0000
ocean_proximity_NEAR BAY           1.0000
ocean_proximity_NEAR OCEAN         0.0000
Name: 1, dtype: float64

In [54]:
data.iloc[0:4]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0


# Calculate Distance 

In [55]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0


In [56]:
data.iloc[0],data.iloc[1]

(longitude                       -122.2300
 latitude                          37.8800
 housing_median_age                41.0000
 total_rooms                      880.0000
 total_bedrooms                   129.0000
 population                       322.0000
 households                       126.0000
 median_income                      8.3252
 median_house_value            452600.0000
 ocean_proximity_INLAND             0.0000
 ocean_proximity_ISLAND             0.0000
 ocean_proximity_NEAR BAY           1.0000
 ocean_proximity_NEAR OCEAN         0.0000
 Name: 0, dtype: float64,
 longitude                       -122.2200
 latitude                          37.8600
 housing_median_age                21.0000
 total_rooms                     7099.0000
 total_bedrooms                  1106.0000
 population                      2401.0000
 households                      1138.0000
 median_income                      8.3014
 median_house_value            358500.0000
 ocean_proximity_INLAND     

In [57]:
#1차 거리 
np.linalg.norm(data.iloc[0] - data.iloc[1], ord=1)

104407.0538

In [58]:
#2차 거리 
np.linalg.norm(data.iloc[0] - data.iloc[1], ord=2)

94338.68387358956

In [59]:
#inf 거리 
np.linalg.norm(data.iloc[0] - data.iloc[1], ord=np.inf)

94100.0

In [60]:
# 첫번째 집과 가장 거리가 가까운 샘플은 몇번째 집 일까? 

difference = data.iloc[0] - data

distance_vector = np.linalg.norm(data.iloc[0] - data.iloc[1:], axis=1, ord=1)

In [61]:
np.argmin(distance_vector)

9225

In [62]:
data.iloc[0] , data.iloc[9225+1]

(longitude                       -122.2300
 latitude                          37.8800
 housing_median_age                41.0000
 total_rooms                      880.0000
 total_bedrooms                   129.0000
 population                       322.0000
 households                       126.0000
 median_income                      8.3252
 median_house_value            452600.0000
 ocean_proximity_INLAND             0.0000
 ocean_proximity_ISLAND             0.0000
 ocean_proximity_NEAR BAY           1.0000
 ocean_proximity_NEAR OCEAN         0.0000
 Name: 0, dtype: float64,
 longitude                       -122.4900
 latitude                          37.9800
 housing_median_age                34.0000
 total_rooms                     1256.0000
 total_bedrooms                   178.0000
 population                       460.0000
 households                       174.0000
 median_income                      6.4271
 median_house_value            451700.0000
 ocean_proximity_INLAND     

# 코사인 거리 계산 

In [63]:
cosine_similarity(np.array(data.iloc[0]).reshape(1,-1), np.array(data.iloc[954]).reshape(1,-1))

array([[0.99997432]])

# Correlation 계산


In [64]:
print(data.corr())

                            longitude  latitude  housing_median_age  \
longitude                    1.000000 -0.924616           -0.109357   
latitude                    -0.924616  1.000000            0.011899   
housing_median_age          -0.109357  0.011899            1.000000   
total_rooms                  0.045480 -0.036667           -0.360628   
total_bedrooms               0.069608 -0.066983           -0.320451   
population                   0.100270 -0.108997           -0.295787   
households                   0.056513 -0.071774           -0.302768   
median_income               -0.015550 -0.079626           -0.118278   
median_house_value          -0.045398 -0.144638            0.106432   
ocean_proximity_INLAND      -0.055337  0.351084           -0.236968   
ocean_proximity_ISLAND       0.009501 -0.016662            0.017105   
ocean_proximity_NEAR BAY    -0.474714  0.358785            0.256149   
ocean_proximity_NEAR OCEAN   0.046185 -0.161342            0.020797   

     

In [32]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
longitude,1.0,-0.924664,-0.108197,0.044568,0.069608,0.099773,0.05531,-0.015176,-0.045967
latitude,-0.924664,1.0,0.011173,-0.0361,-0.066983,-0.108785,-0.071035,-0.079809,-0.14416
housing_median_age,-0.108197,0.011173,1.0,-0.361262,-0.320451,-0.296244,-0.302916,-0.119034,0.105623
total_rooms,0.044568,-0.0361,-0.361262,1.0,0.93038,0.857126,0.918484,0.19805,0.134153
total_bedrooms,0.069608,-0.066983,-0.320451,0.93038,1.0,0.877747,0.979728,-0.007723,0.049686
population,0.099773,-0.108785,-0.296244,0.857126,0.877747,1.0,0.907222,0.004834,-0.02465
households,0.05531,-0.071035,-0.302916,0.918484,0.979728,0.907222,1.0,0.013033,0.065843
median_income,-0.015176,-0.079809,-0.119034,0.19805,-0.007723,0.004834,0.013033,1.0,0.688075
median_house_value,-0.045967,-0.14416,0.105623,0.134153,0.049686,-0.02465,0.065843,0.688075,1.0


# Preprocessing

### Sampling 

In [66]:
data.sample(100)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
13434,-117.43,34.10,43,1898,418.0,971,366,2.4735,89900,1,0,0,0
6531,-118.07,34.07,31,1370,284.0,1062,277,3.5156,199300,0,0,0,0
3971,-118.58,34.19,35,2329,399.0,966,336,3.8839,224900,0,0,0,0
1949,-120.86,38.75,15,1533,300.0,674,287,2.5625,146100,1,0,0,0
423,-122.26,37.88,52,2551,417.0,894,404,6.2425,391800,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10488,-117.68,33.49,16,3084,724.0,2557,690,2.8357,106300,0,0,0,0
7928,-118.08,33.85,22,1055,204.0,682,216,6.0000,191300,0,0,0,0
15670,-122.44,37.80,52,1006,291.0,445,257,2.7717,500000,0,0,1,0
3306,-122.64,38.96,29,883,187.0,326,136,1.7273,58200,1,0,0,0


# Attribute Transform

In [68]:
normalization_data= (data - data.mean(axis=0))/data.std(axis=0)
normalization_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-1.327281,1.051692,0.982139,-0.803793,-0.970301,-0.973296,-0.976809,2.345106,2.128767,-0.682697,-0.015644,2.828592,-0.384177
1,-1.322290,1.042330,-0.606195,2.042080,1.348243,0.861318,1.670332,2.332575,1.313594,-0.682697,-0.015644,2.828592,-0.384177
2,-1.332272,1.037649,1.855723,-0.535176,-0.825541,-0.819749,-0.843406,1.782896,1.258152,-0.682697,-0.015644,2.828592,-0.384177
3,-1.337263,1.037649,1.855723,-0.623495,-0.718750,-0.765037,-0.733544,0.932947,1.164593,-0.682697,-0.015644,2.828592,-0.384177
4,-1.337263,1.037649,1.855723,-0.461959,-0.611959,-0.758860,-0.628914,-0.013143,1.172390,-0.682697,-0.015644,2.828592,-0.384177
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.758299,1.800633,-0.288528,-0.444570,-0.388886,-0.511774,-0.443196,-1.216697,-1.115465,1.464708,-0.015644,-0.353516,-0.384177
20636,-0.818192,1.805314,-0.844446,-0.887535,-0.920466,-0.943292,-1.008198,-0.692027,-1.124128,1.464708,-0.015644,-0.353516,-0.384177
20637,-0.823183,1.777229,-0.923862,-0.175038,-0.125468,-0.368817,-0.173773,-1.143143,-0.992452,1.464708,-0.015644,-0.353516,-0.384177
20638,-0.873094,1.777229,-0.844446,-0.355336,-0.305826,-0.603549,-0.393497,-1.055110,-1.058290,1.464708,-0.015644,-0.353516,-0.384177


In [69]:
normalization_data = (data - data.min())/data.max()
normalization_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-0.018546,0.127294,0.769231,0.022330,0.019860,0.008940,0.020552,0.521683,0.875200,0.0,0.0,1.0,0.0
1,-0.018634,0.126818,0.384615,0.180493,0.171451,0.067205,0.186945,0.520097,0.687001,0.0,0.0,1.0,0.0
2,-0.018459,0.126579,0.980769,0.037258,0.029325,0.013816,0.028938,0.450497,0.674201,0.0,0.0,1.0,0.0
3,-0.018371,0.126579,0.980769,0.032350,0.036307,0.015554,0.035843,0.342878,0.652601,0.0,0.0,1.0,0.0
4,-0.018371,0.126579,0.980769,0.041328,0.043289,0.015750,0.042420,0.223085,0.654401,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.028519,0.165435,0.461538,0.042294,0.057874,0.023597,0.054094,0.070693,0.126202,1.0,0.0,0.0,0.0
20636,-0.027469,0.165673,0.326923,0.017675,0.023119,0.009893,0.018579,0.137126,0.124202,1.0,0.0,0.0,0.0
20637,-0.027382,0.164243,0.307692,0.057274,0.075097,0.028137,0.071029,0.080006,0.154602,1.0,0.0,0.0,0.0
20638,-0.026507,0.164243,0.326923,0.047253,0.063305,0.020683,0.057218,0.091153,0.139402,1.0,0.0,0.0,0.0


# Discretization

In [70]:
category = ['Child', 'Young', 'Adults', 'Senior']
pd.cut(x=data['housing_median_age'], bins=[0,14,24,64,100],labels=category)

0        Adults
1         Young
2        Adults
3        Adults
4        Adults
          ...  
20635    Adults
20636     Young
20637     Young
20638     Young
20639     Young
Name: housing_median_age, Length: 20433, dtype: category
Categories (4, object): ['Child' < 'Young' < 'Adults' < 'Senior']

In [71]:
data['age_discretization'] = pd.cut(x=data['housing_median_age'], bins=[0,14,24,64,100],labels=category)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,age_discretization
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0,Adults
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0,Young
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0,Adults
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0,Adults
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0,Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0,Adults
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0,Young
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0,Young
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0,Young


# delete feature

In [73]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,age_discretization
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0,Adults
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0,Young
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0,Adults
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0,Adults
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0,Adults
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0,Adults
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0,Young
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0,Young
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0,Young


In [74]:
data.drop(['age_discretization'], axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600,0,0,1,0
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500,0,0,1,0
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100,0,0,1,0
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300,0,0,1,0
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,78100,1,0,0,0
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,77100,1,0,0,0
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,92300,1,0,0,0
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,84700,1,0,0,0


In [None]:
data