## 셈플 데이터셋 적재하기

In [1]:
from sklearn import datasets

In [2]:
digits = datasets.load_digits() # 수기로 쓴 숫자 이미지

In [6]:
features = digits.data
target = digits.target

In [7]:
features[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [8]:
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [9]:
import numpy as np

In [10]:
X,y = datasets.load_digits(n_class=5, return_X_y=True)
np.unique(y)

array([0, 1, 2, 3, 4])

## 모의 데이터셋 만들기

In [11]:
from sklearn.datasets import make_regression

In [13]:
features, target, coefficients = make_regression(n_samples = 100
                                                ,n_features = 3
                                                ,n_informative = 3
                                                ,n_targets = 1
                                                ,noise = 0.0
                                                ,coef = True
                                                ,random_state = 1)

In [15]:
print('특성 행렬\n',features[:3])
print('타깃 벡터\n', target[:3])

특성 행렬
 [[ 1.29322588 -0.61736206 -0.11044703]
 [-2.793085    0.36633201  1.93752881]
 [ 0.80186103 -0.18656977  0.0465673 ]]
타깃 벡터
 [-10.37865986  25.5124503   19.67705609]


In [16]:
from sklearn.datasets import make_classification

In [21]:
features, target = make_classification(n_samples = 100
                                      ,n_features = 3
                                      ,n_informative = 3
                                      ,n_redundant = 0
                                      ,n_classes = 2
                                      ,weights = [.25,.75]
                                      ,random_state = 1)

print('특성 행렬\n', features[:3])
print('타깃 벡터\n', target[:3])

특성 행렬
 [[ 1.06354768 -1.42632219  1.02163151]
 [ 0.23156977  1.49535261  0.33251578]
 [ 0.15972951  0.83533515 -0.40869554]]
타깃 벡터
 [1 0 0]


In [26]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [30]:
features, target = make_blobs(n_samples = 100
                             ,n_features = 2
                             ,centers = 3
                             ,cluster_std = True
                             ,shuffle = True
                             ,random_state = 1)
plt.scatter(features[:,0], features[:,1],c=target)

<IPython.core.display.Javascript object>

<matplotlib.collections.PathCollection at 0x7fd679425fa0>

## csv 파일 적재하기

In [31]:
import pandas as pd

In [33]:
url = 'https://tinyurl.com/simulated-data'
dataframe = pd.read_csv(url)

In [35]:
dataframe.head(2)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0


In [36]:
dataframe = pd.read_csv(url, skiprows=range(1,11),nrows=1)
dataframe

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:10,0


## 엑셀 차일 적재하기

In [39]:
url = 'https://tinyurl.com/simulated-excel'
dataframe = pd.read_excel(url, sheet_name=0,header=1)

In [41]:
dataframe.head(2)

Unnamed: 0,5,2015-01-01 00:00:00,0
0,5,2015-01-01 00:00:01,0
1,9,2015-01-01 00:00:02,0


## JSON 파일 적재하기

In [44]:
url = 'http://tinyurl.com/simulated-json'
dataframe = pd.read_json(url, orient='columns')
dataframe.head(2)

Unnamed: 0,integer,datetime,category
0,5,2015-01-01 00:00:00,0
1,5,2015-01-01 00:00:01,0


## SQl 데이터 베이스로부터 적재하기 

In [46]:
import pandas as pd
from sqlalchemy import create_engine

In [47]:
database_connection = create_engine('sqlite:///sample.db')

In [49]:
dataframe = pd.read_sql_query('SELECT * FROM data',database_connection)
dataframe.head(2)

OperationalError: (sqlite3.OperationalError) no such table: data
[SQL: SELECT * FROM data]
(Background on this error at: http://sqlalche.me/e/13/e3q8)