In [15]:
#파일 다운 및 압축 해제
import urllib.request as req
import gzip, os, path

savepath="./mnist"
baseurl="http://yann.lecun.com/exdb/mnist"
files=[
    "train-images-idx3-ubyte.gz",
    "train-labels-idx1-ubyte.gz",
    "t10k-images-idx3-ubyte.gz",
    "t10k-labels-idx1-ubyte.gz"
]

#다운로드
if not os.path.exists(savepath): 
    os.mkdir(savepath)
    
for f in files:
    url=baseurl+"/"+f
    loc=savepath+"/"+f
    print("download:",url)
    if not os.path.exists(loc):
        req.urlretrieve(url, loc)
        
#GZip 압축해제
for f in files:
    gz_file=savepath+"/"+f
    raw_file=savepath+"/"+f.replace(".gz","")
    print("gzip:",f)
    with gzip.open(gz_file, "rb") as fp:
        body=fp.read()
        with open (raw_file, "wb") as w:
            w.write(body)
print("ok")

download: http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
download: http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
gzip: train-images-idx3-ubyte.gz
gzip: train-labels-idx1-ubyte.gz
gzip: t10k-images-idx3-ubyte.gz
gzip: t10k-labels-idx1-ubyte.gz
ok


In [16]:
#csv 파일 변환
import struct

def to_csv (name, maxdata):
    
    #레이블 파일과 이미지 파일 열기
    lbl_f=open("./mnist/"+name+"-labels-idx1-ubyte", "rb") #rb: read binary
    img_f=open("./mnist/"+name+"-images-idx3-ubyte", "rb")
    csv_f=open("./mnist/"+name+".csv","w",encoding="utf-8")
    
    #헤더정보읽기
    mag, lbl_count=struct.unpack(">II",lbl_f.read(8))
    mag, img_count=struct.unpack(">II",img_f.read(8))
    rows, cols=struct.unpack(">II",img_f.read(8))
    pixels=rows*cols
    
    #이미지 데이터를 읽고 csv로 저장하기
    res=[]
    for idx in range(lbl_count):
        if idx > maxdata:
            break
        label=struct.unpack("B",lbl_f.read(1))[0]
        bdata=img_f.read(pixels)
        sdata=list(map(lambda n: str(n), bdata))
        csv_f.write(str(label)+",")
        csv_f.write(",".join(sdata)+"\r\n")
        
        #잘 저장됐는지 이미지 파일로 저장해서 테스트하기
        if idx<10:
            s="P2 28 28 255\n"
            s+=" ".join(sdata)
            iname="./mnist/{0}-{1}-{2}.pgm".format(name,idx,label)
            with open(iname, "w", encoding="utf-8") as f:
                f.write(s)
                
    csv_f.close()
    lbl_f.close()
    img_f.close()
    
#결과를 파일로 출력하기
to_csv("train",1000)
to_csv("t10k",500)

In [17]:
#csv 전체 변환
import struct

def to_csv (name):
    
    #레이블 파일과 이미지 파일 열기
    lbl_f=open("./mnist/"+name+"-labels-idx1-ubyte", "rb") #rb: read binary
    img_f=open("./mnist/"+name+"-images-idx3-ubyte", "rb")
    csv_f=open("./mnist/"+name+"_full"+".csv","w",encoding="utf-8")
    
    #헤더정보읽기
    mag, lbl_count=struct.unpack(">II",lbl_f.read(8))
    mag, img_count=struct.unpack(">II",img_f.read(8))
    rows, cols=struct.unpack(">II",img_f.read(8))
    pixels=rows*cols
    
    #이미지 데이터를 읽고 csv로 저장하기
    res=[]
    for idx in range(lbl_count):
        
        label=struct.unpack("B",lbl_f.read(1))[0]
        bdata=img_f.read(pixels)
        sdata=list(map(lambda n: str(n), bdata))
        csv_f.write(str(label)+",")
        csv_f.write(",".join(sdata)+"\r\n")
                
    csv_f.close()
    lbl_f.close()
    img_f.close()
    
#결과를 파일로 출력하기
to_csv("train")
to_csv("t10k")

In [18]:
#데이터 학습시키기
from sklearn import model_selection, svm, metrics

#csv 파일을 읽어들이고 가공하기
def load_csv(fname):
    labels=[]
    images=[]
    
    with open(fname, "r") as f:
        for line in f:
            cols=line.split(",")
            if len(cols)<2:
                continue
            labels.append(int(cols.pop(0)))
            vals=list(map(lambda n: int(n)/256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}

data=load_csv("./mnist/train.csv")
test=load_csv("./mnist/t10k.csv")

#학습하기
clf=svm.SVC()
clf.fit(data["images"], data["labels"])

#예측하기
predict=clf.predict(test["images"])

#결과 확인하기
ac_score=metrics.accuracy_score(test["labels"],predict)
cl_report=metrics.classification_report(test["labels"],predict)
print("정답률=",ac_score)
print("report=\n", cl_report)

정답률= 0.7884231536926147
report=
              precision    recall  f1-score   support

          0       0.87      0.93      0.90        42
          1       0.81      1.00      0.89        67
          2       0.84      0.69      0.76        55
          3       0.87      0.57      0.68        46
          4       0.76      0.75      0.75        55
          5       0.63      0.80      0.71        50
          6       0.97      0.67      0.79        43
          7       0.74      0.86      0.79        49
          8       0.91      0.72      0.81        40
          9       0.71      0.81      0.76        54

avg / total       0.80      0.79      0.79       501



In [19]:
#전체데이터 학습시키기
from sklearn import model_selection, svm, metrics
import pandas as pd

train_csv=pd.read_csv("./mnist/train_full.csv", header=None)
tk_csv=pd.read_csv("./mnist/t10k_full.csv", header=None)

def transfer(dat):
    output=[]
    for i in dat:
        output.append(float(i)/256)
    return output
    


#train_csv_data=train_csv.iloc[:,1:].values #iloc[row범위,col범위] #모든 row, 1이후의 col
train_csv_data=list(map(transfer,train_csv.iloc[:,1:].values)) #map은 iterable 반환 -> 리스트 변환 필요
train_csv_label=train_csv[0].values
#tk_csv_data=tk_csv.iloc[:,1:].values
tk_csv_data=list(map(transfer,tk_csv.iloc[:,1:].values)) 
tk_csv_label=tk_csv[0].values 

#학습하기
clf=svm.SVC()
clf.fit(train_csv_data, train_csv_label) #fit함수의 앞쪽에 들어가는 train_csv_data: 0~1사이의 요소여야 함.
predict=clf.predict(tk_csv_data)

#결과확인하기
score=metrics.accuracy_score(tk_csv_label, predict)
report=metrics.classification_report(tk_csv_label, predict)
print("정답률=",score)
print("report=\n", report)

정답률= 0.9443
report=
              precision    recall  f1-score   support

          0       0.96      0.99      0.97       980
          1       0.97      0.99      0.98      1135
          2       0.94      0.93      0.93      1032
          3       0.93      0.94      0.93      1010
          4       0.93      0.96      0.94       982
          5       0.93      0.91      0.92       892
          6       0.95      0.97      0.96       958
          7       0.96      0.93      0.94      1028
          8       0.94      0.92      0.93       974
          9       0.94      0.92      0.93      1009

avg / total       0.94      0.94      0.94     10000

