In [None]:
import os
import tarfile
import urllib.request # jupyter notebook에선 import urllib까지만 해도 가능

# Base URL of the public SpamAssassin corpus and the two archives used in this notebook.
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
# Local directory (relative to the working directory) used for the downloaded data.
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """Download the ham and spam archives and extract them under ``spam_path``.

    Args:
        spam_url: URL of the spam archive. The ham archive is always fetched
            from ``HAM_URL``.
        spam_path: Directory that receives the downloaded archives and their
            extracted contents. It is created if it does not exist.

    An archive that is already present on disk is not downloaded again, but
    every archive is re-extracted on each call.
    """
    os.makedirs(spam_path, exist_ok=True)
    # Use the ``spam_url`` argument (not the global) so callers can override it.
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            # Extract next to the archive, i.e. into ``spam_path``.
            tar_bz2_file.extractall(path=spam_path)

In [None]:
fetch_spam_data()

In [None]:
# Collect the file names of all e-mails in the extracted ham and spam folders.
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
# Keep only entries whose name is longer than 20 characters
# (presumably to skip non-e-mail helper files shipped in the archives — TODO confirm).
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [None]:
print(len(ham_filenames))
print(len(spam_filenames))

2500
500


In [None]:
# python module email - 헤더, 인코딩 등 처리
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    """Parse one e-mail file from the extracted corpus.

    Args:
        is_spam: If true, read from the ``spam`` sub-directory, otherwise
            from ``easy_ham``.
        filename: Name of the e-mail file inside that sub-directory.
        spam_path: Root directory that contains the two sub-directories.

    Returns:
        The message parsed with ``email.policy.default``.
    """
    # Import the parser explicitly: this notebook only imports ``email`` and
    # ``email.policy``, so ``email.parser`` would otherwise be reachable only
    # as a side effect of other imports.
    from email.parser import BytesParser

    directory = "spam" if is_spam else "easy_ham"
    # Binary mode: the parser handles the byte-level decoding itself.
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

In [None]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [None]:
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [None]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of sub-parts is rendered as ``"multipart(<part>, <part>, ...)"``, where
    each part is described recursively. Any other message is described by
    its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        # Payload is not a list of sub-parts: report the content type.
        return email.get_content_type()
    described_parts = (get_email_structure(part) for part in parts)
    return "multipart({})".format(", ".join(described_parts))

In [None]:
from collections import Counter

def structures_counter(emails):
    """Count how often each MIME structure occurs in ``emails``.

    Returns a ``Counter`` keyed by the string produced by
    ``get_email_structure`` for each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [None]:
structures_counter(ham_emails).most_common() # most_common(): 많은 순서대로

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [None]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [None]:
# 이메일 헤더 살펴보기
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [None]:
# 보낸사람의 이메일 주소 등 헤더에는 유용한 정보가 많지만 여기선 Subject 헤더만 다룰 것
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Message objects define __len__ and __getitem__, so NumPy may try to treat
# them as nested sequences; dtype=object asks for an array of the message
# objects themselves.
X = np.array(ham_emails + spam_emails, dtype=object)
# Labels: 0 = ham, 1 = spam, in the same order as X.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the messages as a test set; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 전처리 함수 작성
# HTML을 일반 텍스트로 변환 => BeautifulSoup가 좋지만 여기선 의존성을 줄이기 위해 정규식을 사용
import re
from html import unescape # unescape: 다시 원래 형태의 데이터 문자열로 변환하는 작업

def html_to_plain_text(html):
    """Convert an HTML document to rough plain text using regular expressions.

    Steps, in order:
      1. remove the whole ``<head>...</head>`` section,
      2. replace every opening ``<a ...>`` tag with the token `` HYPERLINK ``,
      3. strip all remaining tags,
      4. collapse each run of line breaks (with any whitespace before them)
         into a single newline,
      5. decode HTML entities such as ``&amp;``.

    Args:
        html: The HTML source as a string.

    Returns:
        The extracted text as a string.
    """
    # Flags: re.S lets "." match newlines, re.I ignores case, and re.M only
    # changes how ^ and $ behave (kept from the original; these patterns do
    # not use anchors). Patterns are raw strings so that "\s" reaches the
    # regex engine instead of being an invalid string escape.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)

    # "<a" followed by whitespace, i.e. anchors such as <a href="...">.
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)

    # Drop every other tag.
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [None]:
# 정규표현식 이해(flags 인자)
print(re.findall('a..', 'abc addd$  a\na', flags=re.S))
print(re.findall('a..', 'abc ad$ a\na', flags=re.S | re.M))
print(re.findall('a..', 'ABC a^d$ a\na', flags=re.S | re.M | re.I))

['abc', 'add', 'a\na']
['abc', 'ad$', 'a\na']
['ABC', 'a^d', 'a\na']


In [None]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

<HTML><HEAD><TITLE></TITLE><META http-equiv="Content-Type" content="text/html; charset=windows-1252"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content="MSHTML 6.00.2713.1100" name="GENERATOR"></HEAD>
<BODY text="#000000" vLink="#0033ff" link="#0033ff" bgColor="#CCCC99"><TABLE borderColor="#660000" cellSpacing="0" cellPadding="0" border="0" width="100%"><TR><TD bgColor="#CCCC99" valign="top" colspan="2" height="27">
<font size="6" face="Arial, Helvetica, sans-serif" color="#660000">
<b>OTC</b></font></TD></TR><TR><TD height="2" bgcolor="#6a694f">
<font size="5" face="Times New Roman, Times, serif" color="#FFFFFF">
<b>&nbsp;Newsletter</b></font></TD><TD height="2" bgcolor="#6a694f"><div align="right"><font color="#FFFFFF">
<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height="25" colspan="2" bgcolor="#CCCC99"><table width="100%" border="0" 

In [None]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Watch for analyst "Strong Buy Recommendations" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.
Put CBYI on your watch list, acquire a position TODAY.
REASONS TO INVEST IN CBYI
A profitable company and is on track to beat ALL earnings estimates!
One of the FASTEST growing distributors in environmental & safety equipment instruments.
Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.
RAPIDLY GROWING INDUSTRY
Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billi

In [None]:
# 이메일을 입력으로 받고 일반 텍스트를 출력하는 함수
def email_to_text(email):
    """Extract the text of a message as a plain string.

    The message tree is walked depth-first. The content of the first
    ``text/plain`` part is returned as is. If there is no such part, the last
    ``text/html`` part seen is converted with ``html_to_plain_text``.

    Args:
        email: A parsed message object providing ``walk()``.

    Returns:
        The text as a string, or ``None`` when the message has no usable
        ``text/plain`` or ``text/html`` content.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback (e.g. encoding problems): use the raw
            # payload. Unlike a bare ``except``, this does not swallow
            # KeyboardInterrupt or SystemExit.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [None]:
print(email_to_text(sample_html_spam)[:100], "...")


OTC
 Newsletter
Discover Tomorrow's Winners 
For Immediate Release
Cal-Bay (Stock Symbol: CBYI)
Wat ...


In [None]:
# Stemming is optional: when NLTK is not installed, `stemmer` is set to None.
try:
    import nltk
    stemmer = nltk.PorterStemmer()
    # Quick demo: print the stem produced for several related words.
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [None]:
# Anaconda에선 적용되지 않는 듯.. Google Colab으로 갈아탐
! pip install -q -U urlextract

In [None]:
# Set up URL detection so that web addresses can later be replaced by the token "URL".
# When the urlextract package is not installed, `url_extractor` is set to None.
try:
  import urlextract
  url_extractor = urlextract.URLExtract()
  print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

except ImportError:
  print("Error: replacing URLs requires the urlextract module.")
  url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [None]:
# 하나의 변환기로 연결하여 이메일을 단어 카운트로 바꿈
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransforemr(BaseEstimator, TransformerMixin):
    """Turn e-mail messages into per-message word-count ``Counter`` objects.

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance. NOTE(review): ``transform`` never reads it.
    lower_case : bool
        Lower-case the text before any other processing.
    remove_punctuation : bool
        Replace every run of non-word characters with a single space.
    replace_urls : bool
        Replace each detected URL with the token ``URL``. Only applied when
        the module-level ``url_extractor`` is not ``None``.
    replace_numbers : bool
        Replace each number (integer, decimal, optional exponent) with the
        token ``NUMBER``.
    stemming : bool
        Merge counts of words sharing the same stem. Only applied when the
        module-level ``stemmer`` is not ``None``.

    NOTE(review): the class name contains a typo ("Transforemr"); it is kept
    because other cells refer to it by this name.
    """

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return an object array holding one word-count ``Counter`` per message in ``X``."""
        X_transformed = []
        for email in X:
            # email_to_text may return None; treat that as empty text.
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                # Replace the longest URLs first so that a shorter URL that is
                # a substring of a longer one cannot break the longer match.
                urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                # Digits, then an optional fractional part, then an optional
                # (possibly signed) exponent. Both groups are independently
                # optional, so "12.34" and "1e5" each become one NUMBER.
                text = re.sub(r'\d+(?:\.\d+)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word_counts[stemmer.stem(word)] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        # dtype=object keeps the result an array of Counter objects, also for empty input.
        return np.array(X_transformed, dtype=object)

In [None]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransforemr().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom

위 파이프라인에 나타난 정규표현식 이해

In [None]:
print(re.findall(r'\d+(?:\.\d+(?:[eE]\d+))?', 'abc12.34a4434..22e3.eE(.)\n// 12.3e5 12.5e4'))
# (?: ... ) is a non-capturing group; the "?" after the outer group makes that whole group optional.
# The outer group \.\d+(?:[eE]\d+) needs BOTH a fractional part and an exponent, because the inner exponent group is not optional.
# [eE]\d+ is a scientific-notation exponent (a power of ten), so 12.5e4 is matched as a whole, while a plain decimal such as 12.34 comes back as two matches, '12' and '34'.

['12', '34', '4434', '22', '3', '12.3e5', '12.5e4']


In [None]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word-count mappings into a sparse count matrix.

    ``fit`` builds a vocabulary from the ``vocabulary_size`` most frequent
    words, giving them column indices starting at 1. ``transform`` puts the
    count of every vocabulary word in its column; words outside the
    vocabulary are all mapped to column 0.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn ``most_common_`` and ``vocabulary_`` from the word counts in ``X``."""
        capped_totals = Counter()
        for counts in X:
            for token, occurrences in counts.items():
                # Each message contributes at most 10 per word to the ranking.
                capped_totals[token] += min(occurrences, 10)
        ranked = capped_totals.most_common()[:self.vocabulary_size]
        self.most_common_ = ranked
        # Index 0 is left free for out-of-vocabulary words.
        self.vocabulary_ = {
            token: position
            for position, (token, _) in enumerate(ranked, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        row_indices, col_indices, values = [], [], []
        for doc_index, counts in enumerate(X):
            for token, occurrences in counts.items():
                row_indices.append(doc_index)
                # Unknown words share column 0; csr_matrix sums duplicate
                # (row, column) entries.
                col_indices.append(self.vocabulary_.get(token, 0))
                values.append(occurrences)
        matrix_shape = (len(X), self.vocabulary_size + 1)
        return csr_matrix((values, (row_indices, col_indices)), shape=matrix_shape)

In [None]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.longlong'>'
	with 20 stored elements in Compressed Sparse Row format>

## (참고) 위 파이프라인 세부 내용 파악

In [None]:
# Reproduce by hand the vocabulary-building steps on the sample word counts.
# One name (`total_count`) is used throughout; accumulating into
# `total_count1` and then reading `total_count` raises NameError on a fresh kernel.
total_count = Counter()
for word_count in X_few_wordcounts:
  for word, count in word_count.items():
    total_count[word] += min(count, 10)  # each message adds at most 10 per word
print(total_count, '\n')

# The 10 most frequent words form the vocabulary.
most_common = total_count.most_common()[:10]
print(most_common, '\n')

# Indices start at 1.
vocabulary = {word: index + 1 for index, (word, count) in enumerate(most_common)}
print(vocabulary, '\n')

Counter({'the': 10, 'of': 10, 'and': 10, 'to': 6, 'url': 5, 'all': 4, 'in': 3, 'christian': 3, 'on': 3, 'by': 3, 's': 3, 'group': 3, 'wrote': 2, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'been': 2, 'ha': 2, 'thi': 2, 'half': 2, 'rogueri': 2, 'that': 2, 'teach': 2, 'jesu': 2, 'forteana': 2, 'martin': 2, 'an': 2, 'we': 2, 'is': 2, 'yahoo': 2, 'unsubscrib': 2, 'chuck': 1, 'murcko': 1, 'stuff': 1, 'yawn': 1, 'r': 1, 'some': 1, 'interest': 1, 'quot': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1,

In [None]:
rows = []
cols = []
data = []
for row, word_count in enumerate(X_few_wordcounts):
  for word, count in word_count.items():
    rows.append(row)
    cols.append(vocabulary.get(word, 0))
    data.append(count)

print(rows)
print(cols)
print(data)
# 아래 결과를 보면 겹치는 부분이 있는데, csr_matrix는 이를 모두 더해서 출력함

[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 6, 1, 0, 0, 2, 0, 3, 0, 0, 0, 7, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2

In [None]:
# SciPy sparse matrix examples
from scipy.sparse import coo_matrix

row = [0, 0, 1, 1, 2] # row 인덱스를 담은 리스트
col = [0, 1, 0, 0, 3] # column 인덱스를 담은 리스트
data = [2, 4, 2, 3, 5] # 원소 값을 담은 리스트

m = coo_matrix((data, (row, col)))
m # m.toarray() - 행렬 모양 확인

from scipy.sparse import csr_matrix

indices = [0, 1, 2, 2, 3] # column 인덱스
indptr = [0, 3, 4, 5] # row 인덱스 [0:3] - 첫 행 원소 / [3:4] - 둘째 행 원소 / [4:5] - 셋째 행 원소 => 실제 행보다 indptr 길이는 +1
data = [2, 4, 2, 1, 5]
m = csr_matrix((data, indices, indptr))
m # m.toarray()

# 먼저 coo 행렬을 만들고 나서, tocsr이나 tocsc 메서드를 통해 다른 형식의 희소행렬로 바꾸어서 작업하는 것도 가능 
# 또는 csr_matrix((data, (row, col))) 라고 입력해서 바로 csr 형식으로 희소행렬을 만들어줄 수도 있음

<3x4 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

# 다시 작업 시작

In [None]:
X_few_vectors.toarray()
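# The 67 in the first column (index 0) of the third row means that the third e-mail contains 67 occurrences of words that are not in the vocabulary.
# The 0 that follows means that the first vocabulary word never appears in the third training example, i.e. X_train[2].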

array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],
       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int64)

In [None]:
vocab_transformer.vocabulary_

{'all': 6,
 'and': 3,
 'by': 10,
 'christian': 8,
 'in': 7,
 'of': 2,
 'on': 9,
 'the': 1,
 'to': 4,
 'url': 5}

In [None]:
# 전체 데이터셋 변환
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
                                ("email_to_wordcount", EmailToWordCounterTransforemr()),
                                ("wordcount_to_vector", WordCounterToVectorTransformer()),
])
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# lbfgs is the default solver (it supports L2 regularisation). max_iter is
# raised above the default of 100 because the solver otherwise stops at the
# iteration limit on these unscaled word counts.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.981, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.985, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.994, total=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s finished


0.9866666666666667

In [None]:
from sklearn.metrics import precision_score, recall_score

# Reuse the pipeline fitted on the training set; do not refit it on test data.
X_test_transformed = preprocess_pipeline.transform(X_test)

# max_iter is raised above the default of 100 so that lbfgs can converge
# instead of stopping at the iteration limit.
log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

# Printed labels are "precision" and "recall" (in Korean).
print("정밀도: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("재현율: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

정밀도: 96.88%
재현율: 97.89%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
