<a href="https://colab.research.google.com/github/arua23/2020-CBMS-DoubleU-Net/blob/master/03_SPAM_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Downloading the dataset

In [None]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = 'https://spamassassin.apache.org/old/publiccorpus/'
HAM_URL = DOWNLOAD_ROOT + '20021010_easy_ham.tar.bz2'
SPAM_URL = DOWNLOAD_ROOT + '20021010_spam.tar.bz2'
SPAM_PATH = os.path.join('datasets', 'spam')

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH, ham_url=HAM_URL):
    """Download (when missing) and extract the ham and spam archives.

    Args:
        spam_url: URL of the spam ``.tar.bz2`` archive.
        spam_path: Directory that receives the downloaded archives and their
            extracted contents. It is created if it does not exist.
        ham_url: URL of the ham ``.tar.bz2`` archive.

    An archive already present in ``spam_path`` is not downloaded again, but
    it is re-extracted on every call.
    """
    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (('ham.tar.bz2', ham_url), ('spam.tar.bz2', spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # Extract next to the archive, i.e. into the directory the caller
        # asked for, and close the archive even if extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            tar_bz2_file.extractall(path=spam_path)

# fetch_spam_data()

Saving the dataset into Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

save_path = '/content/drive/MyDrive/Machine Learning Projects/Hands On Machine Learning/SPAM_classifier/'

# # Copying the downloaded files

# import shutil
# shutil.copytree('/content/datasets/spam', save_path, dirs_exist_ok= True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# After copying the files

# Folders holding the extracted ham and spam messages inside the Drive copy
ham_path = os.path.join(save_path, 'easy_ham')
spam_path = os.path.join(save_path, 'spam')

# Keep only directory entries whose names are longer than 20 characters
# (presumably this skips non-message files in the corpus folders - verify),
# listed in sorted order.
ham_filenames = sorted(entry for entry in os.listdir(ham_path) if len(entry) > 20)
spam_filenames = sorted(entry for entry in os.listdir(spam_path) if len(entry) > 20)

print(f'Number of ham emails: {len(ham_filenames)}')
print(f'Number of spam emails: {len(spam_filenames)}')

Number of ham emails: 2551
Number of spam emails: 501


In [None]:
ham_filenames[:10]

['0001.ea7e79d3153e7469e7a9c3e0af6a357e',
 '0002.b3120c4bcbf3101e661161ee7efcb8bf',
 '0003.acfc5ad94bbd27118a0d8685d18c89dd',
 '0004.e8d5727378ddde5c3be181df593f1712',
 '0005.8c3b9e9c0f3f183ddaf7592a11b99957',
 '0006.ee8b0dba12856155222be180ba122058',
 '0007.c75188382f64b090022fa3b095b020b0',
 '0008.20bc0b4ba2d99aae1c7098069f611a9b',
 '0009.435ae292d75abb1ca492dcc2d5cf1570',
 '0010.4996141de3f21e858c22f88231a9f463']

Reading the emails

In [None]:
import email
import email.policy

# Custom function for loading the emails

# def load_email(is_spam, file_name, email_path = save_path):
#     directory = 'spam' if is_spam else 'easy_ham'
#     with open(os.path.join(email_path, directory, file_name), 'rb') as f:
#         return email.parser.BytesParser(policy = email.policy.default).parse(f)

def load_email(file_path):
    """Parse the raw email message stored at ``file_path``.

    Args:
        file_path: Path of a file containing a single raw message.

    Returns:
        The message object produced by ``BytesParser`` using
        ``email.policy.default``.
    """
    # `import email` does not load the `email.parser` submodule. Import the
    # parser here so the function does not rely on some other module having
    # imported it as a side effect.
    from email.parser import BytesParser

    with open(file_path, 'rb') as f:
        return BytesParser(policy=email.policy.default).parse(f)


In [None]:
# Parse every ham message, then every spam message, following the order of
# the filename lists.
ham_emails = [load_email(os.path.join(ham_path, name)) for name in ham_filenames]
spam_emails = [load_email(os.path.join(spam_path, name)) for name in spam_filenames]

In [None]:
# Printing a sample ham email
print(ham_emails[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [None]:
# Printing a sample spam email
print(spam_emails[6].get_content().strip())

A POWERHOUSE GIFTING PROGRAM You Don't Want To Miss! 
 
  GET IN WITH THE FOUNDERS! 
The MAJOR PLAYERS are on This ONE
For ONCE be where the PlayerS are
This is YOUR Private Invitation

EXPERTS ARE CALLING THIS THE FASTEST WAY 
TO HUGE CASH FLOW EVER CONCEIVED
Leverage $1,000 into $50,000 Over and Over Again

THE QUESTION HERE IS:
YOU EITHER WANT TO BE WEALTHY 
OR YOU DON'T!!!
WHICH ONE ARE YOU?
I am tossing you a financial lifeline and for your sake I 
Hope you GRAB onto it and hold on tight For the Ride of youR life!

Testimonials

Hear what average people are doing their first few days:
�We've received 8,000 in 1 day and we are doing that over and over again!' Q.S. in AL
 �I'm a single mother in FL and I've received 12,000 in the last 4 days.� D. S. in FL
�I was not sure about this when I sent off my $1,000 pledge, but I got back $2,000 the very next day!� L.L. in KY
�I didn't have the money, so I found myself a partner to work this with. We have received $4,000 over the last 2 days

# Getting Structures of the emails

In [None]:
# Custom function for getting email structures
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of sub-parts yields ``multipart(...)`` wrapping the comma-separated
    structures of those parts (computed recursively). Any other message
    yields its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    nested = ', '.join(map(get_email_structure, parts))
    return 'multipart({})'.format(nested)

In [None]:
from collections import Counter

def structures_counter(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [None]:
structures_counter(ham_emails).most_common()

[('text/plain', 2453),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [None]:
structures_counter(spam_emails).most_common()

[('text/plain', 222),
 ('text/html', 181),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 19),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [None]:
# Header value for a spam email
for header, value in spam_emails[6].items():
    print(header,":",value)

Return-Path : <Thecashsystem@firemail.de>
Delivered-To : zzzz@localhost.example.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.example.com (Postfix) with ESMTP id 3453043F99	for <zzzz@localhost>; Thu, 22 Aug 2002 11:58:24 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 16:58:24 +0100 (IST)
Received : from mailbox-13.st1.spray.net (mailbox-13.st1.spray.net [212.78.202.113])	by webnote.net (8.9.3/8.9.3) with ESMTP id QAA05573	for <zzzz@example.com>; Thu, 22 Aug 2002 16:55:29 +0100
Received : from freesource (user-24-214-168-210.knology.net [24.214.168.210])	by mailbox-13.st1.spray.net (Postfix) with ESMTP	id ADDD03E25C; Thu, 22 Aug 2002 17:50:55 +0200 (DST)
Message-ID : <413-220028422154219900@freesource>
X-Priority : 1
To : 1 <thecashsystem@firemail.de>
From : TheCashSystem <Thecashsystem@firemail.de>
Subject : RE: Your Bank Account Information 
Date : Thu, 22

In [None]:
# Header value for a ham email
for header, value in ham_emails[1].items():
    print(header,":",value)

Return-Path : <Steve_Burt@cursor-system.com>
Delivered-To : zzzz@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id BE12E43C34	for <zzzz@localhost>; Thu, 22 Aug 2002 07:46:38 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:46:38 +0100 (IST)
Received : from n20.grp.scd.yahoo.com (n20.grp.scd.yahoo.com    [66.218.66.76]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id    g7MBkTZ05087 for <zzzz@example.com>; Thu, 22 Aug 2002 12:46:29 +0100
X-Egroups-Return : sentto-2242572-52726-1030016790-zzzz=example.com@returns.groups.yahoo.com
Received : from [66.218.67.196] by n20.grp.scd.yahoo.com with NNFMP;    22 Aug 2002 11:46:30 -0000
X-Sender : steve.burt@cursor-system.com
X-Apparently-To : zzzzteana@yahoogroups.com
Received : (EGP: mail-8_1_0_1); 22 Aug 2002 11:46:29 -0000
Received : (qmail 11764 invoked from network); 2

## Subject of the ham and spam emails

In [None]:
print(ham_emails[1]['Subject'])

[zzzzteana] RE: Alexander


In [None]:
print(spam_emails[6]['Subject'])

RE: Your Bank Account Information 


# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Stack ham first, then spam; dtype='object' keeps each parsed message as a
# single array element.
X = np.array(ham_emails + spam_emails, dtype = 'object')
# Labels in the same order as X: 0 for every ham message, 1 for every spam message.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## HTML to plain text generation

In [None]:
import re
from html import unescape

# Custom function for reading all html tags and converting them into plain text

def html_to_plain_text(html):
    """Convert an HTML document to rough plain text using regular expressions.

    Args:
        html: The HTML source as ``str`` or ``bytes``. Bytes are decoded as
            latin-1.

    Returns:
        The text with the ``<head>`` section removed, every opening ``<a ...>``
        tag replaced by ``' HYPERLINK '``, all remaining tags stripped, runs of
        blank lines collapsed to one newline, and HTML entities unescaped.
    """
    if isinstance(html, bytes):
        html = html.decode('latin-1')
    # Raw strings throughout: '\s' in a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)


### HTML contents in a sample spam email

In [None]:
html_spam_email = [emails for emails in X_train[y_train==1]
                   if get_email_structure(emails)=='text/html']
sample_html_spam = html_spam_email[7]
print(sample_html_spam.get_content().strip()[:1000], '...')

<html>
<head>
</head>
<center>
<h1>
<b><font face="Arial Black"><font color="#0000FF"><font size=+2>&nbsp;
Free Personal and Business Grants</font></font></font></b></h1></center>

<p>&nbsp;
<center><table BORDER=0 CELLSPACING=0 CELLPADDING=10 WIDTH="419" BGCOLOR="#0000FF" >
<tr>
<td WIDTH="397" BGCOLOR="#FFFF00">
<center>
<h2>
<font face="Arial Narrow">" Qualify for <u>at least</u> $25,000 in free
grants money - Guaranteed! "</font></h2></center>
</td>
</tr>
</table></center>

<center>
<h3>
<font face="Arial"><font size=+0>Each day over One Million Dollars in Free
Government<br>
Grants&nbsp; is given away to people just like you for a wide<br>
variety of Business And Personal Needs</font></font></h3></center>
<font face="Verdana"><font size=-1>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
Dear Grant Seeker,</font></font>
<blockquote><font face="Verdana"><font size=-1>In a moment, I'll tell you
exactly <b>HOW &amp; WHERE</b> to get Grants. This <b>MONEY</b> has to
be given away, <b>WHY</b

### Plain Text from the HTML spam sample

In [None]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], '...')


 
Free Personal and Business Grants
 
" Qualify for at least $25,000 in free
grants money - Guaranteed! "
Each day over One Million Dollars in Free
Government
Grants  is given away to people just like you for a wide
variety of Business And Personal Needs
       
Dear Grant Seeker,
In a moment, I'll tell you
exactly HOW & WHERE to get Grants. This MONEY has to
be given away, WHY not to YOU?
You may be thinking, "How
can I get some of this Free Grants Money"
Maybe you think it's impossible
to get free money?
Let me tell you it's not
impossible! It's a fact, ordinary people and businesses all across the
United States are receiving millions of dollars from these Government and
Private Foundation's everyday.
Who Can Apply?
ANYONE can apply
for a Grant from 18 years old and up!
Grants from $500.00 to $50,000.00
are possible! GRANTS don't have to be paid back,
EVER! Claim
your slice of the FREE American Pie.
This money is not a loan,
Trying to get money through a conventional bank can be ver

## HTML content in a HAM email

In [None]:
html_ham_emails = [emails for emails in X_train[y_train==0]
                   if get_email_structure(emails)=='multipart(text/plain, text/html)']

# print(len(html_ham_emails))
sample_html_ham = html_ham_emails[1]

# Custom content manager for multipart emails
content_manager = email.contentmanager.ContentManager()

def handle_multipart_alternative(msg):
    """Extract the plain-text and HTML alternatives of a multipart message.

    Only the direct sub-parts of ``msg`` are inspected. If several parts share
    a content type, the last one wins.

    Args:
        msg: A multipart message whose payload is a list of parts.

    Returns:
        A ``(text_part, html_text)`` tuple. ``text_part`` is the
        ``text/plain`` payload as returned by ``get_payload(decode=True)``, or
        ``None`` if there is no such part. ``html_text`` is the ``text/html``
        payload converted with ``html_to_plain_text``, or ``None`` if there is
        no such part.
    """
    text_part = None
    html_part = None
    for part in msg.get_payload():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            text_part = part.get_payload(decode = True)
        elif content_type == 'text/html':
            html_part = part.get_payload(decode = True)
    # A message without an HTML alternative must not reach the converter:
    # passing None to it raises a TypeError.
    html_text = html_to_plain_text(html_part) if html_part is not None else None
    return text_part, html_text

# Register the handler with content manager
content_manager.add_get_handler('multipart/alternative', handle_multipart_alternative)

sample_text, sample_html = content_manager.get_content(sample_html_ham)

# Converting bytes to string
if isinstance(sample_text, bytes):
    sample_text = sample_text.decode()
if isinstance(sample_html, bytes):
    sample_html = sample_html.decode()

# sample_text = sample_text.split('\n')
# sample_html = sample_html.split('\n')

print(sample_text[:1000], '...')
print(sample_html[:1000], '...')

# print(sample_html_ham.get_content())


To view this newsletter in full-color:
http://newsletter.mediaunspun.com/index000021410.cfm

Media Unspun
What the Press is Reporting and Why (www.mediaunspun.com)
-----------------------------------------------------------------
October 8, 2002

-----------------------------------------------------------------
IN THIS ISSUE
-----------------------------------------------------------------
* BUSH COVERS THE WATERFRONT
* THE BIGGEST CABLE HOOKUP

-----------------------------------------------------------------
EDITOR'S NOTE
-----------------------------------------------------------------
Is Media Unspun useful to you? Then pass it on to a colleague.
The more readers we have, the more successful we'll be. The more 
successful we are, the more useful we can be to you. Pass it
on!

Media Unspun serves business news and analysis, authoritatively
and irreverently, every business day. An annual subscription
costs $50, less than a dollar a week. If your four-week free
trial is coming to an e

In [None]:
# Plain Text from the HTML ham sample

print(sample_text[:1000], '...')
print(sample_html[:1000], '...')


To view this newsletter in full-color:
http://newsletter.mediaunspun.com/index000021410.cfm

Media Unspun
What the Press is Reporting and Why (www.mediaunspun.com)
-----------------------------------------------------------------
October 8, 2002

-----------------------------------------------------------------
IN THIS ISSUE
-----------------------------------------------------------------
* BUSH COVERS THE WATERFRONT
* THE BIGGEST CABLE HOOKUP

-----------------------------------------------------------------
EDITOR'S NOTE
-----------------------------------------------------------------
Is Media Unspun useful to you? Then pass it on to a colleague.
The more readers we have, the more successful we'll be. The more 
successful we are, the more useful we can be to you. Pass it
on!

Media Unspun serves business news and analysis, authoritatively
and irreverently, every business day. An annual subscription
costs $50, less than a dollar a week. If your four-week free
trial is coming to an e

## Using email as input and getting plain text as output

In [None]:
def email_to_text(email):
    """Return the textual content of an email, preferring plain text.

    Every part of the message is visited in ``walk()`` order. The first
    ``text/plain`` part is returned immediately. If there is none, the last
    ``text/html`` part seen is converted with ``html_to_plain_text``.

    Args:
        email: A message object providing ``walk()``.

    Returns:
        The text as a string, or ``None`` when the message has no usable
        ``text/plain`` or ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ('text/plain', 'text/html'):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Content could not be decoded (or the part has no get_content);
            # fall back to the raw payload. Catching Exception rather than
            # using a bare except lets KeyboardInterrupt/SystemExit propagate.
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [None]:
print(html_to_plain_text(sample_html_spam.get_content())[:500], '...')


 
Free Personal and Business Grants
 
" Qualify for at least $25,000 in free
grants money - Guaranteed! "
Each day over One Million Dollars in Free
Government
Grants  is given away to people just like you for a wide
variety of Business And Personal Needs
       
Dear Grant Seeker,
In a moment, I'll tell you
exactly HOW & WHERE to get Grants. This MONEY has to
be given away, WHY not to YOU?
You may be thinking, "How
can I get some of this Free Grants Money"
Maybe you think it's impossible
to get ...


# Using NLTK for creating word count sparse matrix

### Installing NLTK and urlextract

In [None]:
%pip install nltk



In [None]:
# Testing the nltk toolkit
try:
    import nltk
    stemmer = nltk.PorterStemmer()
    for word in ('computer', 'compuations', 'compute', 'computer'):
        print(word, '=> ', stemmer.stem(word))
except ImportError:
    print('Nltk not installed')
    # Later cells test `stemmer is not None`, so the name must exist even
    # when NLTK is unavailable.
    stemmer = None

computer =>  comput
compuations =>  compuat
compute =>  comput
computer =>  comput


In [None]:
%pip install -q -U urlextract

In [None]:
# Testing urlextract
try:
    import urlextract
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls('Will this detect github.com or https://github.com/arua23/handson-ml2/blob/master/03_classification.ipynb'))
except ImportError:
    print('urlextract not installed')
    # Later cells test `url_extractor is not None`, so the name must exist
    # even when urlextract is unavailable.
    url_extractor = None

['github.com', 'https://github.com/arua23/handson-ml2/blob/master/03_classification.ipynb']


## Transforming emails into word count arrays

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class email_to_word_counter(BaseEstimator, TransformerMixin):
    """Transformer that turns email messages into per-message word counts.

    Each boolean option switches one preprocessing step of ``transform`` on or
    off. ``strip_headers`` is stored but not consulted by ``transform``.
    """

    def __init__(self, strip_headers = True, lower_case = True, remove_punc = True,
                 replace_urls = True, replace_num = True, stemming = True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punc = remove_punc
        self.replace_urls = replace_urls
        self.replace_num = replace_num
        self.stemming = stemming

    def fit(self, X, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def _count_words(self, message):
        """Build the word ``Counter`` for a single message."""
        body = email_to_text(message) or ''
        if self.lower_case:
            body = body.lower()
        if self.replace_urls and url_extractor is not None:
            # Replace the longest URLs first so that a URL contained in a
            # longer one does not break the longer match.
            found_urls = sorted(set(url_extractor.find_urls(body)), key=len, reverse=True)
            for found in found_urls:
                body = body.replace(found, ' URL ')
        if self.replace_num:
            body = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', body)
        if self.remove_punc:
            body = re.sub(r'\W+', ' ', body, flags=re.M)

        counts = Counter(body.split())
        if not (self.stemming and stemmer is not None):
            return counts

        # Merge the counts of words that share a stem.
        stemmed = Counter()
        for token, occurrences in counts.items():
            stemmed[stemmer.stem(token)] += occurrences
        return stemmed

    def transform(self, X, y = None):
        """Return an array holding one word ``Counter`` per message in ``X``."""
        return np.array([self._count_words(message) for message in X])

In [None]:
# Testing on the first sample email
X_first = X_train[:1]
X_first_word_counts = email_to_word_counter().fit_transform(X_first)

print(X_first_word_counts)

[Counter({'it': 4, 'pay': 3, 't': 3, 'the': 2, 'you': 2, 'without': 2, 's': 2, 'i': 2, 'a': 2, 'can': 2, 'look': 2, 'at': 2, 'if': 1, 'creator': 1, 'didnt': 1, 'say': 1, 'could': 1, 'have': 1, 'theft': 1, 'so': 1, 'simpl': 1, 'hell': 1, 'that': 1, 'even': 1, 'in': 1, 'all': 1, 'major': 1, 'holi': 1, 'book': 1, 'wow': 1, 've': 1, 'got': 1, 'great': 1, 'idea': 1, 'll': 1, 'hire': 1, 'skywrit': 1, 'to': 1, 'write': 1, 'thi': 1, 'then': 1, 'lock': 1, 'up': 1, 'everybodi': 1, 'who': 1, 'and': 1, 'didn': 1, 'fail': 1, 'jesu': 1, 'is': 1, 'on': 1, 'my': 1, 'side': 1, 'url': 1})]


In [None]:
# Testing on the first 5 sample emails
X_few = X_train[:5]
X_few_word_counts = email_to_word_counter().fit_transform(X_few)

print(X_few_word_counts)
print(type(X_few_word_counts))

[Counter({'it': 4, 'pay': 3, 't': 3, 'the': 2, 'you': 2, 'without': 2, 's': 2, 'i': 2, 'a': 2, 'can': 2, 'look': 2, 'at': 2, 'if': 1, 'creator': 1, 'didnt': 1, 'say': 1, 'could': 1, 'have': 1, 'theft': 1, 'so': 1, 'simpl': 1, 'hell': 1, 'that': 1, 'even': 1, 'in': 1, 'all': 1, 'major': 1, 'holi': 1, 'book': 1, 'wow': 1, 've': 1, 'got': 1, 'great': 1, 'idea': 1, 'll': 1, 'hire': 1, 'skywrit': 1, 'to': 1, 'write': 1, 'thi': 1, 'then': 1, 'lock': 1, 'up': 1, 'everybodi': 1, 'who': 1, 'and': 1, 'didn': 1, 'fail': 1, 'jesu': 1, 'is': 1, 'on': 1, 'my': 1, 'side': 1, 'url': 1})
 Counter({'i': 8, 'number': 7, 'to': 5, 'the': 5, 'of': 4, 'com': 3, 'we': 3, 'realli': 3, 'look': 3, 'it': 3, 'that': 3, 'with': 3, 'date': 2, 'welch': 2, 'panasa': 2, 't': 2, 'but': 2, 'is': 2, 'what': 2, 'would': 2, 'time': 2, 'for': 2, 'a': 2, 'have': 2, 'm': 2, 'not': 2, 'will': 2, 'exmh': 2, 'worker': 2, 'tue': 1, 'aug': 1, 'from': 1, 'brent': 1, 'messag': 1, 'id': 1, 'numbervaanumb': 1, 'blackcomb': 1, 'if': 1, 

## Converting Word Counts to Vectors

In [None]:
from scipy.sparse import csr_matrix

class word_count_to_vector(BaseEstimator, TransformerMixin):
    """Transformer that converts word ``Counter`` objects to a sparse matrix.

    ``fit`` builds a vocabulary from the ``vocab_size`` most common words.
    ``transform`` produces one row per input and ``vocab_size + 1`` columns:
    column 0 collects every word outside the vocabulary, and columns
    1..vocab_size hold the vocabulary words in order of frequency.
    """

    def __init__(self, vocab_size = 10):
        self.vocab_size = vocab_size

    def fit(self, X, y = None):
        """Learn the vocabulary from an iterable of word counters."""
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                # Cap each message's contribution so one very repetitive
                # message cannot dominate the ranking.
                total_count[word] += min(count, 10)

        most_common = total_count.most_common()[:self.vocab_size]
        # Index 0 is reserved for out-of-vocabulary words, hence index + 1.
        self.vocablary_ = {word: index + 1 for index, (word, _) in enumerate(most_common)}
        # Correctly spelled alias; `vocablary_` is kept for existing users.
        self.vocabulary_ = self.vocablary_
        return self

    def transform(self, X, y = None):
        """Return the sparse count matrix for the counters in ``X``."""
        rows = []
        cols = []
        data = []

        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                # Record the row *index*. The previous code appended the
                # `rows` list to itself, so no valid matrix could be built.
                rows.append(row)
                cols.append(self.vocablary_.get(word, 0))
                data.append(count)
        # Entries sharing a (row, col) pair - e.g. several unknown words in
        # one message, all mapped to column 0 - are summed by csr_matrix.
        return csr_matrix((data, (rows, cols)), shape = (len(X), self.vocab_size + 1))

In [None]:
vocab_transformer = word_count_to_vector(vocab_size = 10)
X_few_vectors = vocab_transformer.fit_transform(X_few_word_counts)

print(X_few_vectors)

KeyboardInterrupt: 