<a href="https://colab.research.google.com/github/Yadav-Roshan/NLP/blob/main/02_CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

In [None]:
# Downloads the complete NLTK data collection.
# NOTE(review): the visible cells only call nltk.sent_tokenize, so a much
# smaller download is probably sufficient — confirm before narrowing it.
nltk.download('all')

In [None]:
!pip install scipy

In [None]:
# Sample corpus: two paragraphs of prose about Abraham Lincoln, used as the
# input for the sentence-splitting, cleaning and vectorising cells.
text = '''Abraham Lincoln was an American lawyer, politician, and statesman who served as the 16th president of the United States from 1861 until his assassination in 1865. Lincoln led the Union through the American Civil War to defend the nation as a constitutional union and succeeded in abolishing slavery, bolstering the federal government, and modernizing the U.S. economy.
Lincoln was born into poverty in a log cabin in Kentucky and was raised on the frontier, primarily in Indiana. He was self-educated and became a lawyer, Whig Party leader, Illinois state legislator, and U.S. congressman from Illinois. In 1849, he returned to his successful law practice in Springfield, Illinois. In 1854, he was angered by the Kansas–Nebraska Act, which opened the territories to slavery, and he re-entered politics. He soon became a leader of the new Republican Party. He reached a national audience in the 1858 Senate campaign debates against Stephen A. Douglas. Lincoln ran for president in 1860, sweeping the North to gain victory. Pro-slavery elements in the South viewed his election as a threat to slavery, and Southern states began seceding from the nation. During this time, the newly formed Confederate States of America began seizing federal military bases in the south. Just over one month after Lincoln assumed the presidency, the Confederate States attacked Fort Sumter, a U.S. fort in South Carolina. Following the bombardment, Lincoln mobilized forces to suppress the rebellion and restore the union.'''

In [None]:
# Split the raw corpus into sentences with NLTK's sentence tokenizer;
# the bare name on the last line displays the result.
txt = nltk.sent_tokenize(text)
txt

In [None]:
# NOTE(review): no visible cell imports spaCy or loads en_core_web_lg —
# confirm this model download is still needed before keeping the cell.
!python -m spacy download en_core_web_lg

In [None]:
import re
import string
def clean_text(txt):
  """Remove ASCII punctuation from every sentence.

  Args:
    txt: Iterable of sentence strings.

  Returns:
    A list with one string per input sentence, in the same order, where every
    character found in `string.punctuation` has been dropped. Characters that
    are not in that set (for example the en dash) are kept.
  """
  punctuation = string.punctuation
  return [
      ''.join(char for char in sentence if char not in punctuation)
      for sentence in txt
  ]

In [None]:
# Strip punctuation from each tokenized sentence; the bare name on the last
# line displays the cleaned list.
clean_txt = clean_text(txt)
clean_txt

In [None]:
def create_vocab(text):
  """Map every distinct word in the corpus to a column index.

  Sentences are split on single spaces. Words of two characters or fewer are
  ignored; the remaining words are lower-cased and de-duplicated.

  Args:
    text: Iterable of sentence strings.

  Returns:
    A dict of word -> index, where indices follow the sorted order of the
    lower-cased words (0 for the first word, 1 for the next, and so on).
  """
  tokens = {
      token.lower()
      for sentence in text
      for token in sentence.split(' ')
      if len(token) > 2
  }

  return {token: position for position, token in enumerate(sorted(tokens))}

In [None]:
# Build the word -> column-index mapping from the cleaned sentences; the bare
# name on the last line displays it.
vocab = create_vocab(clean_txt)
vocab

{'16th': 0,
 '1849': 1,
 '1854': 2,
 '1858': 3,
 '1860': 4,
 '1861': 5,
 '1865': 6,
 'abolishing': 7,
 'abraham': 8,
 'act': 9,
 'after': 10,
 'against': 11,
 'america': 12,
 'american': 13,
 'and': 14,
 'angered': 15,
 'assassination': 16,
 'assumed': 17,
 'attacked': 18,
 'audience': 19,
 'bases': 20,
 'became': 21,
 'began': 22,
 'bolstering': 23,
 'bombardment': 24,
 'born': 25,
 'cabin': 26,
 'campaign': 27,
 'carolina': 28,
 'civil': 29,
 'confederate': 30,
 'congressman': 31,
 'constitutional': 32,
 'debates': 33,
 'defend': 34,
 'douglas': 35,
 'during': 36,
 'economy': 37,
 'election': 38,
 'elements': 39,
 'federal': 40,
 'following': 41,
 'for': 42,
 'forces': 43,
 'formed': 44,
 'fort': 45,
 'from': 46,
 'frontier': 47,
 'gain': 48,
 'government': 49,
 'his': 50,
 'illinois': 51,
 'indiana': 52,
 'into': 53,
 'just': 54,
 'kansas–nebraska': 55,
 'kentucky': 56,
 'law': 57,
 'lawyer': 58,
 'leader': 59,
 'led': 60,
 'legislator': 61,
 'lincoln': 62,
 'log': 63,
 'military': 

In [None]:
from collections import Counter
from scipy.sparse import csr_matrix

In [None]:
def transform(data, vocab=None):
  """Build a sparse sentence-by-word count matrix.

  Each sentence is lower-cased and split on single spaces. Tokens of two
  characters or fewer, and tokens that are not in the vocabulary, are skipped.

  Args:
    data: Sequence of sentence strings; each sentence becomes one row.
    vocab: Optional dict of word -> column index, with indices covering
      0 .. len(vocab) - 1. When omitted, the vocabulary is built from `data`
      with `create_vocab`, which is the original behaviour. Passing one lets
      a previously built vocabulary be applied to new sentences.

  Returns:
    A scipy.sparse.csr_matrix of shape (len(data), len(vocab)) whose entry
    (i, j) is the number of times the word with index j occurs in sentence i.
  """
  if vocab is None:
    vocab = create_vocab(data)
  row, col, val = [], [], []

  for index, sentence in enumerate(data):
    word_count = Counter(sentence.lower().split(' '))

    for word, count in word_count.items():
      if len(word) > 2:
        col_index = vocab.get(word)
        # `dict.get` yields None for a word missing from the vocabulary;
        # comparing None with `>= 0` raises TypeError on Python 3, so the
        # membership test has to be an explicit None check.
        if col_index is not None:
          row.append(index)
          col.append(col_index)
          val.append(count)

  return csr_matrix((val, (row, col)), shape=(len(data), len(vocab)))

In [None]:
# Sparse count matrix for the cleaned sentences: one row per sentence.
matrix = transform(clean_txt)

In [None]:
# Convert the sparse matrix to a dense array so the counts can be printed.
print(matrix.toarray())

[[1 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
import pandas as pd
# Tabular view of the counts: one row per sentence, columns labelled with the
# vocabulary words in the order they appear in `vocab`.
df = pd.DataFrame(data=matrix.toarray(), columns=list(vocab))
df

Unnamed: 0,16th,1849,1854,1858,1860,1861,1865,abolishing,abraham,act,...,union,united,until,victory,viewed,war,was,which,whig,who
0,1,0,0,0,0,1,1,0,1,0,...,0,1,1,0,0,0,1,0,0,1
1,0,0,0,0,0,0,0,1,0,0,...,2,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Number of columns in the count table, i.e. the vocabulary size.
df.shape[1]

128

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
# Fit on the cleaned sentence list `clean_txt`. The original passed
# `clean_text`, which is the cleaning *function*, not the data, and cannot be
# iterated as a corpus.
mat = vec.fit_transform(clean_txt)

print(mat.toarray())

[[1 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
