In [1]:
import pandas as pd
import numpy as np

# Load the items dataset; only its free-text 'description' column is used below.
df_items = pd.read_csv('items-Copy1.csv')

# Build one document per row. Empty CSV cells are read by pandas as NaN (a
# float), which CountVectorizer rejects, so replace them with '' and coerce
# everything to str. Filling (rather than dropping) keeps `documents` aligned
# row-for-row with `df_items`.
documents = df_items['description'].fillna('').astype(str).tolist()

# Preview a handful of documents instead of printing the whole corpus.
documents[:5]



In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# With its default settings CountVectorizer lowercases the text and keeps
# tokens of two or more word characters, so punctuation acts as a delimiter.
# It does NOT remove stop words by default (stop_words=None); pass
# stop_words='english' if common English words should be dropped.
count_vector = CountVectorizer()

count_vector.fit(documents)  # learn the vocabulary from the documents

# Vocabulary learned from the documents. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its
# replacement and returns an ndarray, hence .tolist() to show a plain list.
count_vector.get_feature_names_out().tolist()


['00',
 '000',
 '01',
 '026',
 '04',
 '06',
 '07',
 '09',
 '092017',
 '093',
 '0r',
 '10',
 '100',
 '100k',
 '101',
 '101st',
 '102017',
 '1029022833877757274',
 '105',
 '106',
 '107th',
 '108th',
 '1090657914303896',
 '10th',
 '11',
 '110',
 '11200',
 '112017',
 '11th',
 '12',
 '120',
 '128',
 '128371547505950',
 '12918599586',
 '12pm',
 '12th',
 '13',
 '130',
 '1300',
 '134000',
 '135',
 '14',
 '14572967_1772905896322',
 '147549059117757',
 '15',
 '1523',
 '15th',
 '16',
 '162',
 '1685853058396977274',
 '16th',
 '17',
 '170',
 '172017',
 '1723444621280652',
 '1723447',
 '1723447757947005',
 '17305',
 '175',
 '175000',
 '1791',
 '17th',
 '18',
 '1814',
 '182017',
 '1833',
 '1838',
 '1839993966238044274',
 '1840',
 '1844',
 '1853',
 '1856',
 '1860s',
 '1864',
 '1876',
 '18_0',
 '18th',
 '19',
 '190',
 '1900',
 '1910',
 '1914',
 '1920',
 '192017',
 '193',
 '1931',
 '1938',
 '1945',
 '1947',
 '1948',
 '1952',
 '1960',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972'

In [3]:
# Count how often each vocabulary word occurs in each document. transform()
# yields a sparse matrix; it is converted to a dense array for later use.
term_counts_sparse = count_vector.transform(documents)
doc_array = term_counts_sparse.toarray()

doc_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [4]:
# Build a document-term frequency table: one row per document, one column per
# vocabulary word. get_feature_names() was removed in scikit-learn 1.2, so the
# column labels come from its replacement, get_feature_names_out().
frequency_matrix = pd.DataFrame(doc_array, columns=count_vector.get_feature_names_out())

# Only the last expression of a cell is rendered automatically, so the 'black'
# column is displayed explicitly to show that word's count in each document.
display(frequency_matrix['black'])

# NOTE: to_csv() called without a path returns the CSV text instead of writing
# a file; pass a filename to it if the table should be saved to disk.

frequency_matrix

  


'1\n0\n1\n3\n1\n2\n2\n0\n0\n0\n0\n1\n0\n0\n1\n0\n0\n1\n1\n0\n0\n1\n0\n1\n0\n0\n0\n1\n0\n0\n0\n0\n0\n0\n0\n0\n0\n1\n0\n1\n2\n1\n1\n0\n0\n0\n0\n3\n0\n0\n0\n1\n0\n1\n0\n0\n0\n1\n2\n0\n2\n0\n0\n0\n3\n1\n1\n0\n0\n1\n0\n1\n0\n0\n0\n1\n2\n0\n2\n1\n0\n0\n0\n1\n3\n1\n1\n3\n0\n0\n0\n1\n0\n0\n0\n0\n0\n1\n0\n0\n1\n0\n1\n0\n1\n1\n3\n1\n1\n0\n0\n0\n1\n1\n3\n0\n0\n0\n0\n0\n3\n1\n1\n2\n2\n0\n1\n1\n2\n0\n0\n0\n0\n1\n0\n1\n0\n0\n0\n2\n0\n0\n2\n0\n1\n0\n0\n0\n0\n1\n0\n3\n1\n2\n0\n1\n0\n1\n1\n3\n2\n0\n1\n0\n1\n1\n1\n0\n1\n3\n1\n2\n2\n0\n0\n0\n0\n0\n0\n0\n0\n3\n0\n1\n0\n0\n0\n0\n0\n0\n0\n1\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n2\n1\n0\n0\n0\n0\n0\n0\n0\n0\n1\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n1\n1\n0\n0\n0\n0\n1\n0\n0\n0\n0\n1\n0\n0\n1\n1\n0\n0\n0\n0\n0\n0\n0\n0\n0\n1\n0\n0\n1\n0\n0\n0\n0\n0\n1\n1\n1\n1\n1\n0\n0\n0\n0\n0\n0\n0\n1\n0\n0\n0\n0\n1\n0\n0\n0\n1\n1\n0\n1\n0\n0\n1\n0\n0\n0\n0\n3\n0\n0\n0\n0\n0\n0\n1\n0\n1\n1\n0\n0\n1\n0\n0\n0\n0\n0\n0\n