# Term Frequency - Inverse Document Frequency; Text Analytics in Python

In [76]:
import pandas as pd
import numpy as np
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize

In [66]:
# materialize spaCy's English stop word collection as a list; text_process
# filters tokens against this module-level name
stopwords = list(STOP_WORDS)

In [210]:
# first we hand-write a small sample corpus that will be used later in the model;
# each entry is a [sentence, category] pair, with category one of
# 'food', 'car', or 'school'

text = [["I really like chicken flavored ramen noodles.",'food'],
["I don't normally use coupons to buy groceries, even though I know you can save money.",'food'],
["Mediterranean cuisine is my favorite.",'food'],
["I drive a 2014 Ford Focus SE, but my dream car is a Toyota Tacoma.",'car'],
["Dark blue is the color I want my dream car to be.",'car'],
["I go to the University of Denver. I started in 2018 and expect to finish in 2020. I'm going for my masters of science in Data Science",'school'],
["The University of Denver is not like Michigan State University, which is public and has many more students than DU.",'school'],
["Pizza is my favorite food, even if that's pretty lame",'food'],
["The Toyota Tacoma is a truck that a lot of people have in Colorado. It's great because it's a small truck, but handles well in the mountains.",'car'],
["Michigan State Univsersity is a great school",'school']]

In [211]:
# create dataframe: one row per sample, with the sentence in 'text' and its
# label in 'category'

df = pd.DataFrame(text, columns = ['text','category'])

In [214]:
df

Unnamed: 0,text,category
0,I really like chicken flavored ramen noodles.,food
1,"I don't normally use coupons to buy groceries,...",food
2,Mediterranean cuisine is my favorite.,food
3,"I drive a 2014 Ford Focus SE, but my dream car...",car
4,Dark blue is the color I want my dream car to be.,car
5,I go to the University of Denver. I started in...,school
6,The University of Denver is not like Michigan ...,school
7,"Pizza is my favorite food, even if that's pret...",food
8,The Toyota Tacoma is a truck that a lot of peo...,car
9,Michigan State Univsersity is a great school,school


In [215]:
# minor text processing

def text_process(dataframe, column_to_prep, stop_words=None):
    """
    Lowercases text, strips surrounding whitespace, removes stop words, and
    removes punctuation, in that order

    The column is overwritten in place on the dataframe that is passed in,
    and that same dataframe is returned.

    input: dataframe(pandas dataframe)
           column_to_prep(str)
           stop_words(iterable of str, optional): words to drop; defaults to
               the module-level `stopwords` list

    output: dataframe(pandas dataframe)
    """

    # fall back to the notebook-wide stop word list; a set makes each
    # membership test O(1) instead of scanning a list for every token
    banned = set(stopwords if stop_words is None else stop_words)

    def drop_stop_words(value):
        # missing values (NaN/None) pass through untouched instead of crashing
        if not isinstance(value, str):
            return value
        return ' '.join(token for token in value.split() if token not in banned)

    # lowercase all text and remove leading/trailing whitespace
    cleaned = dataframe[column_to_prep].str.lower().str.strip()

    # remove stop words (done before punctuation removal, so tokens are
    # compared with their punctuation still attached, e.g. "don't")
    cleaned = cleaned.apply(drop_stop_words)

    # remove punctuation; regex=True is required because pandas >= 2.0 treats
    # the pattern as a literal string by default, and the raw string avoids
    # the invalid "\w" escape sequence
    dataframe[column_to_prep] = cleaned.str.replace(r'[^\w\s]', '', regex=True)

    return dataframe

In [216]:
# creating column of word count in df

def word_count(dataframe, column_to_count):
    """
    Adds a 'num_words' column holding the number of tokens in each row

    Each entry has its whitespace collapsed to single spaces and is then
    tokenized with nltk's word_tokenize; the token count is stored per row.
    The dataframe is modified in place and also returned.

    input: dataframe(pandas dataframe)
           column_to_count(str)

    output: dataframe(pandas dataframe)
    """
    def tokenize(entry):
        # normalise whitespace before handing the text to the tokenizer
        normalized = ' '.join(entry.split())
        return word_tokenize(normalized)

    tokenized = dataframe[f'{column_to_count}'].apply(tokenize)
    dataframe['num_words'] = list(map(len, tokenized))

    return dataframe

In [217]:
# creating a dictionary with total word counts and inverse frequency

def freq_dict(dataframe, column_to_count):
    """
    Counts how often each word occurs across all rows of a text column

    Rows are split on whitespace; rows that are missing or contain no words
    contribute nothing. Words appear in the result in order of first
    occurrence.

    input: dataframe(pandas dataframe)
           column_to_count(str)

    output: word_dict(dictionary: word -> [count, inverse frequency]), where
            count is the total number of occurrences over all rows and
            inverse frequency is log(number of rows / count)
    """
    from collections import Counter

    totals = Counter()
    for tokens in dataframe[column_to_count].str.split().dropna():
        totals.update(tokens)

    # size of the dataframe that was passed in, not the notebook-level `df`
    n_rows = len(dataframe)

    # NOTE(review): a textbook IDF divides by the number of rows containing
    # the word rather than its total count; the two differ for words repeated
    # within a row -- confirm which is intended before building TF*IDF on this
    return {word: [count, np.log(n_rows / count)] for word, count in totals.items()}

In [243]:
# TODO: calculate TF*IDF for each term in each document, then build the document-term matrix

In [244]:
test

[['like', 'chicken', 'flavored', 'ramen', 'noodles'],
 ['dont',
  'normally',
  'use',
  'coupons',
  'buy',
  'groceries',
  'know',
  'save',
  'money'],
 ['mediterranean', 'cuisine', 'favorite'],
 ['drive', '2014', 'ford', 'focus', 'se', 'dream', 'car', 'toyota', 'tacoma'],
 ['dark', 'blue', 'color', 'want', 'dream', 'car', 'be'],
 ['university',
  'denver',
  'started',
  '2018',
  'expect',
  'finish',
  '2020',
  'im',
  'going',
  'masters',
  'science',
  'data',
  'science'],
 ['university',
  'denver',
  'like',
  'michigan',
  'state',
  'university',
  'public',
  'students',
  'du'],
 ['pizza', 'favorite', 'food', 'thats', 'pretty', 'lame'],
 ['toyota',
  'tacoma',
  'truck',
  'lot',
  'people',
  'colorado',
  'its',
  'great',
  'its',
  'small',
  'truck',
  'handles',
  'mountains'],
 ['michigan', 'state', 'univsersity', 'great', 'school']]

In [191]:
freq_dict(df, 'text')

{'like': 2,
 'chicken': 1,
 'flavored': 1,
 'ramen': 1,
 'noodles': 1,
 'dont': 1,
 'normally': 1,
 'use': 1,
 'coupons': 1,
 'buy': 1,
 'groceries': 1,
 'know': 1,
 'save': 1,
 'money': 1,
 'mediterranean': 1,
 'cuisine': 1,
 'favorite': 2,
 'drive': 1,
 '2014': 1,
 'ford': 1,
 'focus': 1,
 'se': 1,
 'dream': 2,
 'car': 2,
 'toyota': 2,
 'tacoma': 2,
 'dark': 1,
 'blue': 1,
 'color': 1,
 'want': 1,
 'be': 1,
 'university': 3,
 'denver': 2,
 'started': 1,
 '2018': 1,
 'expect': 1,
 'finish': 1,
 '2020': 1,
 'im': 1,
 'going': 1,
 'masters': 1,
 'science': 2,
 'data': 1,
 'michigan': 2,
 'state': 2,
 'public': 1,
 'students': 1,
 'du': 1,
 'pizza': 1,
 'food': 1,
 'thats': 1,
 'pretty': 1,
 'lame': 1,
 'truck': 2,
 'lot': 1,
 'people': 1,
 'colorado': 1,
 'its': 2,
 'great': 2,
 'small': 1,
 'handles': 1,
 'mountains': 1,
 'univsersity': 1,
 'school': 1}

In [91]:
# tokenize every row after collapsing runs of whitespace to single spaces
df['text'].apply(lambda sentence: word_tokenize(' '.join(sentence.split())))

0            [like, chicken, flavored, ramen, noodles]
1    [dont, normally, use, coupons, buy, groceries,...
2                   [mediterranean, cuisine, favorite]
3    [drive, 2014, ford, focus, se, dream, car, toy...
4            [dark, blue, color, want, dream, car, be]
5    [university, denver, started, 2018, expect, fi...
6    [university, denver, like, michigan, state, un...
7         [pizza, favorite, food, thats, pretty, lame]
8    [toyota, tacoma, truck, lot, people, colorado,...
9        [michigan, state, univsersity, great, school]
Name: text, dtype: object

In [89]:
# tokenize the text of the second row; .iloc[1, 0] selects by position on both
# axes, avoiding the deprecated integer-key lookup on a label-indexed row Series
word_tokenize(df.iloc[1, 0])

['dont',
 'normally',
 'use',
 'coupons',
 'buy',
 'groceries',
 'know',
 'save',
 'money']