# Term Frequency - Inverse Document Frequency; Text Analytics in Python

In [52]:
import pandas as pd
import numpy as np
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
stopwords = list(STOP_WORDS)

In [81]:
# first we generate random text that will be used later in the model

text = [
    ["I really like chicken flavored ramen noodles.",'food'],
    ["I don't normally use coupons to buy groceries, even though I know you can save money.",'food'],
    ["Mediterranean cuisine is my favorite.",'food'],
    ["I drive a 2014 Ford Focus SE, but my dream car is a Toyota Tacoma.",'car'],
    ["Dark blue is the color I want my dream car to be.",'car'],
    ["I go to the University of Denver. I started in 2018 and expect to finish in 2020. I'm going for my masters of science in Data Science",'school'],
    ["The University of Denver is not like Michigan State University, which is public and has many more students than DU.",'school'],
    ["Pizza is my favorite food, even if that's pretty lame",'food'],
    ["The Toyota Tacoma is a truck that a lot of people have in Colorado. It's great because it's a small truck, but handles well in the mountains.",'car'],
    ["Michigan State University is a great school",'school'],
    ["University of Michigan is Michigan State University's rival. A lot of people go to U of M, but clearly Michigan State University is better.",'school'],
    ["There are a lot of places to eat in downtown Denver. One of my favorites is La Loma.",'food'],
    ["Driving down I-25 can be a pain. Often, it is busy and there is a lot of traffic. They are working on the roads now, which causes further congestion.",'car'],
    ["Sometimes I shop at King Soopers, and sometimes I shop at Safeway. I haven't decided which is my favorite grocery store, yet.",'food'],
    ["There are many companies that make cars. You have Ford, Toyota, Honda, Hundai, Dodge, Chrysler, Chevrolet, and many more.",'car'],
    ["I'm excited for the advent of autonomous driving. I think it will improve traffic congestion, as well as road safety. Plus, I will be able to nap while 'driving'",'car'],
    ["My friend drives a Jeep. I don't think I'd ever buy one, as I've heard many negative stories about them, but to each his or her own.",'car'],
    ["Intermittent Fasting is the idea of eating during a specified shortened window during the day. The goal is help your digestion, and give your body a rest by not eating every minute of every day.",'food'],
    ["On the Fourth of July, in America, many people head outside to a park or backyard and grill hotdogs, burgers, and bratwursts. It's a lot of fun",'food'],
    ["There are many different types of cuisines in America. You can find alost anything.",'food'],
    ["I haven't found a pizza place in Denver that I really like, yet",'food'],
    ["Once I finish my masters, I plan to look for a full time job somewhere in Denver.",'school'],
    ["I enjoy being a student. The atmoshphere is great, and I get a lot of student discounts!",'school'],
    ["I'm building this model as a demonstration of my knowledge and skills.",'school'],
    ["There are many things I still hope to learn, and I'm eager for my classes this quarter.",'school'],
    ["My favorite class so far has been Python programming, but I can tell I'm going to enjoy Data Mining.",'school'],
    ["Probability and Statistics was a tough class, but I'm grateful for it. The class taught me a great deal, and really set the foundation for future learning.",'school'],
    ["Now that my resume is becoming more robust with Data Science experience, I'm starting to get offers from various companies.",'school']
]

In [82]:
# create dataframe

df = pd.DataFrame(text, columns = ['text','category'])

In [56]:
# minor text processing

def text_process(dataframe, column_to_prep):
    """
    Removes whitespace, special characters, and stop words
    
    input: dataframe(pandas dataframe)
           column_to_prep(str)
           
    output: dataframe(pandas dataframe)
    """
    
    # lowercase all text
    dataframe[f'{column_to_prep}'] = dataframe[f'{column_to_prep}'].str.lower()
    
    # remove whitespace
    dataframe[f'{column_to_prep}'] = dataframe[f'{column_to_prep}'].str.strip()
    
    # remove stop words
    dataframe[f'{column_to_prep}'] = (dataframe[f'{column_to_prep}']
                                      .apply(lambda x: ' '.join([i for i in x.split() if i not in stopwords])))
    
    # remove punctuation
    dataframe[f'{column_to_prep}'] = dataframe[f'{column_to_prep}'].str.replace('[^\w\s]','')
    
    return(dataframe)

In [57]:
# creating column of word count in df

def word_count(dataframe, column_to_count):
    """
    Counts the frequency of words in a document
    
    input: dataframe(pandas dataframe)
           column_to_prep(str)
           
    output: frequency(int)
    """
    words = dataframe[f'{column_to_count}'].apply(lambda x: word_tokenize(' '.join([i for i in x.split()])))
    dataframe['num_words'] = [len(i) for i in words]
    
    return(dataframe)

In [58]:
# creating a dictionary with total word counts and inverse frequency

def freq_dict(dataframe, column_to_count):
    """
    Counts the frequency of words in all text
    
    input: dataframe(pandas dataframe)
           column_to_prep(str)
           
    output: word_dict(dictionary: [count, inverse frequency])
    """
    
    word_dict = {}
#     dataframe = text_process(dataframe, f'{column_to_count}')

    for i in dataframe[f'{column_to_count}'].str.split().apply(pd.Series).stack():
        if i in word_dict:
            word_dict[i] += 1
        else:
            word_dict[i] = 1

    for k,v in word_dict.items():
        word_dict[k] = [v, np.log(len(df)/v)]

    return(word_dict)

In [83]:
# prepping our data using the functions we built

df = word_count(text_process(df, 'text'),'text')

In [60]:
# creating the word count frequency matrix

doc_li = [i for i in df.text.str.split()]
col_li = sorted(list(freq_dict(df,'text').keys()))
freq_li = [[0 for i in range(len(col_li))] for j in range(len(doc_li))]
                
for doc_idx, doc in enumerate(doc_li):
    for word in doc:
        for idx, col in enumerate(col_li):
            if word == col:
                freq_li[doc_idx][idx] += 1

In [61]:
# joining word count and setting index to document text data

word_freq_mat = pd.DataFrame(np.matrix(freq_li), columns=col_li).join(df[['text','num_words']]).set_index('text')

In [62]:
# resetting values to term frequency

word_freq_mat = word_freq_mat.iloc[:,0:-1].div(word_freq_mat.num_words, axis=0)

In [63]:
# adding inverse document frequency

word_freq_mat.loc[len(word_freq_mat)] = [i[1] for i in list(freq_dict(df, 'text').values())]

In [64]:
# multiplying tf by idf to get tfidf

word_freq_mat = word_freq_mat.iloc[:-1,:].mul(word_freq_mat.iloc[-1,:].values, axis=1)

There you have it! We now have a Term Frequency-Inverse Document Frequency matrix. With these newly dervied numerical variables we can fit a machine learning model (random forest, in this case) and use it to predict the classifications of future documents based on their word composition. 

<b>Note</b>: we are only using 10 documents to train this model, and therefore our corpus is small and our training will be light. I don't expect a high accuracy, but this can of course be improved with more data.

In [65]:
X = np.array(word_freq_mat)
y = df.category

In [66]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [67]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [68]:
y_pred = classifier.predict(X_test)

In [69]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[0 0 1]
 [0 1 2]
 [0 0 2]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         car       0.00      0.00      0.00         1
        food       1.00      0.33      0.50         3
      school       0.40      1.00      0.57         2

    accuracy                           0.50         6
   macro avg       0.47      0.44      0.36         6
weighted avg       0.63      0.50      0.44         6

0.5


Most of the above was for demonstration and edification purposes. Fortunately, a lot of the prep work has been done for us in sklearn's TfidfVectorizer() function. We set the min_df parameter to 2 to specify that a word included in the corpus must be present in at least 2 of the documents. This helps prevent irrelevent words from being considered.

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfconverter = TfidfVectorizer(max_features=1500, min_df=2, max_df=0.7)
X = tfidfconverter.fit_transform(df.text)
print(tfidfconverter.get_feature_names())
X = X.toarray()

['america', 'buy', 'car', 'class', 'companies', 'congestion', 'data', 'denver', 'dont', 'dream', 'driving', 'enjoy', 'favorite', 'finish', 'ford', 'going', 'great', 'havent', 'im', 'its', 'like', 'lot', 'masters', 'michigan', 'people', 'pizza', 'science', 'state', 'tacoma', 'think', 'toyota', 'traffic', 'university']


Once the matrix is built, we can run it through the random forest.

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [97]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [98]:
y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[0 1 0]
 [0 3 0]
 [0 1 1]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         car       0.00      0.00      0.00         1
        food       0.60      1.00      0.75         3
      school       1.00      0.50      0.67         2

    accuracy                           0.67         6
   macro avg       0.53      0.50      0.47         6
weighted avg       0.63      0.67      0.60         6

0.6666666666666666


Not great predictive power, but of course we expected this. We simply need more data to tune the model.