# Using TF-IDF with Scikit-Learn
* This notebook is adapted mainly from Melanie Walsh's wonderful Jupyter Book, *Introduction to Cultural Analytics & Python*.
  * https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html

## Scikit-learn

"Scikit-learn is an open source machine learning library that supports supervised and unsupervised learning. It also provides various tools for model fitting, data preprocessing, model selection, model evaluation, and many other utilities." -- [scikit-learn webpage](https://scikit-learn.org/stable/)
* scikit-learn and [Working with Text Data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

In [None]:
import glob
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Relative path of the folder that is searched for .txt documents below
directory_path = "US_Inaugural_Addresses/"

In [None]:
# Collect every .txt file in the corpus folder.
# * Path(...) / "*.txt" joins the pieces with a single separator; directory_path
#   already ends in "/", so plain string formatting would produce "//".
# * sorted() makes the document order deterministic; glob returns matches in
#   arbitrary, OS-dependent order.
text_files = sorted(glob.glob(str(Path(directory_path) / "*.txt")))

# Fail early with a clear message rather than with an error further down the notebook
assert text_files, f"No .txt files found in {directory_path!r}"

In [None]:
# Show the list of file paths that were found
text_files

In [None]:
# Use each file's name, without directory or extension, as the document title
text_titles = [Path(file_path).stem for file_path in text_files]

In [None]:
# Show the document titles derived from the file names
text_titles

## The TF-IDF step

We use scikit-learn's [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) to "convert a collection of raw documents to a matrix of TF-IDF features."

In [None]:
# Configure the vectorizer:
#   input='filename'      -> it is given file paths and reads the files itself
#   stop_words='english'  -> scikit-learn's built-in English stop-word list is removed
tfidf_vectorizer = TfidfVectorizer(
    input='filename',
    stop_words='english',
)

In [None]:
# Learn the vocabulary and IDF weights from the files and build the
# document-term matrix of tf-idf scores (one row per file in text_files)
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

In [None]:
# One row per document, one column per vocabulary term, tf-idf score in each cell.
# get_feature_names_out() is used because get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2.
tfidf_df = pd.DataFrame(
    tfidf_vector.toarray(),
    index=text_titles,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Extra summary row: the number of documents in which each term has a non-zero score.
# The "00_" prefix is part of the row label used by the cells below.
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [None]:
# Compare a handful of hand-picked terms across all documents
terms_of_interest = [
    'government', 'borders', 'people', 'obama', 'war',
    'honor', 'foreign', 'men', 'women', 'children',
]
tfidf_slice = tfidf_df[terms_of_interest]

# Sort rows by label and show scores rounded to two decimals
tfidf_slice.sort_index().round(2)

In [None]:
# Remove the summary row again (if it is present) so only real documents remain
tfidf_df = tfidf_df.drop(index='00_Document Frequency', errors='ignore')

In [None]:
# Preview: reshape the wide table to long form, one row per (document, term) pair
tfidf_df.stack().reset_index()

In [None]:
# Keep the long-form table from here on.
# NOTE: this overwrites the wide tfidf_df, so re-running only this cell would
# stack the already-reshaped table again. Re-run from the cell that builds
# tfidf_df instead.
tfidf_df = tfidf_df.stack().reset_index()

In [None]:
# Give the columns produced by stack().reset_index() meaningful names:
# the two former index levels become 'document' and 'term', and the value
# column (labelled 0) becomes 'tfidf'. Stacking a two-dimensional table yields
# only these three columns, so no further mappings are needed.
tfidf_df = tfidf_df.rename(columns={
    'level_0': 'document',
    'level_1': 'term',
    0: 'tfidf',
})

In [None]:
# Preview: the ten highest-scoring terms for each document
(
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

In [None]:
# Order by document (A-Z) and, within each document, by score (highest first) ...
ranked_terms = tfidf_df.sort_values(by=['document', 'tfidf'], ascending=[True, False])

# ... then keep the first ten rows of every document
top_tfidf = ranked_terms.groupby(['document']).head(10)

In [None]:
# Top-10 rows whose term contains "women" (case-sensitive pattern match)
top_tfidf.loc[top_tfidf['term'].str.contains('women')]

In [None]:
# Top-10 rows for documents whose title contains "obama" (case-sensitive pattern match)
top_tfidf.loc[top_tfidf['document'].str.contains('obama')]

In [None]:
# Top-10 rows for documents whose title contains "trump" (case-sensitive pattern match)
top_tfidf.loc[top_tfidf['document'].str.contains('trump')]

In [None]:
# Top-10 rows for documents whose title contains "lincoln" (case-sensitive pattern match)
top_tfidf.loc[top_tfidf['document'].str.contains('lincoln')]

In [None]:
# Don't worry about this code
# This is to make a nice graphic of our results
# (altair and numpy are imported as `alt` and `np` in the import cell at the top)

# Terms in this list will get a red dot in the visualization
term_list = ['war', 'peace']

# Seed for the tie-breaking jitter, fixed so the chart is identical on every run
JITTER_SEED = 42
# Upper bound of the jitter added to each score
JITTER_SCALE = 0.0001
# Cells with a score at or above this value get white instead of black labels
WHITE_TEXT_THRESHOLD = 0.23

# adding a little (reproducible) randomness to break ties in term ranking
jitter = np.random.default_rng(JITTER_SEED).random(top_tfidf.shape[0]) * JITTER_SCALE
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + jitter

# base for all visualizations, with rank calculation:
# within each document, terms are ranked by descending tf-idf score
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# red circle over terms in above list; every other term gets a fully
# transparent circle ('#FFFFFF00' has an alpha of 00)
circle = base.mark_circle(size=100).encode(
    color = alt.condition(
        alt.FieldOneOfPredicate(field='term', oneOf=term_list),
        alt.value('red'),
        alt.value('#FFFFFF00')
    )
)

# text labels, white for darker heatmap colors
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(
        alt.datum.tfidf >= WHITE_TEXT_THRESHOLD,
        alt.value('white'),
        alt.value('black')
    )
)

# display the three superimposed visualizations
(heatmap + circle + text).properties(width = 600)