# Visualizing top 30 terms each period by TF-IDF

In [24]:
import pandas as pd
import altair as alt
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the Humanist Listserv dataset
humanist_vols = pd.read_csv("web_scraped_humanist_listserv_volumes.csv")

# Group the volumes by period
humanist_vols['period'] = pd.cut(humanist_vols['inferred_start_year'], bins=[float('-inf'), 2000, 2010, 2020], labels=['early_internet', 'web_2.0', 'contemporary'])
# Create a vectorizer
vectorizer = TfidfVectorizer(max_df=.8)
# Fit the vectorizer to our documents
transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['volume_text'].apply(' '.join).tolist())
# Now get the top features for each document
transformed_documents_as_array = transformed_documents.toarray()
# Get the periods for each volume
periods = humanist_vols['period'].unique()
# Create an empty list to store our results
tfidf_results = []
# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # Zip together the terms and the scores
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    # Sort the terms by score
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    # Add the date to the dataframe
    one_doc_as_df['period'] = periods[counter]
    # Append the dataframe to our list
    tfidf_results.append(one_doc_as_df)
# Concatenate all the dataframes together
tfidf_df = pd.concat(tfidf_results)
# Sort the dataframe by score
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)
# Get the top thirty terms for each period
top_terms = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)

  transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['volume_text'].apply(' '.join).tolist())
  top_terms = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)


In [25]:
top_terms

Unnamed: 0,term,score,period
0,digitalhumanities,0.916535,contemporary
1,onlinehome,0.217490,contemporary
2,s16382816,0.217490,contemporary
3,dhhumanist,0.121123,contemporary
4,joyent,0.114608,contemporary
...,...,...,...
85,wmccarty,0.030223,web_2.0
86,utf,0.029034,web_2.0
87,spamscore,0.028404,web_2.0
88,listsv440,0.028404,web_2.0


In [26]:
top_terms['period'] = top_terms['period'].astype(str)
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(top_terms).mark_bar().encode(
    y='score',
    x=alt.X('period', sort=['early_internet', 'web_2.0', 'contemporary'], axis=alt.Axis(title='Period')),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(top_terms['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'period'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 30 Terms by TF-IDF Score in Humanist Volumes by Period'
)
chart

# Try using stemming

In [27]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()

def stem_words(row):
    stemmed_words = ''
    for token in row.volume_text.split(' '):
        stemmed_words += porter.stem(token) + ' '
    return stemmed_words
humanist_vols['stemmed_text'] = humanist_vols.apply(stem_words, axis=1)


In [28]:
# Fit the vectorizer to our documents
transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['stemmed_text'].apply(' '.join).tolist())
# Now get the top features for each document
transformed_documents_as_array = transformed_documents.toarray()
# Get the periods for each volume
periods = humanist_vols['period'].unique()
# Create an empty list to store our results
tfidf_results = []
# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # Zip together the terms and the scores
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    # Sort the terms by score
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    # Add the date to the dataframe
    one_doc_as_df['period'] = periods[counter]
    # Append the dataframe to our list
    tfidf_results.append(one_doc_as_df)
# Concatenate all the dataframes together
tfidf_df = pd.concat(tfidf_results)
# Sort the dataframe by score
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)
# Get the top thirty terms for each period
top_terms2 = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)

  transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['stemmed_text'].apply(' '.join).tolist())
  top_terms2 = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)


In [29]:
top_terms2

Unnamed: 0,term,score,period
0,digitalhumanities,0.916881,contemporary
1,s16382816,0.217627,contemporary
2,onlinehome,0.217627,contemporary
3,dhhumanist,0.121200,contemporary
4,joyent,0.114680,contemporary
...,...,...,...
85,infobit,0.028793,web_2.0
86,phishscore,0.028460,web_2.0
87,adultscore,0.028460,web_2.0
88,quarantine_notspam,0.028460,web_2.0


In [30]:
top_terms2['period'] = top_terms2['period'].astype(str)
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(top_terms2).mark_bar().encode(
    y='score',
    x=alt.X('period', sort=['early_internet', 'web_2.0', 'contemporary'], axis=alt.Axis(title='Period')),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(top_terms2['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'period'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 30 Terms by TF-IDF Score in Humanist Volumes by Period'
)
chart

the result does not change that much only the score changes

# Change the max_df

In [31]:
vectorizer = TfidfVectorizer(max_df=.7)
# Fit the vectorizer to our documents
transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['volume_text'].apply(' '.join).tolist())
# Now get the top features for each document
transformed_documents_as_array = transformed_documents.toarray()
# Get the periods for each volume
periods = humanist_vols['period'].unique()
# Create an empty list to store our results
tfidf_results = []
# Loop through each document and get the top terms
for counter, doc in enumerate(transformed_documents_as_array):
    # Zip together the terms and the scores
    tf_idf_tuples = list(zip(vectorizer.get_feature_names_out(), doc))
    # Sort the terms by score
    one_doc_as_df = pd.DataFrame.from_records(tf_idf_tuples, columns=['term', 'score']).sort_values(by='score', ascending=False).reset_index(drop=True)
    # Add the date to the dataframe
    one_doc_as_df['period'] = periods[counter]
    # Append the dataframe to our list
    tfidf_results.append(one_doc_as_df)
# Concatenate all the dataframes together
tfidf_df = pd.concat(tfidf_results)
# Sort the dataframe by score
tfidf_df = tfidf_df.sort_values(by=['score'], ascending=False)
# Get the top thirty terms for each period
top_terms3 = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)

  transformed_documents = vectorizer.fit_transform(humanist_vols.groupby('period')['volume_text'].apply(' '.join).tolist())
  top_terms3 = tfidf_df.groupby('period').apply(lambda x: x.sort_values('score', ascending=False).head(30)).reset_index(drop=True)


In [32]:
top_terms3

Unnamed: 0,term,score,period
0,digitalhumanities,0.916535,contemporary
1,onlinehome,0.217490,contemporary
2,s16382816,0.217490,contemporary
3,dhhumanist,0.121123,contemporary
4,joyent,0.114608,contemporary
...,...,...,...
85,wmccarty,0.030223,web_2.0
86,utf,0.029034,web_2.0
87,spamscore,0.028404,web_2.0
88,listsv440,0.028404,web_2.0


In [33]:
top_terms3['period'] = top_terms3['period'].astype(str)
selection = alt.selection_point(fields=['term'], bind='legend')
chart = alt.Chart(top_terms3).mark_bar().encode(
    y='score',
    x=alt.X('period', sort=['early_internet', 'web_2.0', 'contemporary'], axis=alt.Axis(title='Period')),
    color=alt.Color('term', legend=alt.Legend(title='Term', orient='right', symbolLimit=len(top_terms3['term'].unique()), columns=5), scale=alt.Scale(scheme='tableau20')),
    tooltip=['term', 'score', 'period'],
    opacity=alt.condition(selection, alt.value(1), alt.value(0.2))
).add_params(selection).properties(
    title='Top 30 Terms by TF-IDF Score in Humanist Volumes by Period'
)
chart

In [35]:
# Convert the 'term' columns of the dataframes to sets
top_term1_set = set(top_terms['term'])
top_term2_set = set(top_terms2['term'])
top_term3_set = set(top_terms3['term'])

# Find the observations that only exist in each set
only_in_top_term1 = top_term1_set - (top_term2_set.union(top_term3_set))
only_in_top_term2 = top_term2_set - (top_term1_set.union(top_term3_set))
only_in_top_term3 = top_term3_set - (top_term1_set.union(top_term2_set))

# Print the observations
print("Only in top_term1:", only_in_top_term1)
print("Only in top_term2:", only_in_top_term2)
print("Only in top_term3:", only_in_top_term3)

Only in top_term1: set()
Only in top_term2: {'uribl_block', 'ocp', 'all_trust', 'phishscore', 'e9', 'quarantine_notspam', 'adultscore', 'ecu', 'acadvm1', 'gã', 'dos', 'infobit'}
Only in top_term3: set()


appearently there is some terms that is only unique to top_terms2 ( with stemming) that did not exists in the others