In [1]:
# import block

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob

# Collect the per-company ingredient text files. glob returns paths in an
# arbitrary, platform-dependent order, so sort them to keep the row order of
# the TF-IDF matrix stable between runs.
text_files = sorted(glob.glob("ingredients" + "/*.txt"))
if not text_files:
    # Fail early with an actionable message rather than an opaque error from
    # the vectorizer when there is nothing to read.
    raise FileNotFoundError(
        'No .txt files found in "ingredients/"; run write_info("ingredients") first.'
    )
file_names = [Path(text).stem for text in text_files]

# does the TF-IDF counting (the vectorizer opens and reads each file itself)
tfidf_vectorizer = TfidfVectorizer(input='filename', stop_words='english')
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)

# Wide table: one row per document, one column per term.
tfidf_wide = pd.DataFrame(
    tfidf_vector.toarray(),
    index=file_names,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

# Long table: one row per (document, term) pair, which is easier to sort,
# filter and plot. stack().reset_index() names the columns level_0, level_1
# and 0, so give them meaningful names.
tfidf_df = (
    tfidf_wide
    .stack()
    .reset_index()
    .rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})
)

# The ten highest-scoring terms of every document, kept in a variable so that
# it can be queried and plotted.
top_tfidf = (
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(10)
)

# heatmap
import altair as alt
import numpy as np

# Add a tiny amount of jitter so that terms with identical scores still get
# distinct ranks. A seeded generator keeps the tie-breaking, and therefore the
# chart, identical from one run to the next.
rng = np.random.default_rng(0)
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = top_tfidf_plusRand['tfidf'] + rng.random(top_tfidf.shape[0])*0.0001

# base for all layers: rank each term within its document, best score first
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap layer: one rectangle per (document, rank), coloured by score
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# text layer: the term itself, drawn in white on the darker heatmap cells
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the two superimposed layers
(heatmap + text).properties(width = 600)



In [16]:
import re
import os
import glob

def write_info(filepath, source_dir="company_ingredient_data"):
    """Write one text file of ingredients per company CSV.

    Every ``*.csv`` file found in ``source_dir`` is read as an ``@``-delimited
    table. The non-missing values of its ``ingredients`` column are written,
    one per line, to ``<filepath>/<csv name without extension>.txt`` (UTF-8).

    Parameters
    ----------
    filepath : str
        Directory that receives the ``.txt`` files. It is created when it
        does not exist yet.
    source_dir : str, optional
        Directory holding the company CSV files.

    Raises
    ------
    KeyError
        If a CSV file has no ``ingredients`` column.
    """
    os.makedirs(filepath, exist_ok=True)

    # Sorted so the files are processed in a predictable order.
    for file_name in sorted(os.listdir(source_dir)):
        stem, extension = os.path.splitext(file_name)
        # Skip anything that is not a CSV file (e.g. the .ipynb_checkpoints
        # directory Jupyter creates), which read_csv could not parse.
        if extension.lower() != ".csv":
            continue

        df = pd.read_csv(os.path.join(source_dir, file_name), delimiter='@')
        # Drop missing cells so the literal text "nan" never reaches the output.
        ingredients = df["ingredients"].dropna().astype(str)

        # One row per line: without a separator the last word of a row would
        # fuse with the first word of the next one.
        with open(os.path.join(filepath, stem + ".txt"), "w", encoding="utf-8") as w:
            w.writelines(entry + "\n" for entry in ingredients)

# Generate the per-company text files under "ingredients/"; the TF-IDF cell
# reads its input from that directory, so this has to run before it.
write_info("ingredients")