In [12]:
tv7 = "dorozeaCheer dorozeaSpymain dorozeaDoroyeti dorozeaHELLO dorozeaDoroMAD dorozeaDoroHuge dorozeaLOVE dorozeaHey dorozeaDoroScared dorozeaWar dorozeaHUH dorozeaChad dorozeaSlam dorozeaDoropog dorozeaDoroMock dorozeaTouchingyou"
word_list = tv7.split(' ')
tv72 = "dorozeaAAAA dorozeaPog dorozeaDoroEvil dorozeaOmg dorozeaSignOff dorozeaDoroPogShake dorozeaAlert dorozeaTweak dorozeaGG dorozeaSCHIZO dorozeaUH dorozeaUWU dorozeaStare"
tv7_list2 = tv72.split(' ')

merged_list = list(set(word_list + tv7_list2))

merged_list = list(filter(None, merged_list))

In [13]:
import pandas as pd
import re
from collections import Counter

In [14]:
# Read the list of filenames from the configuration file
with open('file_list.txt', 'r', encoding='utf-8') as config_file:
    file_names = config_file.read().splitlines()

# Regex pattern to match the data format
pattern = r'\[(.*?)\] (.*?): (.*)'


# Initialize an empty list to store parsed data
datalist = []
stream_count = 0
# Iterate over each specified file
for file in file_names:
    full_path = "data\\"+file
    with open(full_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            match = re.match(pattern, line)
            if match:
                date, user, message = match.groups()
                datalist.append([date, user, message,stream_count])
    stream_count = stream_count + 1

# Create a DataFrame from the parsed data
data = pd.DataFrame(datalist, columns=["date", "user", "message","stream"])

In [15]:
def optimize_word_counts(data, merged_list):
    # Convert date once at the start
    data['date'] = pd.to_datetime(data['date']).dt.date
    
    # Filter null streams
    data = data[data['stream'].notnull()]
    
    # Pre-split all messages into words (do this once instead of repeatedly)
    split_messages = data['message'].str.split(' ')
    
    # Create a dictionary to store word counts by date
    word_counts_dict = {}
    
    # Process each message's words in bulk
    for date, words in zip(data['date'], split_messages):
        # Convert words to set for faster lookup
        words_set = set(words)
        # Check which target words appear in this message
        matched_words = [word for word in merged_list if word in words_set]
        # Update counts for each matched word
        for word in matched_words:
            if word not in word_counts_dict:
                word_counts_dict[word] = Counter()
            word_counts_dict[word][date] += 1
    
    # Convert the nested dictionary to a dataframe
    rows = []
    for word, date_counts in word_counts_dict.items():
        for date, count in date_counts.items():
            rows.append({
                'date': date,
                'word': word,
                'count': count
            })
    
    final_data = pd.DataFrame(rows)
    
    # Sort if there are any results
    if not final_data.empty:
        final_data = final_data.sort_values(by=['word', 'date']).reset_index(drop=True)
    
    return final_data

In [16]:
final_data = optimize_word_counts(data, merged_list)

In [17]:
# Pivot the data to have words as columns and dates as rows
pivot_data = final_data.pivot_table(
    index='date',    # Rows: dates
    columns='word',  # Columns: words
    values='count',  # Values: word counts
    fill_value=0     # Fill missing values with 0
).reset_index()

# Flatten the columns if needed (e.g., remove multi-index)
pivot_data.columns.name = None  # Remove the name of columns
pivot_data = pivot_data.rename_axis(None, axis=1)

# Display the reshaped data
print(pivot_data)

           date  dorozeaAAAA  dorozeaAlert  dorozeaChad  dorozeaCheer  \
0    2024-05-01          0.0           0.0          0.0           0.0   
1    2024-05-02          0.0           0.0          0.0           0.0   
2    2024-05-03          0.0           0.0          0.0           0.0   
3    2024-05-04          0.0           0.0          0.0           0.0   
4    2024-05-05          0.0           0.0          0.0           0.0   
..          ...          ...           ...          ...           ...   
213  2025-01-16          2.0           7.0          3.0           1.0   
214  2025-01-17          4.0           0.0          1.0           2.0   
215  2025-01-18          1.0          12.0          1.0           1.0   
216  2025-01-19          2.0          11.0          3.0           1.0   
217  2025-01-20          0.0           5.0          3.0           0.0   

     dorozeaDoroEvil  dorozeaDoroHuge  dorozeaDoroMAD  dorozeaDoroMock  \
0                0.0              0.0            

In [18]:
threshold = 1
# Calculate the total usage for each word
total_usage = pivot_data.drop(columns=['date']).sum().sort_values()

# Find words with the lowest usage
barely_used_words = total_usage[total_usage <= threshold].index.tolist()

# Display results
print("Total usage of each word:")
print(total_usage)

print("\nWords barely used (threshold <= {}):".format(threshold))
print(barely_used_words)

Total usage of each word:
dorozeaGG                 6.0
dorozeaStare             19.0
dorozeaUH                21.0
dorozeaSCHIZO            29.0
dorozeaSpymain           53.0
dorozeaUWU               68.0
dorozeaDoroyeti          92.0
dorozeaWar              117.0
dorozeaDoroHuge         121.0
dorozeaCheer            152.0
dorozeaAAAA             197.0
dorozeaDoroMAD          202.0
dorozeaChad             209.0
dorozeaPog              238.0
dorozeaHELLO            251.0
dorozeaDoroScared       274.0
dorozeaTweak            326.0
dorozeaOmg              369.0
dorozeaTouchingyou      372.0
dorozeaDoroMock         462.0
dorozeaSignOff          465.0
dorozeaAlert            473.0
dorozeaDoroEvil         518.0
dorozeaLOVE             567.0
dorozeaHUH              583.0
dorozeaHey              822.0
dorozeaDoroPogShake     856.0
dorozeaSlam            1192.0
dtype: float64

Words barely used (threshold <= 1):
[]


In [19]:
pivot_data_cleaned = pivot_data.drop(columns=barely_used_words)

In [20]:
pivot_data_cleaned = pivot_data_cleaned.set_index('date').cumsum().reset_index()

In [21]:
pivot_data_cleaned_transposed = pivot_data_cleaned.T
pivot_data_cleaned_transposed = pivot_data_cleaned.T.reset_index()

In [22]:
pivot_data_cleaned_transposed.to_excel('your_excel_file.xlsx', sheet_name='Pivot Table')