In [None]:
import pandas as pd
import numpy as np
import os, csv
import re, json
import threading
import time

In [None]:
def split_df(df, n):
    """Split ``df`` into ``n`` consecutive, non-overlapping row chunks.

    Rows are selected by position (``iloc``), so the result does not depend
    on the index labels: a filtered frame whose index has gaps, or a
    non-integer index, is split just like a freshly loaded one. Label-based
    ``loc`` slicing is inclusive on both ends and would put every boundary
    row into two neighbouring chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. May be empty.
    n : int
        Number of chunks to produce.

    Returns
    -------
    list of pandas.DataFrame
        ``n`` chunks in the original row order. The first ``n - 1`` chunks
        hold ``len(df) // n`` rows each; the last chunk also takes the
        remainder.

    Raises
    ------
    ZeroDivisionError
        If ``n`` is 0.
    """
    total_num = len(df)
    batch_size = total_num // n
    df_list = []
    for i in range(n):
        start = i * batch_size
        # The last chunk absorbs the rows left over by the integer division.
        end = total_num if i == n - 1 else start + batch_size
        df_list.append(df.iloc[start:end])
    return df_list

def count_words_in_text(text, word_list):
    """Count the total number of regex matches of ``word_list`` in ``text``.

    Parameters
    ----------
    text : str
        Text to search. A non-string value (for example a NaN from a
        missing cell) makes every search fail and therefore yields 0.
    word_list : iterable of str
        Patterns, each passed to ``re.findall`` as a regular expression.

    Returns
    -------
    int
        Sum of the match counts over all patterns.
    """
    count = 0
    for word in word_list:
        try:
            matches = re.findall(word, text)
        except Exception:
            # Best effort: an invalid pattern or unsearchable text counts as
            # zero matches. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt / SystemExit stop a long run.
            matches = []
        count += len(matches)
    return count

def measure_moralization(df, valence, columns, writer,file):
    """Score every row of ``df`` and stream the results to ``writer``.

    For each row, the value in column ``columns`` is passed to
    ``count_words_in_text`` once per key in ``valence``, using the patterns
    stored under that key in the module-level ``moral_list``. The row's
    ``ID`` and the per-key results are written as one record, and ``file``
    is flushed after every record so finished rows reach disk immediately.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column and the column named by ``columns``.
    valence : iterable of str
        Keys of ``moral_list`` to evaluate.
    columns : str
        Name of the text column.
    writer : csv.DictWriter
        Receives one dict per row, keyed by ``'ID'`` and each valence key.
    file : file object
        The file behind ``writer``; flushed after each row.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, unchanged.
    """
    for _, record in df.iterrows():
        content = record[columns]
        out_row = {'ID': record['ID']}
        out_row.update(
            {key: count_words_in_text(content, moral_list[key]) for key in valence}
        )
        writer.writerow(out_row)
        file.flush()
    return df

class WordCounterThread(threading.Thread):
    """Worker thread that scores one DataFrame chunk into its own CSV file.

    Results are appended to ``thread/<data_name>/<idx>.csv``. The file is
    opened in append mode so a re-run adds to earlier results, and the
    header row is written only when the file is still empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk of rows this thread processes.
    valence : sequence of str
        Category keys; together with ``'ID'`` they form the CSV columns.
    columns : str
        Name of the text column handed to ``measure_moralization``.
    idx : int
        Chunk number, used as the output file name.
    data_name : str
        Data set name, used as the output folder name.
    """

    def __init__(self, df, valence, columns, idx, data_name):
        threading.Thread.__init__(self)
        self.df_new = None  # set by run(); stays None if run() fails
        self.df = df
        self.valence = valence
        self.columns = columns

        out_dir = f"thread/{data_name}"
        # Create the output folder on first use so open() does not fail.
        os.makedirs(out_dir, exist_ok=True)
        out_path = f"{out_dir}/{idx}.csv"
        self.file = open(out_path, "a+")
        self.writer = csv.DictWriter(self.file, delimiter=',', lineterminator='\n',
                                     fieldnames=['ID'] + list(valence))
        if not os.path.getsize(out_path):
            self.writer.writeheader()
            # Push the header to disk right away so the file is never left
            # zero-byte when this chunk turns out to contain no rows.
            self.file.flush()

    def run(self):
        """Process the chunk, then close the output file even on failure."""
        try:
            self.df_new = measure_moralization(self.df, self.valence, self.columns,
                                               self.writer, self.file)
        finally:
            self.file.close()

def count_words_multi_threaded(df, valence, columns, data_name, n_threads=4):
    """Score ``df`` with several ``WordCounterThread`` workers.

    ``df`` is cut into ``n_threads`` chunks with ``split_df``; each chunk is
    handled by its own thread, which appends its results to
    ``thread/<data_name>/<chunk number>.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to process.
    valence : list of str
        Category keys to evaluate.
    columns : str
        Name of the text column.
    data_name : str
        Data set name; selects the output folder.
    n_threads : int, optional
        Number of worker threads / chunks. Defaults to 4.

    Returns
    -------
    list
        One entry per thread, in chunk order: the value stored in the
        thread's ``df_new`` (``None`` if that thread did not finish
        normally). An empty ``df`` returns ``[]`` without starting threads.
    """
    # Nothing left to process (e.g. every row already has a result).
    if len(df) == 0:
        return []

    # NOTE(review): the work is regex matching in pure Python; CPython
    # threads presumably gain little here because of the GIL -- confirm
    # whether a process pool would be a better fit.
    chunks = split_df(df, n_threads)

    # One worker per chunk.
    threads = [
        WordCounterThread(chunk, valence, columns, idx, data_name)
        for idx, chunk in enumerate(chunks)
    ]

    for thread in threads:
        thread.start()

    # Block until every worker is done.
    for thread in threads:
        thread.join()

    return [thread.df_new for thread in threads]

def get_data(file_name, data_dir='data/', collected_dir='thread/'):
    """Load a data set and drop the rows that already have results.

    Reads ``<data_dir>/<file_name>.csv``, then walks
    ``<collected_dir>/<file_name>/`` and removes every row whose ``ID``
    appears in one of the result CSV files found there. Files whose name
    contains ``'checkpoint'`` are ignored, and so are zero-byte files.

    Parameters
    ----------
    file_name : str
        Data set name, without the ``.csv`` extension.
    data_dir : str, optional
        Folder holding the input CSV. Defaults to ``'data/'``.
    collected_dir : str, optional
        Folder holding the per-data-set result folders. Defaults to
        ``'thread/'``.

    Returns
    -------
    pandas.DataFrame
        The rows of the input that have no result yet. The original index
        labels are kept, so the index may have gaps.
    """
    data = pd.read_csv(os.path.join(data_dir, file_name + '.csv'))
    # The trailing '' keeps the trailing separator in the printed path.
    folder_path = os.path.join(collected_dir, file_name, '')
    print(folder_path)

    print('before drop completed: ',len(data))
    for root, _dirs, files in os.walk(folder_path):
        for f_name in files:
            if not f_name.endswith('.csv') or 'checkpoint' in f_name:
                continue
            try:
                # Join with ``root``: os.walk also descends into sub-folders,
                # so the file is not necessarily directly in folder_path.
                old_data = pd.read_csv(os.path.join(root, f_name))
            except pd.errors.EmptyDataError:
                # Zero-byte file: no result was recorded in it.
                continue
            data = data[~data['ID'].isin(old_data['ID'])]
    print('after drop completed: ',len(data))
    return data

# Load the moral dictionary. measure_moralization looks up moral_list[<key>]
# and uses the stored entries as regex patterns.
with open("moral_dictionary.txt", "r") as f:
    s = f.read()
moral_list = json.loads(s)

# Category keys of moral_list that are evaluated for every row.
moral_keys = ['neu', 'pos', 'neg', 'pas']

In [None]:
def count_words_in_text(text, word_list):
    """Collect the regex matches of ``word_list`` found in ``text``.

    NOTE: this re-defines the ``count_words_in_text`` declared earlier in
    the notebook, which returns an integer total. Once this cell has run,
    ``measure_moralization`` picks up this version and writes the matches
    themselves into the CSV instead of a count.

    Parameters
    ----------
    text : str
        Text to search. A non-string value (for example a NaN from a
        missing cell) makes every search fail and therefore yields ``[]``.
    word_list : iterable of str
        Patterns, each passed to ``re.findall`` as a regular expression.

    Returns
    -------
    list of list
        One inner list per pattern that matched at least once, holding the
        ``re.findall`` result for that pattern, in ``word_list`` order.
    """
    count = []
    for word in word_list:
        try:
            matches = re.findall(word, text)
        except Exception:
            # Best effort: an invalid pattern or unsearchable text is
            # skipped. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt / SystemExit stop a long run.
            matches = []
        if matches:
            count.append(matches)
    return count

In [None]:
# Log the wall-clock time at which the run starts.
print(time.asctime())

data_name = 'weibo_comments'
moral_keys = ['neu','pos','neg','pas']

# Load only the rows that have no result file entry yet, then score the
# 'comment_content' column for every key in moral_keys.
data = get_data(data_name)
df_list_new = count_words_multi_threaded(
    data, moral_keys, 'comment_content', data_name
)