In [204]:
import pandas as pd
import glob
import os
import random
import textwrap


In [205]:
# Define the folder path
folder_path = 'raw_data'

# Get a list of all CSV files in the folder and sort them by name
all_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))

# Dictionary to hold the dataframes
dataframes = {}

# Loop through each file and read it into a dataframe
for file in all_files:
    # Get the file name without the extension
    file_name = os.path.splitext(os.path.basename(file))[0]
    
    # Read the file into a dataframe and store it in the dictionary
    dataframes[file_name] = pd.read_csv(file)

# Display the keys of the dictionary (file names)
print("Files read into dataframes:", dataframes.keys())


Files read into dataframes: dict_keys(['1-test1', '2-test2', '3-test3', '4-test4', '5-test', '6-test'])


In [206]:
data_array = []

# Loop through each file and read its data
for file in all_files:
    # Read the CSV file into a dataframe
    df = pd.read_csv(file)
    
    header = df.columns.tolist()
    data_array.append([header]+df.values.tolist())

for i, data in enumerate(data_array):
   print(f"Data from {i+1}: {data}")

Data from 1: [['test'], ['test2'], ['test3']]
Data from 2: [['2-test'], ['2-test2'], ['2-test3']]
Data from 3: [['3-test'], ['3-test2'], ['3-test3'], ['3-test4'], ['3-test5'], ['3-test6'], ['3-test7'], ['3-test8'], ['3-test9'], ['3-test10'], ['3-test11'], ['3-test12']]
Data from 4: [['4-test'], ['4-test2'], ['4-test3'], ['4-test4'], ['4-test5'], ['4-test6'], ['4-test7'], ['4-test8'], ['4-test9'], ['4-test10'], ['4-test11'], ['4-test12']]
Data from 5: [['5-apiwat'], ['5-apiwat2'], ['5-apiwat3'], ['5-apiwat4'], ['5-apiwat5'], ['5-apiwat6'], ['5-apiwat7'], ['5-apiwat8'], ['5-apiwat9'], ['5-apiwat10'], ['5-apiwat11'], ['5-apiwat12']]
Data from 6: [['60001'], [60002], [60003], [60004], [60005], [60006], [60007], [60008], [60009], [600010], [600011], [600012]]


In [207]:
lines_per_cell = int(input("Enter the number of lines per cell: "))

In [208]:
# Convert integers to strings
data_array = [[[str(item) for item in sublist] if isinstance(sublist[0], int) else sublist for sublist in data] for data in data_array]

In [209]:
# Expand the data to the desired line count
expanded_data = []
for i, data in enumerate(data_array):
    expanded_data.append(data * int(int(lines_per_cell) // len(data) + 1))

In [210]:
# Shuffle the data
for  i, data in enumerate(expanded_data):
    random.shuffle(expanded_data[i])

In [211]:
# Set start with original
for  i, data in enumerate(expanded_data):
   expanded_data[i] =  data_array[i]+expanded_data[i]

In [212]:
# Trim the expanded data to the desired line count
for  i, data in enumerate(expanded_data):
    expanded_data[i] = expanded_data[i][:lines_per_cell]

In [213]:

# Create DataFrames
dfs = [pd.DataFrame(value, columns=[f'Column {i+1}' for i in range(len(value[0]))]) for value in expanded_data]

# Concatenate DataFrames
result_df = pd.concat(dfs, axis=1)


# Format with "|" separators
formatted_result = result_df.apply(lambda x: '|'.join(x), axis=1)

print(formatted_result)


0              test|2-test|3-test|4-test|5-apiwat|60001
1         test2|2-test2|3-test2|4-test2|5-apiwat2|60002
2         test3|2-test3|3-test3|4-test3|5-apiwat3|60003
3         test3|2-test3|3-test4|4-test4|5-apiwat4|60004
4           test|2-test|3-test5|4-test5|5-apiwat5|60005
                             ...                       
295        test|2-test2|3-test4|4-test3|5-apiwat9|60002
296        test2|2-test|3-test8|4-test5|5-apiwat4|60004
297    test2|2-test3|3-test7|4-test10|5-apiwat10|600010
298       test3|2-test|3-test3|4-test8|5-apiwat10|60002
299      test2|2-test|3-test12|4-test4|5-apiwat6|600012
Length: 300, dtype: object


In [214]:

# Write to a text file
with open('formatted_data.txt', 'w') as file:
    for line in formatted_result:
        file.write(line + '\n')
