In [48]:
import pandas as pd
from collections import Counter

# Simulating loading the dataset
# Assume this is the content of train_text_seq.csv
data = {
    "input_str": [
        "000Hello15436World",
        "000Hello1596There",
        "000Hello464Everyone",
        "000Hello614All",
        "000Hello262Everyone",
        "000Hello422There",
        "000Hello284World"
    ]
}
train_df = pd.DataFrame(data)

# Substrings to remove from the strings
remove_substrings_list = ['15436', '1596', '464', '614', '262', '422', '284']

# Function to remove the specified substrings from a sequence
def filter_substrings(sequence, substrings):
    for substring in substrings:
        sequence = sequence.replace(substring, '')  # Remove each specified substring
    return sequence

# Preprocessing function for the dataset
def clean_data(df):
    # Remove leading zeros
    df['cleaned_str'] = df['input_str'].apply(lambda x: x[3:])
    # Remove specific substrings
    df['modified_str'] = df['cleaned_str'].apply(lambda x: filter_substrings(x, remove_substrings_list))
    # Add a new column for the length of the modified string
    df['modified_length'] = df['modified_str'].apply(len)
    # Filter only strings with length 13 after modification
    df = df[df['modified_length'] == 13]
    return df

# Clean the training dataset
train_data_cleaned = clean_data(train_df)

# Step 1: Remove leading zeros from cleaned strings
train_data_cleaned['processed_str'] = train_data_cleaned['modified_str']

# Function to extract all subsequences of a given length range
def extract_subsequences(sequence, min_len=2, max_len=7):
    subsequences = []
    for length in range(min_len, max_len + 1):
        subsequences.extend([sequence[i:i+length] for i in range(len(sequence) - length + 1)])
    return subsequences

# Step 2: Find all subsequences in the cleaned dataset
subseq_counter = Counter()
string_subseq_counts = []

for seq in train_data_cleaned['processed_str']:
    subsequences = extract_subsequences(seq)
    subseq_counter.update(subsequences)  # Update global frequency count
    string_subseq_counts.append(Counter(subsequences))  # Per string subsequence count

# Step 3: Filter subsequences that appear in all strings
common_subsequences = [subseq for subseq in subseq_counter if all(subseq in c for c in string_subseq_counts)]

# Step 4: Get occurrence count in each string
subseq_occurrences_in_strings = {subseq: [] for subseq in common_subsequences}

for subseq in common_subsequences:
    for subseq_count in string_subseq_counts:
        subseq_occurrences_in_strings[subseq].append(subseq_count[subseq])

# Step 5: Prepare data for CSV
output_data = {
    "Subsequence": [],
    "Occurrences": []
}

for subseq, counts in subseq_occurrences_in_strings.items():
    output_data["Subsequence"].append(subseq)
    output_data["Occurrences"].append(counts)

# Create DataFrame for output
output_df = pd.DataFrame(output_data)

# Convert list of counts to a string for better CSV formatting
output_df['Occurrences'] = output_df['Occurrences'].apply(lambda x: ', '.join(map(str, x)))

# Save the processed dataset and common subsequences occurrences to CSV files
train_csv_path = '/content/train_text_seq_proc.csv'
output_csv_path = '/content/common_subsequences_occurrences_rowwise.csv'

train_data_cleaned.to_csv(train_csv_path, index=False)
output_df.to_csv(output_csv_path, index=False)

train_csv_path, output_csv_path


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_cleaned['processed_str'] = train_data_cleaned['modified_str']


('/content/train_text_seq_proc.csv',
 '/content/common_subsequences_occurrences_rowwise.csv')

In [None]:

# Load the data from the CSV file
dataset = pd.read_csv("/content/train_text_seq.csv")

# Step 1: Remove the first three characters (leading zeros) from the input strings
dataset['cleaned_string'] = dataset['input_str'].apply(lambda x: x[3:])

# Function to generate substrings of a specific length from a sequence
def generate_substrings(seq, sub_length):
    return [seq[i:i + sub_length] for i in range(len(seq) - sub_length + 1)]

# Step 2: Initialize counters for different substring lengths (3, 4, 5, and 6)
subseq_length_3 = Counter()
subseq_length_4 = Counter()
subseq_length_5 = Counter()
subseq_length_6 = Counter()

# Step 3: Count occurrences of substrings of lengths 3, 4, 5, and 6 for each sequence
for string in dataset['cleaned_string']:
    subseq_length_3.update(generate_substrings(string, 3))
    subseq_length_4.update(generate_substrings(string, 4))
    subseq_length_5.update(generate_substrings(string, 5))
    subseq_length_6.update(generate_substrings(string, 6))

# Step 4: Retrieve the top 10 most common substrings for each length
top_3_substrings = subseq_length_3.most_common(10)
top_4_substrings = subseq_length_4.most_common(10)
top_5_substrings = subseq_length_5.most_common(10)
top_6_substrings = subseq_length_6.most_common(10)

# Step 5: Print out the results for each substring length
print("Top 10 most frequent 3-length substrings:")
for subseq, count in top_3_substrings:
    print(f"Substring '{subseq}' appears {count} times.")

print("\nTop 10 most frequent 4-length substrings:")
for subseq, count in top_4_substrings:
    print(f"Substring '{subseq}' appears {count} times.")

print("\nTop 10 most frequent 5-length substrings:")
for subseq, count in top_5_substrings:
    print(f"Substring '{subseq}' appears {count} times.")

print("\nTop 10 most frequent 6-length substrings:")
for subseq, count in top_6_substrings:
    print(f"Substring '{subseq}' appears {count} times.")


Top 10 most frequent 3-length substrings:
Substring '262' appears 14595 times.
Substring '614' appears 14487 times.
Substring '596' appears 14246 times.
Substring '159' appears 14161 times.
Substring '415' appears 7567 times.
Substring '436' appears 7303 times.
Substring '154' appears 7267 times.
Substring '422' appears 7163 times.
Substring '464' appears 7156 times.
Substring '543' appears 7144 times.

Top 10 most frequent 4-length substrings:
Substring '1596' appears 14160 times.
Substring '5436' appears 7134 times.
Substring '1543' appears 7082 times.
Substring '4262' appears 5580 times.
Substring '4159' appears 5388 times.
Substring '6142' appears 5113 times.
Substring '5962' appears 4942 times.
Substring '6141' appears 4276 times.
Substring '2621' appears 4155 times.
Substring '6262' appears 3875 times.

Top 10 most frequent 5-length substrings:
Substring '15436' appears 7080 times.
Substring '41596' appears 5388 times.
Substring '15962' appears 4941 times.
Substring '21596' appea

In [None]:
import pandas as pd
from google.colab import files

# Load the training and validation datasets
train_data = pd.read_csv("/content/train_text_seq.csv")
valid_data = pd.read_csv("/content/valid_text_seq.csv")

# Substrings to remove from the strings
remove_substrings_list = ['15436', '1596', '464', '614', '262', '422', '284']

# Function to remove the specified substrings from a sequence
def filter_substrings(sequence, substrings):
    for substring in substrings:
        sequence = sequence.replace(substring, '')  # Remove each specified substring
    return sequence

# Preprocessing function for the dataset
def clean_data(df):
    # Remove leading zeros
    df['cleaned_str'] = df['input_str'].apply(lambda x: x[3:])
    # Remove specific substrings
    df['modified_str'] = df['cleaned_str'].apply(lambda x: filter_substrings(x, remove_substrings_list))
    # Add a new column for the length of the modified string
    df['modified_length'] = df['modified_str'].apply(len)
    # Filter only strings with length 13 after modification
    df = df[df['modified_length'] == 13]
    return df

# Clean both the training and validation datasets
train_data_cleaned = clean_data(train_data)
valid_data_cleaned = clean_data(valid_data)

# Save the processed datasets to CSV files
train_csv_path = '/content/train_text_seq_processed.csv'
valid_csv_path = '/content/valid_text_seq_processed.csv'
train_data_cleaned.to_csv(train_csv_path, index=False)
valid_data_cleaned.to_csv(valid_csv_path, index=False)

# Make the CSV files downloadable
files.download(train_csv_path)
files.download(valid_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Digits analysed post processing

In [None]:
import pandas as pd
from collections import defaultdict

# Read the dataset from a CSV file and specify the column names
df = pd.read_csv('/content/label1_process.csv', names=['input_str', 'label'])

# Function to count occurrences of digits (1 to 100) across rows
def count_digits_in_files(data, min_num=1, max_num=100):
    digit_count = defaultdict(int)

    # Iterate over each row in the input_str column
    for numeric_string in data:
        # Check for each number between 1 and 100
        for number in range(min_num, max_num + 1):
            number_str = str(number)
            # If the number is found in the string, increase its count
            if number_str in numeric_string:
                digit_count[number_str] += 1

    return digit_count

# Count occurrences of digits from 1 to 100 across the 'input_str' column
digit_count = count_digits_in_files(df['input_str'])

# Display the count of each digit in ascending order
for digit in range(1, 101):
    digit_str = str(digit)
    count = digit_count.get(digit_str, 0)  # Get count, default to 0 if not found
    print(f"Digit {digit_str} appears in {count} files.")


Digit 1 appears in 3094 files.
Digit 2 appears in 3011 files.
Digit 3 appears in 2743 files.
Digit 4 appears in 2446 files.
Digit 5 appears in 2531 files.
Digit 6 appears in 2017 files.
Digit 7 appears in 2138 files.
Digit 8 appears in 2332 files.
Digit 9 appears in 2363 files.
Digit 10 appears in 629 files.
Digit 11 appears in 652 files.
Digit 12 appears in 821 files.
Digit 13 appears in 666 files.
Digit 14 appears in 772 files.
Digit 15 appears in 552 files.
Digit 16 appears in 278 files.
Digit 17 appears in 335 files.
Digit 18 appears in 563 files.
Digit 19 appears in 495 files.
Digit 20 appears in 593 files.
Digit 21 appears in 505 files.
Digit 22 appears in 384 files.
Digit 23 appears in 621 files.
Digit 24 appears in 652 files.
Digit 25 appears in 429 files.
Digit 26 appears in 301 files.
Digit 27 appears in 665 files.
Digit 28 appears in 514 files.
Digit 29 appears in 674 files.
Digit 30 appears in 454 files.
Digit 31 appears in 878 files.
Digit 32 appears in 632 files.
Digit 33

In [None]:
import pandas as pd
from collections import defaultdict

# Read the dataset from a CSV file and specify the column names
df = pd.read_csv('/content/label0_process.csv', names=['input_str', 'label'])

# Function to count occurrences of digits (1 to 100) across rows
def count_digits_in_files(data, min_num=1, max_num=100):
    digit_count = defaultdict(int)

    # Iterate over each row in the input_str column
    for numeric_string in data:
        # Check for each number between 1 and 100
        for number in range(min_num, max_num + 1):
            number_str = str(number)
            # If the number is found in the string, increase its count
            if number_str in numeric_string:
                digit_count[number_str] += 1

    return digit_count

# Count occurrences of digits from 1 to 100 across the 'input_str' column
digit_count = count_digits_in_files(df['input_str'])

# Display the count of each digit in ascending order
for digit in range(1, 101):
    digit_str = str(digit)
    count = digit_count.get(digit_str, 0)  # Get count, default to 0 if not found
    print(f"Digit {digit_str} appears in {count} files.")


Digit 1 appears in 3113 files.
Digit 2 appears in 3053 files.
Digit 3 appears in 2772 files.
Digit 4 appears in 2495 files.
Digit 5 appears in 2626 files.
Digit 6 appears in 2009 files.
Digit 7 appears in 2150 files.
Digit 8 appears in 2311 files.
Digit 9 appears in 2396 files.
Digit 10 appears in 643 files.
Digit 11 appears in 686 files.
Digit 12 appears in 841 files.
Digit 13 appears in 664 files.
Digit 14 appears in 740 files.
Digit 15 appears in 565 files.
Digit 16 appears in 278 files.
Digit 17 appears in 329 files.
Digit 18 appears in 548 files.
Digit 19 appears in 475 files.
Digit 20 appears in 601 files.
Digit 21 appears in 508 files.
Digit 22 appears in 380 files.
Digit 23 appears in 649 files.
Digit 24 appears in 666 files.
Digit 25 appears in 428 files.
Digit 26 appears in 305 files.
Digit 27 appears in 654 files.
Digit 28 appears in 462 files.
Digit 29 appears in 693 files.
Digit 30 appears in 461 files.
Digit 31 appears in 867 files.
Digit 32 appears in 598 files.
Digit 33

In [None]:
import pandas as pd

# Load the dataset into a pandas DataFrame
data = pd.read_csv('/content/label0_process.csv')  # Adjust the file name as needed

# Convert the 'input_str' column to string type
data['input_str'] = data['input_str'].astype(str)

# Function to count digits at each position
def count_digits_at_positions(df, max_length=14):  # Set a maximum length for consideration
    # Initialize a list of dictionaries to hold digit counts for each position
    digit_counts = [{} for _ in range(max_length)]

    for _, row in df.iterrows():
        input_str = row['input_str']
        length = len(input_str)

        for pos in range(max_length):
            if pos < length:  # Only process positions that exist in the input string
                digit = input_str[pos]
                if digit in digit_counts[pos]:
                    digit_counts[pos][digit] += 1
                else:
                    digit_counts[pos][digit] = 1

    return digit_counts

# Get digit counts at each column position
max_length = 14  # Set this according to the maximum length you expect
digit_counts = count_digits_at_positions(data, max_length)

# Print the digit counts
for pos, counts in enumerate(digit_counts):
    print(f"Position {pos + 1}:")
    for digit, count in counts.items():
        print(f"  Digit {digit}: {count}")

Position 1:
  Digit 2: 756
  Digit 3: 628
  Digit 5: 358
  Digit 8: 79
  Digit 1: 979
  Digit 6: 140
  Digit 4: 505
  Digit 9: 68
  Digit 7: 63
Position 2:
  Digit 7: 453
  Digit 6: 258
  Digit 9: 362
  Digit 0: 298
  Digit 1: 356
  Digit 5: 295
  Digit 4: 370
  Digit 2: 348
  Digit 3: 536
  Digit 8: 300
Position 3:
  Digit 1: 265
  Digit 9: 414
  Digit 5: 381
  Digit 6: 416
  Digit 0: 356
  Digit 2: 581
  Digit 3: 383
  Digit 7: 338
  Digit 4: 164
  Digit 8: 278
Position 4:
  Digit 8: 317
  Digit 5: 465
  Digit 9: 418
  Digit 6: 307
  Digit 3: 251
  Digit 1: 439
  Digit 0: 481
  Digit 7: 177
  Digit 4: 423
  Digit 2: 298
Position 5:
  Digit 2: 610
  Digit 1: 908
  Digit 5: 237
  Digit 8: 249
  Digit 9: 108
  Digit 3: 537
  Digit 4: 414
  Digit 0: 179
  Digit 7: 121
  Digit 6: 213
Position 6:
  Digit 8: 248
  Digit 1: 437
  Digit 3: 617
  Digit 2: 478
  Digit 5: 319
  Digit 4: 465
  Digit 6: 247
  Digit 9: 228
  Digit 0: 248
  Digit 7: 289
Position 7:
  Digit 0: 319
  Digit 5: 400
  Di

In [None]:
import pandas as pd

# Load the dataset into a pandas DataFrame
data = pd.read_csv('/content/label1_process.csv')  # Adjust the file name as needed

# Convert the 'input_str' column to string type
data['input_str'] = data['input_str'].astype(str)

# Function to count digits at each position
def count_digits_at_positions(df, max_length=14):  # Set a maximum length for consideration
    # Initialize a list of dictionaries to hold digit counts for each position
    digit_counts = [{} for _ in range(max_length)]

    for _, row in df.iterrows():
        input_str = row['input_str']
        length = len(input_str)

        for pos in range(max_length):
            if pos < length:  # Only process positions that exist in the input string
                digit = input_str[pos]
                if digit in digit_counts[pos]:
                    digit_counts[pos][digit] += 1
                else:
                    digit_counts[pos][digit] = 1

    return digit_counts

# Get digit counts at each column position
max_length = 14  # Set this according to the maximum length you expect
digit_counts = count_digits_at_positions(data, max_length)

# Print the digit counts
for pos, counts in enumerate(digit_counts):
    print(f"Position {pos + 1}:")
    for digit, count in counts.items():
        print(f"  Digit {digit}: {count}")

Position 1:
  Digit 1: 1230
  Digit 3: 585
  Digit 2: 937
  Digit 4: 334
  Digit 5: 87
  Digit 6: 75
  Digit 9: 111
  Digit 8: 85
  Digit 7: 60
Position 2:
  Digit 2: 288
  Digit 8: 335
  Digit 3: 428
  Digit 6: 196
  Digit 7: 279
  Digit 4: 450
  Digit 9: 492
  Digit 0: 416
  Digit 5: 287
  Digit 1: 333
Position 3:
  Digit 3: 352
  Digit 9: 326
  Digit 6: 368
  Digit 5: 346
  Digit 0: 305
  Digit 7: 390
  Digit 1: 424
  Digit 2: 498
  Digit 8: 329
  Digit 4: 166
Position 4:
  Digit 8: 444
  Digit 9: 364
  Digit 5: 449
  Digit 1: 451
  Digit 0: 442
  Digit 2: 334
  Digit 6: 250
  Digit 7: 235
  Digit 3: 317
  Digit 4: 218
Position 5:
  Digit 8: 232
  Digit 2: 652
  Digit 0: 188
  Digit 7: 123
  Digit 1: 1053
  Digit 3: 451
  Digit 6: 210
  Digit 4: 338
  Digit 9: 101
  Digit 5: 156
Position 6:
  Digit 9: 300
  Digit 0: 277
  Digit 2: 554
  Digit 6: 198
  Digit 3: 514
  Digit 7: 227
  Digit 1: 556
  Digit 4: 409
  Digit 8: 256
  Digit 5: 213
Position 7:
  Digit 5: 330
  Digit 6: 366
  D

In [None]:
import pandas as pd

def sum_of_digits_in_rows(file_path):
    # Read the CSV file
    df = pd.read_csv('/content/label0_process.csv')

    # Iterate through each row of the DataFrame
    for index, row in df.iterrows():
        row_sum = 0

        # Iterate through each element in the row
        for value in row:
            if isinstance(value, (int, float)):  # Check if the value is a number
                # Convert to string, iterate through digits, and sum them
                row_sum += sum(int(digit) for digit in str(int(value)))
            else:
                # If it's a non-numeric value, handle it as needed
                # Here, we'll just ignore it
                continue

        print(f"Sum of digits in row {index + 1}: {row_sum}")

# Replace 'your_file.csv' with the path to your CSV file
file_path = 'your_file.csv'
sum_of_digits_in_rows('/content/label0_process.csv')

Sum of digits in row 1: 37
Sum of digits in row 2: 54
Sum of digits in row 3: 59
Sum of digits in row 4: 52
Sum of digits in row 5: 72
Sum of digits in row 6: 55
Sum of digits in row 7: 48
Sum of digits in row 8: 33
Sum of digits in row 9: 60
Sum of digits in row 10: 46
Sum of digits in row 11: 35
Sum of digits in row 12: 55
Sum of digits in row 13: 47
Sum of digits in row 14: 37
Sum of digits in row 15: 48
Sum of digits in row 16: 49
Sum of digits in row 17: 52
Sum of digits in row 18: 48
Sum of digits in row 19: 37
Sum of digits in row 20: 53
Sum of digits in row 21: 54
Sum of digits in row 22: 51
Sum of digits in row 23: 45
Sum of digits in row 24: 47
Sum of digits in row 25: 43
Sum of digits in row 26: 50
Sum of digits in row 27: 42
Sum of digits in row 28: 47
Sum of digits in row 29: 48
Sum of digits in row 30: 57
Sum of digits in row 31: 46
Sum of digits in row 32: 43
Sum of digits in row 33: 65
Sum of digits in row 34: 66
Sum of digits in row 35: 39
Sum of digits in row 36: 45
S

In [None]:
import pandas as pd

def sum_of_digits_in_rows(file_path):
    # Read the CSV file
    df = pd.read_csv('/content/label1_process.csv')

    # Iterate through each row of the DataFrame
    for index, row in df.iterrows():
        row_sum = 0

        # Iterate through each element in the row
        for value in row:
            if isinstance(value, (int, float)):  # Check if the value is a number
                # Convert to string, iterate through digits, and sum them
                row_sum += sum(int(digit) for digit in str(int(value)))
            else:
                # If it's a non-numeric value, handle it as needed
                # Here, we'll just ignore it
                continue

        print(f"Sum of digits in row {index + 1}: {row_sum}")

# Replace 'your_file.csv' with the path to your CSV file
file_path = 'your_file.csv'
sum_of_digits_in_rows('/content/label1_process.csv')

Sum of digits in row 1: 52
Sum of digits in row 2: 63
Sum of digits in row 3: 64
Sum of digits in row 4: 53
Sum of digits in row 5: 50
Sum of digits in row 6: 43
Sum of digits in row 7: 46
Sum of digits in row 8: 44
Sum of digits in row 9: 65
Sum of digits in row 10: 57
Sum of digits in row 11: 71
Sum of digits in row 12: 48
Sum of digits in row 13: 44
Sum of digits in row 14: 50
Sum of digits in row 15: 69
Sum of digits in row 16: 38
Sum of digits in row 17: 60
Sum of digits in row 18: 43
Sum of digits in row 19: 39
Sum of digits in row 20: 54
Sum of digits in row 21: 43
Sum of digits in row 22: 42
Sum of digits in row 23: 59
Sum of digits in row 24: 56
Sum of digits in row 25: 54
Sum of digits in row 26: 43
Sum of digits in row 27: 45
Sum of digits in row 28: 46
Sum of digits in row 29: 69
Sum of digits in row 30: 41
Sum of digits in row 31: 30
Sum of digits in row 32: 51
Sum of digits in row 33: 63
Sum of digits in row 34: 68
Sum of digits in row 35: 44
Sum of digits in row 36: 58
S

In [None]:
import pandas as pd
from google.colab import files

# Load the dataset
df = pd.read_csv('/content/label0_process.csv', names=['input_str', 'label'])

def sum_even_odd_digits_per_string(data):
    results = []  # List to store the results for each string

    for numeric_string in data:
        even_sum = 0
        odd_sum = 0

        for char in numeric_string:
            if char.isdigit():  # Check if the character is a digit
                if int(char) % 2 == 0:
                    even_sum += int(char)
                else:
                    odd_sum += int(char)

        results.append((even_sum, odd_sum))  # Append the sums as a tuple

    return results

# Get the sum of even and odd digits for each input string
sums = sum_even_odd_digits_per_string(df['input_str'])

# Create a DataFrame for better display
sums_df = pd.DataFrame(sums, columns=['Even Sum', 'Odd Sum'])

# Save the results to a CSV file
csv_file_path = '/content/even_odd_sums_label0.csv'
sums_df.to_csv(csv_file_path, index=False)

# Make the CSV file downloadable
files.download(csv_file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from google.colab import files

# Load the dataset
df = pd.read_csv('/content/label1_process.csv', names=['input_str', 'label'])

def sum_even_odd_digits_per_string(data):
    results = []  # List to store the results for each string

    for numeric_string in data:
        even_sum = 0
        odd_sum = 0

        for char in numeric_string:
            if char.isdigit():  # Check if the character is a digit
                if int(char) % 2 == 0:
                    even_sum += int(char)
                else:
                    odd_sum += int(char)

        results.append((even_sum, odd_sum))  # Append the sums as a tuple

    return results

# Get the sum of even and odd digits for each input string
sums = sum_even_odd_digits_per_string(df['input_str'])

# Create a DataFrame for better display
sums_df = pd.DataFrame(sums, columns=['Even Sum', 'Odd Sum'])

# Save the results to a CSV file
csv_file_path = '/content/even_odd_sums_label1.csv'
sums_df.to_csv(csv_file_path, index=False)

# Make the CSV file downloadable
files.download(csv_file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [49]:
import pandas as pd
from google.colab import files

# Load the dataset
df = pd.read_csv('/content/label1_process.csv', names=['input_str', 'label'])

# Define a function to replace digits
def replace_digits_with_letters(input_str):
    # Define the mapping
    replacement = {'0': 'e', '1': 'o', '2': 'e', '3': 'o', '4': 'e',
                   '5': 'o', '6': 'e', '7': 'o', '8': 'e', '9': 'o'}

    # Replace each digit in the input string
    result_str = ''.join(replacement.get(char, char) for char in input_str)

    return result_str

# Apply the function to the 'input_str' column
df['modified_str'] = df['input_str'].apply(replace_digits_with_letters)

# Save the modified DataFrame to a new CSV file
output_csv_path = '/content/modified_label1_process.csv'
df.to_csv(output_csv_path, index=False)

# Make the CSV file downloadable
files.download(output_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
import pandas as pd
from google.colab import files

# Load the dataset
df = pd.read_csv('/content/label0_process.csv', names=['input_str', 'label'])

# Define a function to replace digits
def replace_digits_with_letters(input_str):
    # Define the mapping
    replacement = {'0': 'e', '1': 'o', '2': 'e', '3': 'o', '4': 'e',
                   '5': 'o', '6': 'e', '7': 'o', '8': 'e', '9': 'o'}

    # Replace each digit in the input string
    result_str = ''.join(replacement.get(char, char) for char in input_str)

    return result_str

# Apply the function to the 'input_str' column
df['modified_str'] = df['input_str'].apply(replace_digits_with_letters)

# Save the modified DataFrame to a new CSV file
output_csv_path = '/content/modified_label0_process.csv'
df.to_csv(output_csv_path, index=False)

# Make the CSV file downloadable
files.download(output_csv_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>