# Data Wrangling

Working with text files

- Download WSC dataset: https://drive.google.com/file/d/1pAFXnjHyeRm9TGV9mknM25f7AvS8UZ4-/view?usp=share_link
- Read files and store them into one dataframe (preserve train, test, and val split info)
- Count label values: What is the distribution of the target variable?
- Create a new variable that stores text without tags
- Create new variables that store the number of characters, sentences and words of every sample
- Create new variables that store information about tagged words: their position in the text (index). Assume that empty space is a delimiter. 
- Save to disk 3 files in json lines format (train.jsonl, val.jsonl, test.jsonl) 

In [1]:
import pandas as pd
import os

# Define a function to extract files from the ZIP and read them into a Pandas DataFrame
def read_files_to_dataframe():
    # Initialize lists to store data and labels
    texts = []
    labels = []
    split_sets = []

    # Walk through the directory and read files
    for root, dirs, files in os.walk("wsc-txt"):
        for file in files:
            # Read the file content
            with open(os.path.join(root, file), 'r', encoding='utf-8-sig') as f:
                split_set = file.split('_')[0].lower()
                split_sets.append(split_set)
                for line in f:
                    if line.startswith('text:'):
                        text = line.strip().split('text:', 1)[1]
                        texts.append(text)
                    if line.startswith('label'):
                        label = line.strip().split('label:', 1)[1]
                        labels.append(label)

    # Create a DataFrame
    df = pd.DataFrame({'text': texts, 'label': labels, 'split': split_sets})
    
    return df

# Call the function and store the data in a DataFrame
df = read_files_to_dataframe()
df.head()

Unnamed: 0,text,label,split
0,<w1> John </w1> je potrkal na vrata in Susan j...,True,train
1,"<w1> Bob </w1> se ni razjezil na Sally, ki ga ...",False,train
2,<w1> Znanstvenici </w1> preiskujeta tri vrste ...,True,train
3,<w1> Bill </w1> je računalniško konzolo podal ...,True,train
4,"<w1> John </w1> je najel Beth, zato da bi lahk...",True,train


In [2]:
# Count the distribution of the target variable (label values)
label_distribution = df['label'].value_counts()

# Display the distribution
label_distribution

label
False    66
True     58
Name: count, dtype: int64

In [3]:
import re

def remove_tags_and_clean_whitespace(text):
    # Remove tags but keep the text between them
    text_without_tags = re.sub(r'<\/?w\d+>', '', text)
    # Replace multiple spaces with a single space and strip leading/trailing whitespace
    clean_text = re.sub(r'\s+', ' ', text_without_tags).strip()
    return clean_text

# Assuming `df` is your DataFrame and it has a 'text' column
df['clean_text'] = df['text'].apply(remove_tags_and_clean_whitespace)

df.head()  # Display the first few rows to verify the updated 'clean_text' variable

Unnamed: 0,text,label,split,clean_text
0,<w1> John </w1> je potrkal na vrata in Susan j...,True,train,John je potrkal na vrata in Susan jih je odprl...
1,"<w1> Bob </w1> se ni razjezil na Sally, ki ga ...",False,train,"Bob se ni razjezil na Sally, ki ga je prekinil..."
2,<w1> Znanstvenici </w1> preiskujeta tri vrste ...,True,train,"Znanstvenici preiskujeta tri vrste rib, ki so ..."
3,<w1> Bill </w1> je računalniško konzolo podal ...,True,train,"Bill je računalniško konzolo podal Jane, ker j..."
4,"<w1> John </w1> je najel Beth, zato da bi lahk...",True,train,"John je najel Beth, zato da bi lahko skrbela z..."


In [4]:
import re

def extract_tagged_words_with_details(text):
    # Split the text into words and tags, keeping the tags intact
    words_and_tags = re.findall(r'<w\d+>.*?<\/w\d+>|[\w\'-]+', text)

    # Initialize an empty list to store information about tagged words
    tagged_words_info = []

    # Iterate through the sequence, identifying tagged words and their details
    for index, word_or_tag in enumerate(words_and_tags, start=0):  # Starting index from 1 for readability
        # Check if the current element is a tagged word
        tag_match = re.match(r'(<w\d+>)(.*?)(<\/w\d+>)', word_or_tag)
        if tag_match:
            # Extract the tag, the word, and the closing tag
            opening_tag, word, closing_tag = tag_match.groups()
            # Store the index, the clean word, and the tag information
            tagged_words_info.append({
                'index': index,
                'word': word.strip(),
                'tag': opening_tag 
            })

    return tagged_words_info

# Example usage
text = "<w1> John </w1> je potrkal na vrata in Susan jih je odprla. <w2> Povabil jo je </w2> ven."
tagged_words_info = extract_tagged_words_with_details(text)

print(tagged_words_info)

[{'index': 0, 'word': 'John', 'tag': '<w1>'}, {'index': 10, 'word': 'Povabil jo je', 'tag': '<w2>'}]


In [5]:
df['tagged_words_details'] = df['text'].apply(extract_tagged_words_with_details)

In [6]:
import re
from nltk.tokenize import sent_tokenize, word_tokenize

def analyze_text(text):
    char_count = len(text)
    word_count = len(word_tokenize(text, language='slovene'))
    sentence_count = len(sent_tokenize(text, language='slovene'))
    
    return char_count, word_count, sentence_count

# Assuming `df` is your DataFrame and it has a 'clean_text' column
df['char_count'], df['word_count'], df['sentence_count'] = zip(*df['clean_text'].map(analyze_text))

df.head()  # Display the first few rows to verify the new variables

Unnamed: 0,text,label,split,clean_text,tagged_words_details,char_count,word_count,sentence_count
0,<w1> John </w1> je potrkal na vrata in Susan j...,True,train,John je potrkal na vrata in Susan jih je odprl...,"[{'index': 0, 'word': 'John', 'tag': '<w1>'}, ...",67,16,2
1,"<w1> Bob </w1> se ni razjezil na Sally, ki ga ...",False,train,"Bob se ni razjezil na Sally, ki ga je prekinil...","[{'index': 0, 'word': 'Bob', 'tag': '<w1>'}, {...",90,21,1
2,<w1> Znanstvenici </w1> preiskujeta tri vrste ...,True,train,"Znanstvenici preiskujeta tri vrste rib, ki so ...","[{'index': 0, 'word': 'Znanstvenici', 'tag': '...",115,21,2
3,<w1> Bill </w1> je računalniško konzolo podal ...,True,train,"Bill je računalniško konzolo podal Jane, ker j...","[{'index': 0, 'word': 'Bill', 'tag': '<w1>'}, ...",73,14,1
4,"<w1> John </w1> je najel Beth, zato da bi lahk...",True,train,"John je najel Beth, zato da bi lahko skrbela z...","[{'index': 0, 'word': 'John', 'tag': '<w1>'}, ...",51,12,1


In [7]:
# Filter the DataFrame based on the 'split' values and save each subset to a .jsonl file
for split_value in df['split'].unique():
    subset_df = df[df['split'] == split_value]
    
    # Define the file path based on the split value
    file_path = f'{split_value}.jsonl'
    
    # Save the subset DataFrame to a JSON Lines file
    subset_df.to_json(file_path, orient='records', lines=True, force_ascii=False)

    print(f'Saved {split_value} set to {file_path}')

Saved train set to train.jsonl
Saved test set to test.jsonl
Saved val set to val.jsonl
