## Read File to DataFrame

In [3]:
# Path of the raw CSV, resolved relative to the notebook's working directory.
input_file = "original.csv"

In [4]:
import pandas as pd

In [5]:
# Load the whole raw CSV into memory; later cells read the "Category",
# "Index" and "News Text" columns from it.
df = pd.read_csv(input_file)

In [6]:
# (rows, columns) of the raw frame, shown before any sampling is done.
df.shape

(111861, 8)

### Select Stratified Random Sample of 8000 (2000 per Category)

In [7]:
# One group per distinct "Category" value. group_keys=False keeps the group
# label out of the index of whatever a later .apply() on this object returns.
groupby = df.groupby("Category", group_keys=False)

In [8]:
# Draw 2000 rows from every category. The same fixed seed is passed to each
# group's sample() call, so the draw is repeatable across runs.
# NOTE(review): sample() defaults to replace=False, so this presumably fails
# if any category has fewer than 2000 rows - confirm against the data.
sample_df = groupby.apply(lambda x: x.sample(n=2000, random_state= 91))

In [9]:
# Discard the "Index" column carried over from the CSV.
# The result is rebound instead of using inplace=True, so the cell reads as an
# explicit "old frame -> new frame" step and no object is mutated behind the
# back of another cell that may still reference it.
sample_df = sample_df.drop(columns=["Index"])

In [10]:
# The sampled rows still carry their row labels from the raw frame; replace
# them with a fresh 0..n-1 index (drop=True discards the old labels instead of
# keeping them as a column).
sample_df = sample_df.reset_index(drop=True)

In [11]:
# Sanity check: 4 categories x 2000 rows, and one column fewer than the raw
# frame now that "Index" has been dropped.
sample_df.shape

(8000, 7)

In [12]:
# Confirm every category contributes the same number of rows to the sample.
sample_df["Category"].value_counts()

Business & Economics    2000
Entertainment           2000
Science & Technology    2000
Sports                  2000
Name: Category, dtype: int64

## Fix Spacing around English Numerals

In [13]:
import re

In [14]:
def fix_spacing(text):
    """Separate digit runs from directly adjacent text with a single space.

    A space is inserted only at boundaries where a digit touches a
    non-whitespace, non-digit character, so numbers that are already
    delimited by whitespace are left untouched (no doubled spaces).

    Note: on ``str`` patterns ``\\d`` matches every Unicode decimal digit,
    not only ASCII 0-9, so native-script digit runs are spaced out as well.

    Args:
        text: The string to process.

    Returns:
        A copy of ``text`` with one space inserted at each digit/text boundary
        that had no whitespace.
    """
    # Zero-width matches: (non-digit, non-space)|digit  and  digit|(non-digit, non-space).
    return re.sub(r'(?<=[^\d\s])(?=\d)|(?<=\d)(?=[^\d\s])', ' ', text)

## Convert English Numerals to Arabic

In [15]:
# following code has been adapted from: https://stackoverflow.com/questions/26626238/how-to-convert-normal-numbers-into-arabic-numbers-in-django
def en_to_ar_num(number_string):
    """Replace each ASCII digit in ``number_string`` with its Arabic-script glyph.

    Characters that have no entry in the table (decimal points, thousands
    separators, digits that are already non-ASCII, ...) are copied through
    unchanged instead of raising ``KeyError``.

    Args:
        number_string: String whose ASCII digits should be converted.

    Returns:
        A new string of the same length with every ASCII digit substituted.
    """
    # NOTE(review): the target glyphs appear to mix Arabic-Indic (U+066x) and
    # Extended Arabic-Indic (U+06Fx) code points - confirm which single set the
    # downstream consumers expect before normalising this table.
    dic = {
        '0': '۰',
        '1': '١',
        '2': '٢',
        '3': '۳',
        '4': '٤',
        '5': '۵',
        '6': '٦',
        '7': '۷',
        '8': '۸',
        '9': '۹',
    }

    # dict.get(char, char) falls back to the input character when unmapped.
    return "".join(dic.get(char, char) for char in number_string)

en_to_ar_num("124") # ١٢٤

'١٢٤'

In [16]:
def replace_numerals(text):
    """Convert every run of ASCII digits in ``text`` via ``en_to_ar_num``.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each ASCII digit run replaced by the converter's output;
        all other characters are left as they are.
    """
    # [0-9] rather than \d: on str patterns \d also matches non-ASCII decimal
    # digits (e.g. digits already written in Arabic script), and the converter
    # only has mappings for ASCII 0-9.
    return re.sub(r'[0-9]+', lambda match: en_to_ar_num(match.group(0)), text)

## Remove Non-Alphanumeric Symbols

In [17]:
def remove_non_alpha(text):
    """Delete every character that is neither a word character nor whitespace.

    For ``str`` input, ``\\w`` covers Unicode letters, digits and the
    underscore, so those survive; punctuation and symbols are removed outright
    (nothing is substituted in their place).

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the matched symbol characters.
    """
    symbol_pattern = re.compile(r'[^\w\s]')
    return symbol_pattern.sub("", text)

## Remove Extra Whitespaces

In [18]:
def remove_extra_spaces(text):
    """Normalise whitespace in ``text``.

    Every run of whitespace (spaces, tabs, newlines, ...) becomes one space,
    and whitespace at either end of the string is removed.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    # After collapsing, at most one space can remain at each end.
    return collapsed.strip()

## Check for Remaining Non-Urdu Characters

In [19]:
def is_urdu(char):
    """Tell whether ``char`` should be kept when filtering to Urdu text.

    A character is accepted when it is the plain space (so word boundaries
    survive filtering) or when it lies in the code point range
    U+0600..U+06FF inclusive. Other whitespace such as tabs or newlines is
    rejected.

    Args:
        char: A single-character string.

    Returns:
        True if the character is accepted, False otherwise.
    """
    return char == ' ' or '\u0600' <= char <= '\u06FF'

### Remove Non-Urdu Characters (if any)

In [20]:
def remove_non_urdu(text):
    """Keep only the characters of ``text`` that ``is_urdu`` accepts.

    Args:
        text: The string to filter.

    Returns:
        A new string made of the accepted characters, in their original order.
    """
    return ''.join(filter(is_urdu, text))

## Remove Stop Words

In [21]:
# Read the stop-word list: one entry per line, UTF-8 encoded.
with open("urdu_stopwords.txt", mode="r", encoding="utf-8") as file:
    lines = file.readlines()

# Trim the trailing newline (and any surrounding whitespace) from every entry.
stop_words = list(map(str.strip, lines))

In [22]:
def remove_stop_words (example):
    """Drop every whitespace-delimited token of ``example`` found in ``stop_words``.

    Tokens are compared by exact string equality. The surviving tokens are
    re-joined with single spaces, so the original whitespace between tokens is
    not preserved.

    Args:
        example: The text to filter.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Build a set once per call so each token lookup is O(1) instead of a
    # linear scan of the whole stop-word list for every token.
    stop_set = frozenset(stop_words)
    return " ".join(word for word in example.split() if word not in stop_set)

## Apply Transformations to Sampled Set

In [23]:
def apply_functions(df):
    """Run the text-cleaning steps, in order, over the 'News Text' column.

    The column is overwritten on the frame that was passed in after each
    step, and that same frame object is returned.

    Args:
        df: DataFrame with a 'News Text' column of strings.

    Returns:
        The same ``df``, with 'News Text' replaced by the cleaned text.
    """
    # Each step consumes the output of the previous one, so order matters.
    cleaning_steps = (
        fix_spacing,
        replace_numerals,
        remove_non_alpha,
        remove_extra_spaces,
        remove_non_urdu,
        remove_stop_words,
    )
    for step in cleaning_steps:
        df['News Text'] = df['News Text'].apply(step)
    return df

In [24]:
# Clean the 'News Text' column of the sample. apply_functions writes into the
# frame it receives, so re-running this cell feeds already-cleaned text
# through the steps again.
sample_df = apply_functions(sample_df)

## Split Into Train, Validation and Test Sets

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# First split: 75% train / 25% held-out pool, stratified on "Category" so the
# class balance carries over to both parts; seeded for repeatability.
train_df, temp_df = train_test_split (sample_df, test_size=0.25, stratify=sample_df["Category"], random_state=1)

In [27]:
# Check the per-category balance of the training split.
train_df["Category"].value_counts()

Science & Technology    1500
Business & Economics    1500
Sports                  1500
Entertainment           1500
Name: Category, dtype: int64

In [28]:
# Second split: halve the held-out pool into validation and test sets (12.5%
# of the sample each), again stratified on "Category" and seeded.
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["Category"], random_state=2)

In [29]:
# Check the per-category balance of the validation split.
val_df["Category"].value_counts()

Science & Technology    250
Sports                  250
Business & Economics    250
Entertainment           250
Name: Category, dtype: int64

In [30]:
# Check the per-category balance of the test split.
test_df["Category"].value_counts()

Entertainment           250
Science & Technology    250
Business & Economics    250
Sports                  250
Name: Category, dtype: int64

### Write to File

In [31]:
# Persist each split next to the notebook; index=False leaves the row labels
# out of the CSV files.
for out_path, split_df in (
    ("train_set.csv", train_df),
    ("val_set.csv", val_df),
    ("test_set.csv", test_df),
):
    split_df.to_csv(out_path, index=False)