# English - Hindi Pipelines 

In [6]:
import pandas as pd
# Load the processed en-hi CSV into a DataFrame so it can be inspected below.
raw_data = pd.read_csv("../Datasets/WikiMatrix/Processed/processed_en-hi.csv")

In [7]:
# Preview the first 10 rows of the loaded data.
raw_data.head(10)

Unnamed: 0,English,Hindi
0,recite in the name of your lord who created—cr...,"अपने परवरदिगार का नाम ले कर पढ़ो, जिसने (दुनिय..."
1,they were tenants to their lord.,अतः वे अपने इष्ट परम प्रभु की उपासना में ही दत...
2,indeed your lord is the all-beneficent.,तुम्हारा रब एक है।
3,"i mean, we all lived in this century.","मेरा मतलब है, हम सभी को इस सदी में रहते थे।"
4,be steadfastly righteous!,अतः तुम वही करो जो उचित है।
5,"""2013 is here, and we are ready!"".","""2013 यहाँ है, और हम तैयार हैं!"
6,"""ədalət şükürov bio"".",वह उनके प्रति अत्यंत कृतज्ञ है।
7,(onkelos furthur queried) what/how (do you adv...,बादरायण ने उनको क्या समझा और भाष्यकारों ने उनक...
8,allah sent him (as an apostle) when he was for...,अल्लाह ने उसे (प्रेषित के रूप में) भेजा जब वह ...
9,have we not made for him two eyes?,क्या हमने उसके लिए दो आंखें नहीं बनाई हैं?


In [8]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from langdetect import detect, DetectorFactory, LangDetectException

# langdetect's detection is non-deterministic unless a seed is fixed; set it
# before any detect() call so repeated runs give consistent results.
DetectorFactory.seed = 0

# --- Pipeline configuration (English -> Hindi) ---

# CSV that is read as input, and CSV the cleaned rows are written to.
DATASET_PATH = "../Datasets/WikiMatrix/Processed/processed_en-hi.csv"
OUTPUT_PATH = "../Datasets/WikiMatrix/Processed/clean_en-hi.csv"

# DataFrame columns holding the source and target sentences.
SOURCE_COLUMN, TARGET_COLUMN = "English", "Hindi"

# Language codes that detect() must return for the source / target text.
SOURCE_LANG, TARGET_LANG = "en", "hi"


# Step 1: Drop Null and Empty Values
class DropNullAndEmptyValues(BaseEstimator, TransformerMixin):
    """Remove rows whose source or target text is missing or blank.

    Parameters
    ----------
    source_column : str, optional
        Name of the source-text column. When ``None`` (the default) the
        module-level ``SOURCE_COLUMN`` constant is looked up at transform time.
    target_column : str, optional
        Name of the target-text column. When ``None`` (the default) the
        module-level ``TARGET_COLUMN`` constant is looked up at transform time.
    """

    def __init__(self, source_column=None, target_column=None):
        self.source_column = source_column
        self.target_column = target_column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` with non-null, non-blank text in both columns."""
        source_column = SOURCE_COLUMN if self.source_column is None else self.source_column
        target_column = TARGET_COLUMN if self.target_column is None else self.target_column

        # Remove nulls first so the blank check only ever sees real values.
        non_null = X.dropna(subset=[source_column, target_column])

        # Build the masks from the null-free frame so their index is exactly the
        # index being filtered (no reliance on implicit index alignment).
        # astype(str) keeps the .str accessor usable for non-string columns.
        has_source = non_null[source_column].astype(str).str.strip() != ''
        has_target = non_null[target_column].astype(str).str.strip() != ''
        return non_null[has_source & has_target]


# Step 2: Validate Source Language
class ValidateSourceLanguage(BaseEstimator, TransformerMixin):
    """Keep only the rows whose source text is detected as ``source_lang``.

    Parameters
    ----------
    source_lang : str
        Language code that ``detect`` must return for a row to be kept.
    source_column : str
        Name of the column holding the source text.
    """

    def __init__(self, source_lang, source_column):
        self.source_lang = source_lang
        self.source_column = source_column

    def fit(self, X, y=None):
        """No fitting is required; return the transformer unchanged."""
        return self

    def _is_source_language(self, text):
        """Return True when ``text`` is detected as ``self.source_lang``."""
        try:
            detected = detect(text)
        except LangDetectException:
            # A failed detection counts as "not the expected language".
            return False
        return detected == self.source_lang

    def transform(self, X):
        """Return the rows of ``X`` whose source column passes the language check."""
        keep = X[self.source_column].apply(self._is_source_language)
        return X[keep]


# Step 3: Validate Target Language
class ValidateTargetLanguage(BaseEstimator, TransformerMixin):
    """Keep only the rows whose target text is detected as ``target_lang``.

    Parameters
    ----------
    target_lang : str
        Language code that ``detect`` must return for a row to be kept.
    target_column : str
        Name of the column holding the target text.
    """

    def __init__(self, target_lang, target_column):
        self.target_lang = target_lang
        self.target_column = target_column

    def fit(self, X, y=None):
        """No fitting is required; return the transformer unchanged."""
        return self

    def _is_target_language(self, text):
        """Return True when ``text`` is detected as ``self.target_lang``."""
        try:
            detected = detect(text)
        except LangDetectException:
            # A failed detection counts as "not the expected language".
            return False
        return detected == self.target_lang

    def transform(self, X):
        """Return the rows of ``X`` whose target column passes the language check."""
        keep = X[self.target_column].apply(self._is_target_language)
        return X[keep]


# Step 4: Drop Duplicate Rows
class DropDuplicates(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes repeated rows from a DataFrame."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` keeping only the first occurrence of each identical row."""
        deduplicated = X.drop_duplicates(keep="first")
        return deduplicated


# Step 5: Write Dataset to CSV
class WriteToCSV(BaseEstimator, TransformerMixin):
    """Pass-through step that saves the incoming DataFrame as a CSV file.

    Parameters
    ----------
    output_path : str
        Destination handed to ``DataFrame.to_csv``.
    """

    def __init__(self, output_path):
        self.output_path = output_path

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Write ``X`` to ``self.output_path`` without the index, then return ``X``."""
        X.to_csv(path_or_buf=self.output_path, index=False)
        return X


# Read the raw sentence pairs from disk
raw_data = pd.read_csv(DATASET_PATH)

# Cleaning steps, listed in the order they are executed
cleaning_steps = [
    ('drop_null_and_empty', DropNullAndEmptyValues()),
    ('validate_source_lang', ValidateSourceLanguage(SOURCE_LANG, SOURCE_COLUMN)),
    ('validate_target_lang', ValidateTargetLanguage(TARGET_LANG, TARGET_COLUMN)),
    ('drop_duplicates', DropDuplicates()),
    ('write_to_csv', WriteToCSV(OUTPUT_PATH)),
]
pipeline = Pipeline(cleaning_steps)

# Run every step; the last one also saves the result to OUTPUT_PATH
cleaned_data = pipeline.fit_transform(raw_data)

# Short preview of the cleaned dataset (optional)
print("Pipeline execution completed. Cleaned data preview:", cleaned_data.head(), sep="\n")


Pipeline execution completed. Cleaned data preview:
                                             English  \
0  recite in the name of your lord who created—cr...   
1                   they were tenants to their lord.   
2            indeed your lord is the all-beneficent.   
3              i mean, we all lived in this century.   
4                          be steadfastly righteous!   

                                               Hindi  
0  अपने परवरदिगार का नाम ले कर पढ़ो, जिसने (दुनिय...  
1  अतः वे अपने इष्ट परम प्रभु की उपासना में ही दत...  
2                                 तुम्हारा रब एक है।  
3        मेरा मतलब है, हम सभी को इस सदी में रहते थे।  
4                        अतः तुम वही करो जो उचित है।  


# English - Greek Pipelines 

In [21]:
import pandas as pd
# Load the processed en-el CSV into a DataFrame so it can be inspected below.
raw_data = pd.read_csv("../Datasets/WikiMatrix/Processed/processed_en-el.csv")

In [22]:
# Preview the first 10 rows of the loaded data.
raw_data.head(10)

Unnamed: 0,English,Greek
0,"""palace brothers: there is no one what will ta...",Ἡ Μάνη ἐφθόνησε τὸν Μπέη.
1,"to the sky, the sky-god, and the supreme god, ...",Ǧ (g με ανάποδη οξυβαρεία) Ġayn (Αραβικά) Ghay...
2,.]],Εκκλησία Αγίου Παντελεήμονος]]].
3,london: ]]].,Η Καθημερινή. ]]
4,for us this is a lesson and a warning.,Για εμάς αυτό είναι ένα μάθημα και μια προειδο...
5,"anyone who tells you differently is a liar.""","Όποιος σου πει το αντίθετο είναι ψεύτης""."
6,sports events in france for the weekend of 14–...,Τα αθλητικά γεγονότα στη Γαλλία για το Σαββατο...
7,they ask him to come in for questioning.,Ήρθαν λοιπόν για να το ζητήσουν.
8,i am always asking mama when you will come ...,Ρωτώ συνέχεια τη μαμά πότε θα έρθεις...
9,"we're friends with them, but there's a limit t...",Είμαστε φίλοι με αυτούς αλλά υπάρχει ένα όριο ...


In [23]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from langdetect import detect, DetectorFactory, LangDetectException

# langdetect's detection is non-deterministic unless a seed is fixed; set it
# before any detect() call so repeated runs give consistent results.
DetectorFactory.seed = 0

# --- Pipeline configuration (English -> Greek) ---

# CSV that is read as input, and CSV the cleaned rows are written to.
DATASET_PATH = "../Datasets/WikiMatrix/Processed/processed_en-el.csv"
OUTPUT_PATH = "../Datasets/WikiMatrix/Processed/clean_en-el.csv"

# DataFrame columns holding the source and target sentences.
SOURCE_COLUMN, TARGET_COLUMN = "English", "Greek"

# Language codes that detect() must return for the source / target text.
SOURCE_LANG, TARGET_LANG = "en", "el"


# Step 1: Drop Null and Empty Values
class DropNullAndEmptyValues(BaseEstimator, TransformerMixin):
    """Remove rows whose source or target text is missing or blank.

    Parameters
    ----------
    source_column : str, optional
        Name of the source-text column. When ``None`` (the default) the
        module-level ``SOURCE_COLUMN`` constant is looked up at transform time.
    target_column : str, optional
        Name of the target-text column. When ``None`` (the default) the
        module-level ``TARGET_COLUMN`` constant is looked up at transform time.
    """

    def __init__(self, source_column=None, target_column=None):
        self.source_column = source_column
        self.target_column = target_column

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` with non-null, non-blank text in both columns."""
        source_column = SOURCE_COLUMN if self.source_column is None else self.source_column
        target_column = TARGET_COLUMN if self.target_column is None else self.target_column

        # Remove nulls first so the blank check only ever sees real values.
        non_null = X.dropna(subset=[source_column, target_column])

        # Build the masks from the null-free frame so their index is exactly the
        # index being filtered (no reliance on implicit index alignment).
        # astype(str) keeps the .str accessor usable for non-string columns.
        has_source = non_null[source_column].astype(str).str.strip() != ''
        has_target = non_null[target_column].astype(str).str.strip() != ''
        return non_null[has_source & has_target]


# Step 2: Validate Source Language
class ValidateSourceLanguage(BaseEstimator, TransformerMixin):
    """Keep only the rows whose source text is detected as ``source_lang``.

    Parameters
    ----------
    source_lang : str
        Language code that ``detect`` must return for a row to be kept.
    source_column : str
        Name of the column holding the source text.
    """

    def __init__(self, source_lang, source_column):
        self.source_lang = source_lang
        self.source_column = source_column

    def fit(self, X, y=None):
        """No fitting is required; return the transformer unchanged."""
        return self

    def _is_source_language(self, text):
        """Return True when ``text`` is detected as ``self.source_lang``."""
        try:
            detected = detect(text)
        except LangDetectException:
            # A failed detection counts as "not the expected language".
            return False
        return detected == self.source_lang

    def transform(self, X):
        """Return the rows of ``X`` whose source column passes the language check."""
        keep = X[self.source_column].apply(self._is_source_language)
        return X[keep]


# Step 3: Validate Target Language
class ValidateTargetLanguage(BaseEstimator, TransformerMixin):
    """Keep only the rows whose target text is detected as ``target_lang``.

    Parameters
    ----------
    target_lang : str
        Language code that ``detect`` must return for a row to be kept.
    target_column : str
        Name of the column holding the target text.
    """

    def __init__(self, target_lang, target_column):
        self.target_lang = target_lang
        self.target_column = target_column

    def fit(self, X, y=None):
        """No fitting is required; return the transformer unchanged."""
        return self

    def _is_target_language(self, text):
        """Return True when ``text`` is detected as ``self.target_lang``."""
        try:
            detected = detect(text)
        except LangDetectException:
            # A failed detection counts as "not the expected language".
            return False
        return detected == self.target_lang

    def transform(self, X):
        """Return the rows of ``X`` whose target column passes the language check."""
        keep = X[self.target_column].apply(self._is_target_language)
        return X[keep]


# Step 4: Drop Duplicate Rows
class DropDuplicates(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes repeated rows from a DataFrame."""

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` keeping only the first occurrence of each identical row."""
        deduplicated = X.drop_duplicates(keep="first")
        return deduplicated


# Step 5: Write Dataset to CSV
class WriteToCSV(BaseEstimator, TransformerMixin):
    """Pass-through step that saves the incoming DataFrame as a CSV file.

    Parameters
    ----------
    output_path : str
        Destination handed to ``DataFrame.to_csv``.
    """

    def __init__(self, output_path):
        self.output_path = output_path

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Write ``X`` to ``self.output_path`` without the index, then return ``X``."""
        X.to_csv(path_or_buf=self.output_path, index=False)
        return X


# Read the raw sentence pairs from disk
raw_data = pd.read_csv(DATASET_PATH)

# Cleaning steps, listed in the order they are executed
cleaning_steps = [
    ('drop_null_and_empty', DropNullAndEmptyValues()),
    ('validate_source_lang', ValidateSourceLanguage(SOURCE_LANG, SOURCE_COLUMN)),
    ('validate_target_lang', ValidateTargetLanguage(TARGET_LANG, TARGET_COLUMN)),
    ('drop_duplicates', DropDuplicates()),
    ('write_to_csv', WriteToCSV(OUTPUT_PATH)),
]
pipeline = Pipeline(cleaning_steps)

# Run every step; the last one also saves the result to OUTPUT_PATH
cleaned_data = pipeline.fit_transform(raw_data)

# Short preview of the cleaned dataset (optional)
print("Pipeline execution completed. Cleaned data preview:", cleaned_data.head(), sep="\n")


Pipeline execution completed. Cleaned data preview:
                                             English  \
0  "palace brothers: there is no one what will ta...   
1  to the sky, the sky-god, and the supreme god, ...   
5       anyone who tells you differently is a liar."   
6  sports events in france for the weekend of 14–...   

                                               Greek  
0                          Ἡ Μάνη ἐφθόνησε τὸν Μπέη.  
1  Ǧ (g με ανάποδη οξυβαρεία) Ġayn (Αραβικά) Ghay...  
4  Για εμάς αυτό είναι ένα μάθημα και μια προειδο...  
5          Όποιος σου πει το αντίθετο είναι ψεύτης".  
6  Τα αθλητικά γεγονότα στη Γαλλία για το Σαββατο...  
