In [None]:
# pip install pyarabic
# pip install googletrans==4.0.0-rc1
# pip install openpyxl
# pip install -U deep-translator

In [1]:
import pandas as pd
import pyarabic.araby as araby
import numpy as np
from googletrans import Translator
from deep_translator import GoogleTranslator
import time
import tqdm
import json
import random

In [2]:
pd.set_option("display.max_columns", None)

# TRANSLATE DATA FROM AR TO EN

In [None]:
# READ ORIGINAL DATA
data = pd.read_excel(
    r"C:\Users\sanav\work\D2R\NLO\University\public\data\UNI-IND-2022-tamher.xlsx",
    dtype="object",
)
df.dropna(inplace=True)

df = df[
    [
        "IndicatorCode",
        "IndicatorDescription",
        "Nationality",
        "Gender",
        "Graduation Year",
        "EducationLevel",
        "GeneralMajorName",
        "NarrowMajorName",
        "MajorCodeByClassification",
        "MajorNameByClassification",
        "GOSIoccupationDescription",
        "ISCOOccupationDescription",
        "IsMatched",
        "Employment Information Source",
        "PeriodToEmployment",
        "IndicatorValue"
    ]
]

In [None]:
# translation code
def translate_text(df, column, translation_results):
    a_to_e_translated = {}
    unique_values = df[column].unique().tolist()

    for value in tqdm.tqdm(unique_values, total=len(unique_values), desc=f"Translating {column}..."):
        if isinstance(value, str):
            if value not in a_to_e_translated:
                success = False
                retries = 3
                while not success and retries > 0:
                    try:
                        translator = GoogleTranslator(source='auto', target='en')
                        translated_text = translator.translate(value)
                        a_to_e_translated[value] = translated_text
                        success = True
                    except Exception as e:
                        print(f"Error translating '{value}': {e}. Retrying in 1 minute...")
                        time.sleep(60)
                        retries -= 1
                if not success:
                    print(f"Failed to translate '{value}' after retries.")
                    a_to_e_translated[value] = value
        else:
            a_to_e_translated[value] = str(value)
    translation_results[column] = a_to_e_translated
    df[f'{column}'] = df[column].map(lambda x: a_to_e_translated.get(x, x))

columns_to_translate = [
        "ISCOOccupationDescription"
        
]

translation_results = {}
for column in columns_to_translate:
    translate_text(df, column, translation_results)

In [None]:
output_excel_path = r"C:\Users\sanav\work\D2R\NLO\University\public\output\ar_to_en_translations.xlsx"
with pd.ExcelWriter(output_excel_path, engine='xlsxwriter') as writer:
    for sheet_name, key_value_pairs in translation_results.items():
        df = pd.DataFrame(list(key_value_pairs.items()), columns=['Key', 'Value'])
        df.to_excel(writer, index=False, sheet_name=sheet_name)

# DATA TRANSLATION

In [44]:
# READ ORIGINAL DATA
data = pd.read_excel(
    r"C:\Users\sanav\work\D2R\NLO\University\public\data\UNI-IND-2022-tamher.xlsx",
    dtype="object",
)

In [46]:
df = data.copy()

In [47]:
df.shape

(253883, 16)

In [48]:
df.dropna(inplace=True)

df = df[
    [
        "IndicatorDescription",
        "IndicatorValue",
        "Nationality",
        "Gender",
        "Graduation Year",
        "EducationLevel",
        "GeneralMajorName",
        "NarrowMajorName",
        "MajorNameByClassification",
        # "GOSIoccupationDescription",
        "ISCOOccupationDescription",
        "PeriodToEmployment",
    ]
]

In [49]:
df.columns

Index(['IndicatorDescription', 'IndicatorValue', 'Nationality', 'Gender',
       'Graduation Year', 'EducationLevel', 'GeneralMajorName',
       'NarrowMajorName', 'MajorNameByClassification',
       'ISCOOccupationDescription', 'PeriodToEmployment'],
      dtype='object')

In [50]:
# Code for changing values from Arabic to English in original data
translated_results = {}
excel_data = pd.read_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\data\reflected.xlsx", sheet_name=None)

for sheet_name, sheet_data in excel_data.items():
    translated_results[sheet_name] = {
        str(key).strip(): str(value).strip() if isinstance(value, str) else value
        for key, value in zip(sheet_data.iloc[:, 0], sheet_data.iloc[:, 1])
    }

for column, translation_dict in translated_results.items():
    if column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].str.strip()
        df[column] = df[column].map(translation_dict).fillna(df[column])

In [51]:
df = df[df['Nationality'] == 'Saudi']
df.reset_index(drop=True, inplace=True)
df.replace(['N/A', 'N.A.', np.nan], None, inplace=True)

In [52]:
df.columns

Index(['IndicatorDescription', 'IndicatorValue', 'Nationality', 'Gender',
       'Graduation Year', 'EducationLevel', 'GeneralMajorName',
       'NarrowMajorName', 'MajorNameByClassification',
       'ISCOOccupationDescription', 'PeriodToEmployment'],
      dtype='object')

In [53]:
# ignore_values = {
#     "Gender": ['Unclassified'],
#     'EducationLevel' : ['Unclassified'],
#     'GeneralMajorName': ['Unknown programs'],
#     'NarrowMajorName': ['Unclassified programs'],
#     "Major": [
#         'Unknown Specializations',
#         'Unspecified Specialization in Social and Behavioral Sciences',
#         'Unspecified Specialization in Humanities (Excluding Languages)',
#         'Unspecified Specialization in Business, Administration, and Law',
#         'Unspecified Specialization in Health',
#         'Unspecified Specialization in Physical Sciences',
#         'Unspecified Specialization in Security Services',
#         'Unspecified Specialization in Business and Management'
#     ],
#     'ISCOOccupationDescription': [0]
# }

ignore_values = {
    "Gender": ['Unclassified'],
    'EducationLevel' : ['Unclassified'],
    'GeneralMajorName': ['Unknown programs'],
    'NarrowMajorName': ['Unclassified programs'],
    "Major": [
        'Unknown Specializations'
    ],
    'ISCOOccupationDescription': [0]
}

In [54]:
# Remove rows containing the specified values
for column, values in ignore_values.items():
    if column in df.columns:
        df = df[~df[column].isin(values)]


# # Replace specified values with None
# for column, values in ignore_values.items():
#     if column in df.columns:
#         df[column] = df[column].replace(values, None)

df.reset_index(drop=True, inplace=True)

In [92]:
df.shape

(252408, 11)

In [55]:
df.to_json(r"C:\Users\sanav\work\D2R\NLO\University\public\output\translated_full_data.json", orient="records", indent=4, force_ascii=False)
df.to_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\output\translated_full_data.xlsx", index=False)

# TRANSLATE FROM EN TO AR

In [3]:
df = pd.read_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\output\translated_full_data.xlsx", dtype='object')

In [4]:
df.shape

(252408, 11)

In [5]:
df.head()

Unnamed: 0,IndicatorDescription,IndicatorValue,Nationality,Gender,Graduation Year,EducationLevel,GeneralMajorName,NarrowMajorName,MajorNameByClassification,ISCOOccupationDescription,PeriodToEmployment
0,Number of Graduates,4715,Saudi,Female,2022,Bachelor's,"Business, administration and law",Business and administration,Business Administration,,
1,Number of Graduates,16,Saudi,Female,2022,Bachelor's,"Business, administration and law",Business and administration,Business Administration,Account Manager,
2,Number of Graduates,101,Saudi,Female,2022,Bachelor's,"Business, administration and law",Business and administration,Business Administration,accountant,
3,Number of Graduates,15,Saudi,Female,2022,Bachelor's,"Business, administration and law",Business and administration,Business Administration,accountant,
4,Number of Graduates,17,Saudi,Female,2022,Bachelor's,"Business, administration and law",Business and administration,Business Administration,Accountant,


In [6]:
df['Gender'].value_counts()

Gender
Male      126205
Female    126203
Name: count, dtype: int64

In [7]:
# Code for changing values from Arabic to English in original data
translated_results = {}
excel_data = pd.read_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\data\reflected.xlsx", sheet_name=None)

for sheet_name, sheet_data in excel_data.items():
    # Swap the key-value pairs - now English will be key and Arabic will be value
    translated_results[sheet_name] = {
        str(value).strip(): str(key).strip() if isinstance(key, str) else key
        for key, value in zip(sheet_data.iloc[:, 0], sheet_data.iloc[:, 1])
    }

# Apply the translations
for column, translation_dict in translated_results.items():
    if column in df.columns:
        if df[column].dtype == 'object':
            df[column] = df[column].str.strip()
        df[column] = df[column].map(translation_dict).fillna(df[column])


In [8]:
df.head()

Unnamed: 0,IndicatorDescription,IndicatorValue,Nationality,Gender,Graduation Year,EducationLevel,GeneralMajorName,NarrowMajorName,MajorNameByClassification,ISCOOccupationDescription,PeriodToEmployment
0,Number of Graduates,4715,سعودي,أنثى,2022,بكالوريوس,الأعمال والإدارة والقانون,الأعمال والإدارة,إدارة الأعمال,,
1,Number of Graduates,16,سعودي,أنثى,2022,بكالوريوس,الأعمال والإدارة والقانون,الأعمال والإدارة,إدارة الأعمال,مدير حسابات,
2,Number of Graduates,101,سعودي,أنثى,2022,بكالوريوس,الأعمال والإدارة والقانون,الأعمال والإدارة,إدارة الأعمال,محاسب,
3,Number of Graduates,15,سعودي,أنثى,2022,بكالوريوس,الأعمال والإدارة والقانون,الأعمال والإدارة,إدارة الأعمال,محاسب,
4,Number of Graduates,17,سعودي,أنثى,2022,بكالوريوس,الأعمال والإدارة والقانون,الأعمال والإدارة,إدارة الأعمال,كاتب حسابات,


In [7]:
df.shape

(252408, 11)

In [8]:
df.columns

Index(['IndicatorDescription', 'IndicatorValue', 'Nationality', 'Gender',
       'Graduation Year', 'EducationLevel', 'GeneralMajorName',
       'NarrowMajorName', 'MajorNameByClassification',
       'ISCOOccupationDescription', 'PeriodToEmployment'],
      dtype='object')

In [10]:
df['NarrowMajorName'].unique()

array(['Business and administration', 'law',
       'Basic programs and qualifications', 'education',
       'Personal services', 'Welfare', 'health', 'Journalism and media',
       'Social and behavioral sciences', 'environment',
       'Mathematics and Statistics',
       'Biological sciences and related sciences', 'Physical sciences',
       'Human studies except languages', 'Arts', 'Languages',
       'Manufacturing and processing', 'Architecture and construction',
       'Engineering and engineering crafts',
       'Multi -disciplinary programs and qualifications include telecommunications and information technology',
       'Communications and Information Technology',
       'Skills and personal development development', 'Forestry',
       'General hygiene and occupational health services',
       'Unlimited programs in business, administration and law',
       'Multidisciplinary programs and qualifications include health and wellbeing',
       'Other programs in natural sciences

In [24]:
df.to_json(r"C:\Users\sanav\work\D2R\NLO\University\public\output\translated_full_data_arabic.json", orient="records", indent=4, force_ascii=False)
df.to_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\output\translated_full_data_arabic.xlsx", index=False)

In [25]:
df['Gender'].unique()

array(['أنثى', 'ذكر'], dtype=object)

EXTRA CODES

In [11]:
import pandas as pd
import json

# Read the Excel file with all sheets
excel_data = pd.read_excel(r"C:\Users\sanav\work\D2R\NLO\University\public\data\reflected.xlsx", sheet_name=None)

# Create a dictionary to store all translations
translation_dict = {}

# Process each sheet
for sheet_name, sheet_data in excel_data.items():
    # Convert the two columns into a dictionary
    translation_dict[sheet_name] = {
        str(key).strip(): str(value).strip() if isinstance(value, str) else value
        for key, value in zip(sheet_data.iloc[:, 0], sheet_data.iloc[:, 1])
    }

# Save as JSON file
with open(r"C:\Users\sanav\work\D2R\NLO\University\public\data\translations.json", 'w', encoding='utf-8') as f:
    json.dump(translation_dict, f, ensure_ascii=False, indent=4)