# Load Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Main Functions

In [3]:
def load_data(file):
    # Load  data from file and return it as panads dataframe
    return pd.read_csv(file)

def scale_features(X_train, X_test, scaler='standard'):
    # load x data and returned it as a scaled data
    if scaler == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled        


# NLP Functions

In [12]:
def drop_null(train, test):
    train = train.dropna()
    test  = test.dropna().reset_index(drop=True)    
    return train, test

In [33]:
def filter_arabic(text):
    import re
    arabic_pattern = re.compile(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+')
    arabic_text = arabic_pattern.findall(text)
    return ' '.join(arabic_text)

# Model

## Load & Describe Data

In [7]:
train_data = load_data('train.csv')
test_data  = load_data('val.csv')

In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52758 entries, 0 to 52757
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  52758 non-null  object
 1   answer    52657 non-null  object
 2   label     52758 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17586 entries, 0 to 17585
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  17586 non-null  object
 1   answer    17547 non-null  object
 2   label     17586 non-null  object
dtypes: object(3)
memory usage: 412.3+ KB


## PreProcessing DATA

In [34]:
# 1.Drop Null Data From Train and Test Data   
train , test = drop_null(train_data, test_data)

In [36]:
train.head()

Unnamed: 0,question,answer,label
0,\nما هي مميزات و عيوب الدواء جلوكوفانس 500 5 و...,\n\nلكل علاج ايجابيته وسلبياته والتي تعتمد على...,الدم
1,\nاليك نتيجة تحليل هرمونات الغدة الدرقية علما ...,\n\nنعم. يجب تخفيض الجرعة، الا اذا كان سبب است...,الاورام-الخبيثة-والحميدة
2,\nحلول منزلية لأعراض ارتفاع ضغط الدم,\n\nيفضل عدم الاستغناء عن العلاج الدوائي لمرضى...,جراحة-عامة
3,\nعملت عملية دوالي الساقين قسطرة الليزر من شهر...,\n\nراجع طبيبك من اجري الجراحه افضل من يجيب لا...,أمراض-الجهاز-التنفسي
4,\nما حقيقة ان تمرين العضلة النعلية يخفض السكر ...,\n\nإذا قصدت تدليك العضلة فهذا كلام غير صحيح ....,مرض-السكري


In [39]:
# 2. Filtering Non-Arabic Content: Remove any irrelevant content that is not in Arabic
train['question'] = train['question'].apply(filter_arabic)
train['answer']   = train['answer'].apply(filter_arabic)
test['question']  = test['question'].apply(filter_arabic)
test['answer']    = test['answer'].apply(filter_arabic)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['question'] = train['question'].apply(filter_arabic)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['answer']   = train['answer'].apply(filter_arabic)


In [21]:
# 3.Text Segmentation

In [22]:
# 4.Normalization

In [23]:
# 5.Tokenization

In [None]:
# 6.Removing Stopwords

In [24]:
# 7.Stemming

In [25]:
# 8.Part-of-Speech Tagging

In [26]:
# 9.Named Entity Recognition

In [27]:
# 10.Handling Dialectal Variations
# https://medium.com/@neri.vvo/arabic-nlp-how-to-overcome-challenges-in-preprocessing-ed56de0c43e2

In [28]:
# 11. Feature Engineering:
# Extract relevant features from the text data, such as n-grams, part-of-speech tags, or named entities, depending on the requirements of your chatbot.

In [None]:
# 11. Vectorization:
# Convert the preprocessed text data into numerical vectors suitable for machine learning algorithms, such as TF-IDF vectors or word embeddings.