In [1]:
from google.colab import drive

# Attach Google Drive inside the Colab runtime; the dataset is later read
# from a path underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
!pip install alphabet_detector

Collecting alphabet_detector
  Downloading alphabet-detector-0.0.7.tar.gz (1.6 kB)
Building wheels for collected packages: alphabet-detector
  Building wheel for alphabet-detector (setup.py) ... [?25l[?25hdone
  Created wheel for alphabet-detector: filename=alphabet_detector-0.0.7-py3-none-any.whl size=2446 sha256=682d9b22f5741646a9721e245c2f741f3929e6d14e63a95da71b7aa07d1aa063
  Stored in directory: /root/.cache/pip/wheels/22/8c/ab/4afb1765f2b8450f894a1f06c9aa2b3f8e73f2fb8b55849e17
Successfully built alphabet-detector
Installing collected packages: alphabet-detector
Successfully installed alphabet-detector-0.0.7


In [28]:
import re
import string
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from alphabet_detector import AlphabetDetector
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [4]:
# Read the dialect dataset from the mounted Drive folder. Only '\n' is
# treated as a row terminator (presumably because the text column can
# contain other line-break characters -- TODO confirm).
dataset_csv = "/content/drive/MyDrive/dialect_dataset.csv"
df = pd.read_csv(dataset_csv, lineterminator='\n')

In [5]:
# Show the distinct values present in the label column before encoding them.
df['label'].unique()

array(['IQ', 'BH', 'LY'], dtype=object)

In [6]:
# Encode the dialect codes as integer class ids.
# Values that are already one of the ids are kept unchanged, so re-running
# this cell is a no-op instead of turning every label into None (which is
# what a plain dict.get lookup does once the column holds integers).
# Anything that is neither a known code nor a known id still becomes None.
LABEL_TO_ID = {'IQ': 0, 'BH': 1, 'LY': 2}
df['label'] = df['label'].apply(
    lambda value: value if value in LABEL_TO_ID.values() else LABEL_TO_ID.get(value)
)

In [7]:
# Preview the first rows of the frame after the label column was encoded.
df.head()

Unnamed: 0.1,Unnamed: 0,ids,text,label
0,0,1175358310087892992,@Nw8ieJUwaCAAreT ŸÑŸÉŸÜ ÿ®ÿßŸÑŸÜŸáÿßŸäÿ© .. ŸäŸÜÿ™ŸÅÿ∂ .. Ÿäÿ∫Ÿäÿ± .,0
1,1,1022409931029458944,@jolnar121 ÿßŸÑÿ≥ÿ≠ŸÑŸá ÿ∂ŸäŸÅŸä Ÿä ÿ®ÿ™ÿ∑ŸÑÿπ ŸÑŸÉ ÿ≥ÿ≠ŸÑŸäŸáüòÖüòÖ,1
2,2,1175416117793349632,@7zNqXP0yrODdRjK ŸäÿπŸÜŸä Ÿáÿ∞ÿß ŸÖÿ≠ÿ≥Ÿàÿ® ÿπŸÑŸâ ÿßŸÑÿ®ÿ¥ÿ± .. ÿ≠...,0
3,3,1022430374696239232,@haneenalmwla ÿßŸÑŸÑŸá Ÿäÿ®ÿßÿ±ŸÉ ŸÅŸäŸáÿß Ÿàÿ®ÿßŸÑÿπÿßŸÅŸäŸá üòãüòãüòã,1
4,4,1175450108898565888,@KanaanRema ŸÖÿ®ŸäŸÜ ŸÖŸÜ ŸÉŸÑÿßŸÖŸá ÿÆŸÑŸäÿ¨Ÿä,0


In [11]:
# Arabic short-vowel / nunation marks plus the tatweel elongation character.
# The code points are written as \u escapes (interpreted by the `re` module)
# so the pattern does not depend on how the notebook file is encoded or
# decoded; mis-decoded literals here silently match nothing in Arabic text.
arabic_diacritics = re.compile(r"""
                             \u0651    | # Tashdid (shadda)
                             \u064E    | # Fatha
                             \u064B    | # Tanwin Fath
                             \u064F    | # Damma
                             \u064C    | # Tanwin Damm
                             \u0650    | # Kasra
                             \u064D    | # Tanwin Kasr
                             \u0652    | # Sukun
                             \u0640      # Tatwil/Kashida
                         """, re.VERBOSE)


def remove_diacritics(text):
    """Return ``text`` with every character matched by ``arabic_diacritics`` removed.

    Args:
        text: The string to strip.

    Returns:
        A new string without the listed diacritic marks and tatweel characters;
        all other characters are left in place.
    """
    return arabic_diacritics.sub('', text)


# ASCII punctuation plus: U+060C Arabic comma, U+061B Arabic semicolon,
# U+061F Arabic question mark, U+00AB / U+00BB guillemets. Written as escapes
# so the set cannot be corrupted by a wrong file encoding. The table is built
# once here rather than on every call.
_PUNCTUATION_DELETE_TABLE = str.maketrans(
    '', '', string.punctuation + "\u060C\u061B\u061F\u00AB\u00BB"
)


def remove_punctuation(s):
    """Delete ASCII and common Arabic punctuation characters from ``s``.

    Args:
        s: The string to process.

    Returns:
        A new string with every punctuation character removed (nothing is
        inserted in its place); all other characters are unchanged.
    """
    return s.translate(_PUNCTUATION_DELETE_TABLE)


# ASCII punctuation plus: U+060C Arabic comma, U+061B Arabic semicolon,
# U+061F Arabic question mark, U+00AB / U+00BB guillemets. Written as escapes
# so the set cannot be corrupted by a wrong file encoding. The table is built
# once here rather than on every call.
_PUNCTUATION_CHARS_FOR_SPACING = string.punctuation + "\u060C\u061B\u061F\u00AB\u00BB"
_PUNCTUATION_TO_SPACE_TABLE = str.maketrans(
    _PUNCTUATION_CHARS_FOR_SPACING, ' ' * len(_PUNCTUATION_CHARS_FOR_SPACING)
)


def remove_punctuation2(s): # replace punctuation with space
    """Replace each ASCII or common Arabic punctuation character with a space.

    Unlike deleting the characters, this keeps words that were separated only
    by punctuation from being glued together.

    Args:
        s: The string to process.

    Returns:
        A new string of the same length in which every punctuation character
        has been replaced by a single space.
    """
    return s.translate(_PUNCTUATION_TO_SPACE_TABLE)

'''
def html2text(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
'''

def remove_links(text):
    """Strip URL-like substrings from ``text``.

    A match is an optional ``http``/``https`` prefix, then ``://``, then any
    run of word characters and ``. / ? = & %`` that ends on a word boundary.
    Characters outside that set (for example ``-``) end the match, so the
    remainder of such a URL stays in the text.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every match replaced by the empty string.
    """
    link_pattern = re.compile(
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',
        flags=re.MULTILINE,
    )
    return link_pattern.sub('', text)


def remove_empty_lines(text):
    """Drop blank lines and trailing whitespace from each remaining line.

    Args:
        text: A string whose lines are separated by ``"\\n"``.

    Returns:
        The lines that are non-empty after right-stripping, each right-stripped
        (leading whitespace is kept), joined back together with ``"\\n"``.
    """
    kept_lines = []
    for raw_line in text.split("\n"):
        trimmed = raw_line.rstrip()
        if trimmed:
            kept_lines.append(trimmed)
    return '\n'.join(kept_lines)


def remove_repeating_char(text):
    """Collapse runs of one repeated character down to exactly two copies.

    Runs of length two are left as they are; characters that ``.`` does not
    match (newlines) are never collapsed.

    Args:
        text: The string to process.

    Returns:
        ``text`` with every run of two or more identical characters replaced
        by two of that character.
    """
    repeated_run = re.compile(r'(.)\1+')
    two_copies = r'\1\1'
    return repeated_run.sub(two_copies, text)


def keep_only_arabic(text):
    """Keep only the words of ``text`` that pass the Arabic-word filter.

    A whitespace-separated word is kept when it is longer than one character,
    ``AlphabetDetector.is_arabic`` accepts it, and ``str.isalpha`` is true for
    it. Line structure is preserved: a line with no surviving words becomes an
    empty line.

    Args:
        text: The string to filter; may contain several lines.

    Returns:
        The filtered lines, words joined by single spaces and lines joined by
        ``"\\n"``.
    """
    detector = AlphabetDetector()

    def is_wanted(token):
        # Same three checks, evaluated in the same short-circuit order.
        return len(token) > 1 and detector.is_arabic(token) and token.isalpha()

    filtered_lines = [
        ' '.join(token for token in row.split() if is_wanted(token))
        for row in text.splitlines()
    ]
    return '\n'.join(filtered_lines)


def clean_text(text):
    """Run ``text`` through the notebook's cleaning steps, in a fixed order.

    The steps are: strip links, strip diacritics, replace punctuation with
    spaces, keep only Arabic words, collapse repeated characters, and drop
    empty lines. Each step receives the output of the previous one.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    steps = (
        remove_links,
        remove_diacritics,
        remove_punctuation2,
        keep_only_arabic,
        remove_repeating_char,
        remove_empty_lines,
    )
    cleaned = text
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [9]:
# Clean every entry of the text column. Series.apply keeps the values as
# ordinary Python strings; going through np.array([...]) first would build a
# fixed-width unicode array padded to the longest entry before pandas
# converts it back.
df['text'] = df['text'].apply(clean_text)

In [12]:
# Model input is the cleaned text column; the target is the encoded label.
X = df['text']
y = df['label']

In [13]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
(X_train, X_val,
 y_train, y_val) = train_test_split(X, y, test_size=0.2, random_state=20)

In [19]:
# Bag-of-words counts -> TF-IDF weighting -> linear model trained with SGD.
# NOTE(review): 'epsilon_insensitive' is listed in the scikit-learn docs among
# the losses designed for regression; consider comparing against the default
# 'hinge' loss -- verify before relying on this model.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='epsilon_insensitive',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
    )),
]
text_clf = Pipeline(pipeline_steps)

In [20]:
# Fit the whole pipeline on the training split; as the last expression in the
# cell, the returned fitted pipeline is also displayed.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, loss='epsilon_insensitive',
                               random_state=42))])

In [27]:
# Score the fitted pipeline on the validation split.
predicted = text_clf.predict(X_val)

# Fraction of validation rows whose predicted id equals the true id.
accuracy = np.mean(predicted == y_val)
print(accuracy)

# The report labels its rows with the class ids rendered as strings.
target_names = [str(class_id) for class_id in (0, 1, 2)]

print('7. Evaluation Metrics')
report = metrics.classification_report(
    y_val,
    predicted,
    target_names=target_names,
)
print(report)

# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predicted))

0.6271025823264629
7. Evaluation Metrics
              precision    recall  f1-score   support

           0       0.90      0.32      0.47      3149
           1       0.58      0.98      0.73      4197
           2       0.98      0.15      0.26      1096

    accuracy                           0.63      8442
   macro avg       0.82      0.48      0.49      8442
weighted avg       0.75      0.63      0.57      8442

[[1005 2142    2]
 [  74 4122    1]
 [  33  896  167]]


In [30]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises; a bare open() passed straight
# to pickle.dump leaves the handle to be closed whenever it is garbage
# collected. Only unpickle this file from a source you trust.
with open("dialect_machine.pkl", 'wb') as model_file:
    pickle.dump(text_clf, model_file)