### Importing relevant packages

In [1]:
## Load the data to get started
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from sklearn import preprocessing
import joblib
import os

In [107]:
stop_words_list= stopwords.words('arabic')

In [108]:
len(stop_words_list)

754

### Loading our data

In [2]:
path=os.getcwd()
parent=os.path.dirname(path)
data_path=os.path.join(parent,'Data fetching from api','data.csv')

In [3]:
df=pd.read_csv(data_path)
df.head()

Unnamed: 0,sentences,dialect
0,لكن بالنهاية .. ينتفض .. يغير .,IQ
1,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,IQ
2,مبين من كلامه خليجي,IQ
3,يسلملي مرورك وروحك الحلوه💐,IQ
4,وين هل الغيبه اخ محمد 🌸🌺,IQ


### Data cleaning

### Removing punctuations and word tokenization

In [110]:
tokenizer = nltk.RegexpTokenizer(r"[\u0621-\u064A]+")
new_words = [tokenizer.tokenize(df.sentences[i]) for i in range(len(df))]

In [111]:
len(new_words)

458197

In [112]:
clean_list=[' '.join(i) for i in new_words]

In [113]:
clean_list[:5]

['لكن بالنهاية ينتفض يغير',
 'يعني هذا محسوب على البشر حيونه ووحشيه وتطلبون من الغرب يحترمكم ويؤمن بدينكم ولاينعتكم بالإرهاب',
 'مبين من كلامه خليجي',
 'يسلملي مرورك وروحك الحلوه',
 'وين هل الغيبه اخ محمد']

In [114]:
clean_list[0].split()

['لكن', 'بالنهاية', 'ينتفض', 'يغير']

### Applying count vectorizer to get unique feature names

In [115]:
vec=CountVectorizer()

In [116]:
vec.fit(clean_list)

CountVectorizer()

In [117]:
feature_names= vec.get_feature_names()
feature_names[1000:1010]



['آلجنة',
 'آلجنسيه',
 'آلجنه',
 'آلجهآت',
 'آلجو',
 'آلجوف',
 'آلجيريا',
 'آلجيوب',
 'آلح',
 'آلحآاجه']

In [118]:
len(feature_names)

482112

### Getting word2idx dictionary. note: we start our indexing from 1 and leave 0 for out of vocab words

In [119]:
word2idx= {v:k for k,v in enumerate(feature_names,start=1)}

In [120]:
len(word2idx)

482112

In [146]:
word2idx

{'ءء': 1,
 'ءءءءءءءءء': 2,
 'ءاتها': 3,
 'ءادم': 4,
 'ءال': 5,
 'ءالله': 6,
 'ءالوجع': 7,
 'ءرءتو': 8,
 'ءع': 9,
 'ءلى': 10,
 'ءمن': 11,
 'ءنااهه': 12,
 'ءه': 13,
 'ءهـ': 14,
 'ءو': 15,
 'ءوالصدق': 16,
 'ءي': 17,
 'ءيل': 18,
 'آء': 19,
 'آآ': 20,
 'آآآ': 21,
 'آآآآ': 22,
 'آآآآآ': 23,
 'آآآآآآ': 24,
 'آآآآآآآآآآآآآآآآآآآآ': 25,
 'آآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآآه': 26,
 'آآآآآآآآآآآآآآآآآآآآمين': 27,
 'آآآآآآآآآآآآآآآآآمين': 28,
 'آآآآآآآآآآآآآآآآه': 29,
 'آآآآآآآآآآآآآمين': 30,
 'آآآآآآآآآآآآآنا': 31,
 'آآآآآآآآآآآخركرم': 32,
 'آآآآآآآآآآآه': 33,
 'آآآآآآآآخ': 34,
 'آآآآآآآراس': 35,
 'آآآآآآخ': 36,
 'آآآآآآه': 37,
 'آآآآآبه': 38,
 'آآآآآخر': 39,
 'آآآآآمـــووووت': 40,
 'آآآآآمـيـن': 41,
 'آآآآآه': 42,
 'آآآآآههه': 43,
 'آآآآآويلييي': 44,
 'آآآآلي': 45,
 'آآآبدا': 46,
 'آآآخ': 47,
 'آآآخر': 48,
 'آآآد': 49,
 'آآآمين': 50,
 'آآآه': 51,
 'آآآو': 52,
 'آآحـــب': 53,
 'آآحـــبك': 54,
 'آآخ': 55,
 'آآخخ': 56,
 'آآرب': 57,
 'آآرحـب': 58,
 'آآلام': 59,
 'آآمين': 60,
 'آآميين': 61,
 'آآنا'

In [147]:
import json
with open('worddict.json', 'w') as fp:
    json.dump(word2idx, fp)

### Encoding words to numbers. out of vocab words take zero index

In [122]:
def encode(word2idx,data):
   
    vectors=[]
    for sent in data:
        vector = []
        for word in sent.split():
            vector.append(word2idx.get(word,0))
        vectors.append(vector)
        
    return vectors

In [123]:
encoded_data= encode(word2idx,clean_list)

In [124]:
len(encoded_data)

458197

In [125]:
encoded_data[:10]

[[281306, 103523, 478924, 474055],
 [473407,
  360437,
  316822,
  237866,
  43158,
  186478,
  441544,
  399906,
  333748,
  64457,
  463152,
  442355,
  116370,
  426275,
  95724],
 [311215, 333748, 264689, 190444],
 [469236, 320731, 410606, 50591],
 [448588, 361628, 64888, 22250, 317279],
 [450590,
  37481,
  24250,
  260908,
  234151,
  208815,
  252039,
  448588,
  75506,
  237866,
  93661,
  65990,
  190444,
  429233,
  199739,
  448588,
  74692,
  125375,
  70986,
  466918,
  239263,
  190041,
  323835,
  62776,
  371085,
  113403,
  16119,
  61587,
  76738,
  205669,
  333748,
  448588,
  409774,
  448588,
  68623,
  71140,
  415502],
 [326351, 475600, 27371, 421337, 88525, 31585],
 [190032,
  384154,
  279683,
  84839,
  303084,
  219014,
  328432,
  100720,
  333748,
  311432,
  71140,
  122541,
  254458,
  190032,
  220113,
  274041],
 [478336,
  205696,
  244335,
  69005,
  71140,
  471122,
  217999,
  338382,
  422975,
  481103,
  122123,
  77002,
  179550,
  75789,
  4629

### sentence of maximum length

In [126]:
max_sequence_len = 0
for sentence in encoded_data:
    max_sequence_len = max(len(sentence), max_sequence_len)
print(max_sequence_len)

88


### Average sentence length

In [127]:
l= [len(i) for i in encoded_data]
np.mean(l)

13.354299569835682

### Let' pad our corpus to make sure they are all of the same length

In [128]:
max_sequence_len = 20

In [129]:
data_padded = np.zeros((len(encoded_data), max_sequence_len))
for i, sent in enumerate(encoded_data):
    data_padded[i, :len(sent)] = sent[:max_sequence_len]

In [130]:
data_padded.shape

(458197, 20)

In [131]:
data_padded[0]

array([281306., 103523., 478924., 474055.,      0.,      0.,      0.,
            0.,      0.,      0.,      0.,      0.,      0.,      0.,
            0.,      0.,      0.,      0.,      0.,      0.])

### Vocab size

In [132]:
VOCAB_SIZE = len(word2idx)
VOCAB_SIZE

482112

### Getting labels

In [133]:
le = preprocessing.LabelEncoder()

In [134]:
label_encoder= le.fit(df.dialect)

In [135]:
joblib.dump(label_encoder,'labelencoder')

['labelencoder']

In [136]:
y= label_encoder.transform(df.dialect)

### Checking inputs and labels shapes

In [137]:
data_padded.shape

(458197, 20)

In [138]:
y.shape

(458197,)

### Train/Test split

In [139]:
 X_train, X_test, y_train, y_test = train_test_split(data_padded, y, test_size=0.1, random_state=42)

In [140]:
X_train.shape,y_train.shape

((412377, 20), (412377,))

In [141]:
X_test.shape,y_test.shape

((45820, 20), (45820,))

### Saving our data for training

In [142]:
np.savez('train_data', inputs=X_train, targets=y_train)
np.savez('validation_data', inputs=X_test, targets=y_test)