In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Read The Cleaned Data

In [6]:
data=pd.read_csv('dialect_dataset with cleaning text.csv')

In [7]:
data.head()

Unnamed: 0,id,dialect,text
0,1175358310087892992,IQ,لكن بالنهاية ينتفض يغير
1,1175416117793349632,IQ,يعني هذا محسوب على البشر حيونه ووحشيه وتطلبون...
2,1175450108898565888,IQ,مبين من كلامه خليجي
3,1175471073770573824,IQ,يسلملي مرورك وروحك الحلوه
4,1175496913145217024,IQ,وين هل الغيبه اخ محمد


#### Drop Null Values

In [8]:
data=data.dropna()

In [9]:
data.isna().sum()

id         0
dialect    0
text       0
dtype: int64

In [10]:
data.shape

(458196, 3)

#### Spliting the Data to Train and Test
* Train = 80% of Data
* Test = 20% of Data

In [11]:
from sklearn.model_selection import train_test_split,GridSearchCV
feature= data['text']
target=data['dialect']
x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, stratify=target, random_state=42)

#### Import Arabic Stopwords from NLTK

In [12]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
arabic= stopwords.words('arabic')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


### Build Pipeline model
use TFIDF vectorizer with character analyzer and remove arabic stop words

then use logistic regression model to classify text

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier



vectorizer = TfidfVectorizer(analyzer='char',stop_words=arabic , ngram_range=(3, 5), min_df=.000008, max_df=.998)
# vectorizer= CountVectorizer(analyzer='char',stop_words=arabic , ngram_range=(4, 6))
model = LogisticRegression(max_iter=7000)
pipe = make_pipeline(vectorizer, model)
pipe.fit(x_train, y_train)


Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char', max_df=0.998, min_df=8e-06,
                                 ngram_range=(3, 5),
                                 stop_words=['إذ', 'إذا', 'إذما', 'إذن', 'أف',
                                             'أقل', 'أكثر', 'ألا', 'إلا',
                                             'التي', 'الذي', 'الذين', 'اللاتي',
                                             'اللائي', 'اللتان', 'اللتيا',
                                             'اللتين', 'اللذان', 'اللذين',
                                             'اللواتي', 'إلى', 'إليك', 'إليكم',
                                             'إليكما', 'إليكن', 'أم', 'أما',
                                             'أما', 'إما', 'أن', ...])),
                ('logisticregression', LogisticRegression(max_iter=7000))])

#### Model Prediction
Predict test data and print the accuracy

In [14]:
from sklearn.metrics import confusion_matrix,accuracy_score
prediction = pipe.predict(x_test)
print(f"Accuracy score is {accuracy_score(y_test, prediction):.2f}")

Accuracy score is 0.56


### Custom Test data
use another test data to predict dialect  

In [19]:
input=['سلام يا صاحبي']
pre=pipe.predict(input)
print(pre)

['EG']


### Save Model to Jason file 

In [16]:
# model_json5 = pipe.to_json()
# with open("./model5.json", "w") as json_file:
#     json_file.write(model_json5)

In [20]:
import pickle
pipeline_pkl=pickle.dump(pipe,open('pipeline.pkl', 'wb'))

In [21]:
import pickle
vectorizer_pkl=pickle.dump(vectorizer,open('vectorizer.pkl', 'wb'))

In [22]:
import pickle
model_pkl=pickle.dump(model,open('model.pkl', 'wb'))