### Importing relevant packages

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from nltk.corpus import stopwords
import eli5
import warnings
warnings.filterwarnings('ignore')
import joblib
import os

### Loading our data

In [2]:
path=os.getcwd()
parent=os.path.dirname(path)
data_path=os.path.join(parent,'Data fetching from api','data.csv')

In [3]:
df=pd.read_csv(data_path)
df.head()

Unnamed: 0,sentences,dialect
0,لكن بالنهاية .. ينتفض .. يغير .,IQ
1,يعني هذا محسوب على البشر .. حيونه ووحشيه .. وت...,IQ
2,مبين من كلامه خليجي,IQ
3,يسلملي مرورك وروحك الحلوه💐,IQ
4,وين هل الغيبه اخ محمد 🌸🌺,IQ


In [25]:
df.dialect.value_counts()

EG    57636
PL    43742
KW    42109
LY    36499
QA    31069
JO    27921
LB    27617
SA    26832
AE    26296
BH    26292
OM    19116
SY    16242
DZ    16183
IQ    15497
SD    14434
MA    11539
YE     9927
TN     9246
Name: dialect, dtype: int64

### Shuffling data

In [3]:
df_shuffled=df.sample(frac=1,random_state=42)

### Train/Test split

In [4]:
df_train= df_shuffled.iloc[:int(0.8*len(df_shuffled))]
df_test= df_shuffled.iloc[int(0.8*len(df_shuffled)):]


### Defining report function

In [5]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

### Creating stop words list

In [6]:
stop_words_list= stopwords.words('arabic')

### Random forest

In [7]:
vec = CountVectorizer(stop_words=stop_words_list)
clf=RandomForestClassifier(n_estimators=100,max_depth=2)
pipe = make_pipeline(vec, clf)


In [8]:
pipe.fit(df_train.sentences, df_train.dialect)

Pipeline(steps=[('countvectorizer',
                 CountVectorizer(stop_words=['إذ', 'إذا', 'إذما', 'إذن', 'أف',
                                             'أقل', 'أكثر', 'ألا', 'إلا',
                                             'التي', 'الذي', 'الذين', 'اللاتي',
                                             'اللائي', 'اللتان', 'اللتيا',
                                             'اللتين', 'اللذان', 'اللذين',
                                             'اللواتي', 'إلى', 'إليك', 'إليكم',
                                             'إليكما', 'إليكن', 'أم', 'أما',
                                             'أما', 'إما', 'أن', ...])),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=2))])

In [9]:
print_report(pipe, df_test.sentences, df_test.dialect)

              precision    recall  f1-score   support

          AE       0.00      0.00      0.00      5264
          BH       0.00      0.00      0.00      5113
          DZ       0.00      0.00      0.00      3142
          EG       0.13      1.00      0.23     11621
          IQ       0.00      0.00      0.00      3183
          JO       0.00      0.00      0.00      5488
          KW       0.00      0.00      0.00      8423
          LB       0.00      0.00      0.00      5517
          LY       0.00      0.00      0.00      7325
          MA       0.00      0.00      0.00      2281
          OM       0.00      0.00      0.00      3809
          PL       0.00      0.00      0.00      8814
          QA       0.00      0.00      0.00      6216
          SA       0.00      0.00      0.00      5430
          SD       0.00      0.00      0.00      2910
          SY       0.00      0.00      0.00      3217
          TN       0.00      0.00      0.00      1936
          YE       0.00    

### Logistic regression

In [10]:
vec = CountVectorizer()
clf = LogisticRegression()
pipe = make_pipeline(vec, clf)


In [11]:
pipe.fit(df_train.sentences, df_train.dialect)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression', LogisticRegression())])

In [12]:
print_report(pipe, df_test.sentences, df_test.dialect)

              precision    recall  f1-score   support

          AE       0.41      0.42      0.41      5264
          BH       0.40      0.31      0.35      5113
          DZ       0.59      0.53      0.56      3142
          EG       0.69      0.83      0.76     11621
          IQ       0.63      0.52      0.57      3183
          JO       0.42      0.36      0.39      5488
          KW       0.47      0.56      0.51      8423
          LB       0.60      0.67      0.63      5517
          LY       0.63      0.68      0.66      7325
          MA       0.74      0.56      0.64      2281
          OM       0.41      0.34      0.37      3809
          PL       0.50      0.52      0.51      8814
          QA       0.44      0.49      0.47      6216
          SA       0.40      0.43      0.42      5430
          SD       0.70      0.55      0.62      2910
          SY       0.45      0.34      0.39      3217
          TN       0.67      0.43      0.52      1936
          YE       0.41    

### logistic regression is better than random forest

### Removing stop words doesn't increase accuracy

### Let's better understand our model

In [13]:
eli5.show_weights(clf, vec=vec, top=20)

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17
+6.215,الحينه,,,,,,,,,,,,,,,,
+5.075,مب,,,,,,,,,,,,,,,,
+5.067,شرات,,,,,,,,,,,,,,,,
+4.852,يالس,,,,,,,,,,,,,,,,
+4.568,نادين_نسيب_نجيم,,,,,,,,,,,,,,,,
+4.355,بوظبي,,,,,,,,,,,,,,,,
+4.199,عيناوي,,,,,,,,,,,,,,,,
+4.128,نبا,,,,,,,,,,,,,,,,
+4.017,عسب,,,,,,,,,,,,,,,,
+3.924,الشارجة,,,,,,,,,,,,,,,,

Weight?,Feature
+6.215,الحينه
+5.075,مب
+5.067,شرات
+4.852,يالس
+4.568,نادين_نسيب_نجيم
+4.355,بوظبي
+4.199,عيناوي
+4.128,نبا
+4.017,عسب
+3.924,الشارجة

Weight?,Feature
+8.412,عبدالعزيز_الرويحي
+5.878,سمر_البحرينية
+5.383,سمر_البحرينيه
+5.338,غرد_بفتوى
+5.148,المحرق
+4.537,احين
+4.482,صج
+4.275,ليي
+3.957,تركي_اهبط_الاتحاد
+3.952,بخيير

Weight?,Feature
+8.127,واش
+6.591,بزاف
+6.209,سنابات_غازي_المطيري
+6.043,سهيلة
+5.987,راه
+5.785,شكون
+5.356,باش
+5.338,برك
+5.237,زد_رصيدك
+5.019,علاش

Weight?,Feature
+4.924,عليكى
+4.769,اوى
+4.476,إللى
+4.060,دلوقتى
+3.953,اسكندريه
+3.945,الغالى
+3.919,ده
+3.805,وانتى
… 81085 more positive …,… 81085 more positive …
… 368198 more negative …,… 368198 more negative …

Weight?,Feature
+7.271,هيج
+6.448,لعد
+5.489,هواي
+4.863,صدك
+4.794,يكول
+4.781,ماكو
+4.738,احنه
+4.723,هذوله
+4.717,شكد
+4.640,بالعراق

Weight?,Feature
+8.241,اشي
+7.926,تشويش_واضح
+5.939,مبادره_بلال_الماضي
+5.824,هسا
+5.042,الفيصلي
+4.919,يزم
+4.629,بلال_الماضي
+4.594,بالاردن
+4.511,انداري
+4.392,زد_رصيدك

Weight?,Feature
+6.851,عبدالله_العصيدان
+5.165,كرباج
+5.149,البدون
+5.050,القادسيه
+5.020,فنغر
+4.863,صج
+4.730,تكفه
+4.586,ناطر_بيت
+4.573,جذي
+4.360,ألله

Weight?,Feature
+5.800,هلق
+5.317,هوا_الحرية
+5.208,هيدا
+5.131,لبنان_ينتفض
+5.012,هول
+4.941,اجا_وقت_نحاسب
+4.815,ألله
+4.601,قدح_وجم
+4.536,يللي
+4.272,هوي

Weight?,Feature
+13.636,قصقص
+12.689,هكي
+11.461,شن
+9.431,هلبا
+9.330,بنغازي
+8.492,هضا
+8.045,توا
+6.662,ليبي
+6.229,السراج
+5.837,الليبيين

Weight?,Feature
+10.032,ديال
+7.815,واش
+7.401,راه
+6.892,هادشي
+6.778,دابا
+6.313,بزاف
+6.224,حيت
+6.139,اهاب
+5.917,باش
+5.752,ديالي

Weight?,Feature
+5.738,صحار
+5.051,مسقط
+4.785,نزين
+4.275,صلاله
+4.099,موه
+4.092,عراق
+3.934,عمانيون_بلا_وظايف
+3.903,مب
+3.898,عماني
+3.786,كذاك

Weight?,Feature
+8.034,اشي
+6.489,هلقيت
+5.752,شيكل
+5.731,محمد_عساف
+5.147,هادا
+4.833,بغزة
+4.719,فش
+4.532,يخو
+4.409,ış
+4.286,إشي

Weight?,Feature
+6.520,الغرافه
+5.825,تميم_المجد
+5.377,ريكي
+4.724,المهلكه
+4.537,مب
+4.444,دليم
+4.403,صج
+4.130,مهب
+3.961,خنور
+3.824,الريان

Weight?,Feature
+7.384,همثون
+7.079,المهدي_تركي
+4.992,عوض_العلياني
+4.398,أعجبني
+3.623,بدنك
+3.520,ياعمده
+3.466,لاهنت
+3.267,لاعدمتك
+3.244,ساير
+3.210,الطواقي

Weight?,Feature
+7.388,ليهو
+7.381,ياخ
+6.681,ديل
+6.164,فيهو
+6.048,زول
+5.976,ساي
+5.442,موكب
+5.328,زاتو
+5.210,الزول
+5.114,غايتو

Weight?,Feature
+5.224,هلق
+4.141,العفاريت
+4.133,أنو
+4.123,ياااحرية
+4.085,هاد
+4.073,هنن
+4.067,٠٠
+3.967,هدول
+3.836,مشان
+3.688,هيك

Weight?,Feature
+8.039,نسيم
+7.291,جوليا
+6.099,باش
+5.901,ماغي
+5.860,توا
+5.830,يعيشك
+5.597,شكون
+5.526,باهي
+5.238,تونسية
+5.170,توة

Weight?,Feature
+6.715,تعز
+5.113,عفاش
+4.826,عدن
+4.703,باليمن
+4.306,اقلام_تك
+4.010,اليمني
+3.975,صنعاء
+3.952,اليمن
+3.740,يمني
+3.159,شى


In [14]:
pipe.predict(['مشان الله لا تقوصنى معلم'])

array(['SY'], dtype=object)

### Saving our model

In [15]:
joblib.dump(pipe, 'model.pkl')

['model.pkl']