# Text Classification

This is our `hello world` example 

In [1]:
# a helper functions and imports
from IPython.display import display
def highlight_col(x, df):
    #set by condition
    mask =  df['label'] == 'pos'
    mask2 = df['label'] == 'neg'
    x = pd.DataFrame('', index=df.index, columns=df.columns)
    x.loc[mask] = 'background-color: #e6ffe6'
    x.loc[mask2] = 'background-color: #ffe6e6'
    return x    

## Load the data

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

train_pos = pd.read_csv("train_Arabic_tweets_positive_20190413.tsv",
                        sep='\t', header=None)
train_neg = pd.read_csv("train_Arabic_tweets_negative_20190413.tsv",
                        sep='\t', header=None)
train_data = pd.concat([train_pos, train_neg], ignore_index=True)
train_data.columns = ['label', 'tweet']
del train_pos, train_neg

pd.set_option('display.max_colwidth', 100000)
df_tmp = train_data.sample(5)
df_tmp.style.apply(lambda x: highlight_col(x, df_tmp), axis=None)

Unnamed: 0,label,tweet
27333,neg,انا لمن اعرف اني حارجع السودان والبلد ما حيكون فيها مظاهرات 💔
26181,neg,"صوت واحد ل مرشح واحد .. إنو الكفار ما ""بيستاهلو"" نايب ؟ 🤔"
2571,pos,تبسم لي عشان ارضيك 💙
9432,pos,بسم الله الرحمن الرحيم ﷽ (إن الله وملائكته يصلون على النبي يا أيها الذين آمنوا صلوا عليه وسلموا تسليما) اللهم صل…
26572,neg,بكمل نومي 😴


In [3]:
test_pos = pd.read_csv("test_Arabic_tweets_positive_20190413.tsv",
                        sep='\t', header=None)
test_neg = pd.read_csv("test_Arabic_tweets_negative_20190413.tsv",
                        sep='\t', header=None)
test_data = pd.concat([test_pos, test_neg], ignore_index=True)
test_data.columns = ['label', 'tweet']
del test_pos, test_neg

df_tmp = test_data.sample(5)
df_tmp.style.apply(lambda x: highlight_col(x, df_tmp), axis=None)

Unnamed: 0,label,tweet
10960,neg,⛔ مافي فض اعتصام قبل توجيه تجمع المهنيين يا سودان ⛔ مافي فض اعتصام قبل توجيه تجمع المهنيين يا سودان ⛔ مافي فض اعتصا…
10094,neg,وضعي يوم صحيت الساعه 😔
8981,neg,لا عيب عليك الكوشه شفناها الموسم الماضي اكلت من الهلال فمباراة التتويج ،، ليكون نسيت 🤔
6495,neg,تخيل حفلة بمنطقة مكة المكرمة استلمت #MariahCarey 🕋🇸🇦 #ماريا_كاري مليون دولار مقابل غناء ورقص لمدة ساعتين.. علش…
7636,neg,#نعجان 🌺كلين 💪🏼 للتنحيف 🎀 والرشاقة💪🏼 متاابعة دورية خلال مدة الكوورس 🌻 مع خبراء تغذية 🌸 ✨رش…


## Baseline model (using pipeline)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline

vec = CountVectorizer()
clf = LogisticRegression()
pipe = make_pipeline(vec, clf)
pipe.fit(train_data.tweet, train_data.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Let's test this basic model

In [10]:
from sklearn import metrics

def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)
    print("accuracy: {:0.3f}".format(metrics.accuracy_score(y_test, y_pred)))

print_report(pipe, test_data.tweet, test_data.label)

              precision    recall  f1-score   support

         neg       0.77      0.82      0.80      5768
         pos       0.81      0.76      0.78      5752

    accuracy                           0.79     11520
   macro avg       0.79      0.79      0.79     11520
weighted avg       0.79      0.79      0.79     11520

accuracy: 0.792


## let's take a look inside the model

In [13]:
import eli5
eli5.show_weights(clf, vec=vec, top=20)

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


Weight?,Feature
+2.779,الإخونج
+2.414,وصباحك
+2.216,هالسنه
+2.093,ابريل
+2.092,السحب
+2.086,الزرقاء
+2.075,برونو
+2.029,اللوك
+1.887,الطيب
+1.880,حكمة


## Try our model on some tweets

In [14]:
for _, row in test_data.sample(5).iterrows():
    print(f"true label: {row['label']}")
    display(eli5.show_prediction(clf, row['tweet'], vec=vec,))
    print("--"*50)

true label: neg


Contribution?,Feature
1.323,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.441,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.334,<BIAS>
-0.236,Highlighted in text (sum)


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.548,Highlighted in text (sum)
0.334,<BIAS>


----------------------------------------------------------------------------------------------------
true label: neg


Contribution?,Feature
0.334,<BIAS>
0.32,Highlighted in text (sum)


----------------------------------------------------------------------------------------------------


## Try Tfidf with some processing

In [38]:
vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(2, 7), min_df=.01, max_df=.2)
clf = LinearSVC()
pipe_tfidf = make_pipeline(vec, clf)
pipe_tfidf.fit(train_data.tweet, train_data.label)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(analyzer='char_wb', max_df=0.2, min_df=0.01,
                                 ngram_range=(2, 7))),
                ('linearsvc', LinearSVC())])

In [39]:
print_report(pipe_tfidf, test_data.tweet, test_data.label)

              precision    recall  f1-score   support

         neg       0.86      0.84      0.85      5768
         pos       0.84      0.87      0.86      5752

    accuracy                           0.85     11520
   macro avg       0.85      0.85      0.85     11520
weighted avg       0.85      0.85      0.85     11520

accuracy: 0.855


In [42]:
eli5.show_weights(clf, vec=vec, top=20)

Weight?,Feature
+7.542,😂
+5.526,💙
+5.460,💛
+4.420,🤣
+4.146,)
+4.064,🌹
+3.798,❤
+3.760,😍
+3.513,🙏
… 1113 more positive …,… 1113 more positive …


In [43]:
for _, row in test_data.sample(5).iterrows():
    print(f"true label: {row['label']}")
    display(eli5.show_prediction(clf, row['tweet'], vec=vec,))
    print("--"*50)

true label: pos


Contribution?,Feature
0.527,Highlighted in text (sum)
0.028,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
1.692,Highlighted in text (sum)
0.028,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.656,Highlighted in text (sum)
0.028,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.113,Highlighted in text (sum)
-0.028,<BIAS>


----------------------------------------------------------------------------------------------------
true label: pos


Contribution?,Feature
0.239,Highlighted in text (sum)
-0.028,<BIAS>


----------------------------------------------------------------------------------------------------


# Feel free to play with notebook explore different models with different datasets