# Arabic Sentiment Analysis in tweets using Naive Bayes Machine learning Algorithm and unigram features

In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections


# Input data files are available in the "input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
for filename in os.listdir("input"):
    print(filename)

# Any results you write to the current directory are saved as output.

test_Arabic_tweets_negative_20190413.tsv
test_Arabic_tweets_positive_20190413.tsv
train_Arabic_tweets_negative_20190413.tsv
train_Arabic_tweets_positive_20190413.tsv


# define functions 

In [6]:
import re
from itertools import islice

def load_tsv(data_file, n):
    data_features = list()
    data = list()
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams



def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

# Load corpus

In [7]:
pos_train_file = 'input/train_Arabic_tweets_positive_20190413.tsv'
neg_train_file = 'input/train_Arabic_tweets_negative_20190413.tsv'

pos_test_file = 'input/test_Arabic_tweets_positive_20190413.tsv'
neg_test_file = 'input/test_Arabic_tweets_negative_20190413.tsv'
print('data files')
print('train file (pos)', pos_train_file)
print('train file (neg)', neg_train_file)
print('test file (pos)', pos_test_file)
print('test file (neg)', neg_test_file)

data files
train file (pos) input/train_Arabic_tweets_positive_20190413.tsv
train file (neg) input/train_Arabic_tweets_negative_20190413.tsv
test file (pos) input/test_Arabic_tweets_positive_20190413.tsv
test file (neg) input/test_Arabic_tweets_negative_20190413.tsv


# Parameters (ngrams)

In [8]:
print('parameters')
n = 1
print('n grams:', n)

parameters
n grams: 1


# loading train data .... 

In [9]:
print('loading train data ....')
pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
print('loading test data ....')
pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

loading train data ....
loading test data ....


# Training data information

In [10]:
print('train data info')
train_data = pos_train_data + neg_train_data
print('train data size', len(train_data))
print('# of positive', len(pos_train_data))
print('# of negative', len(neg_train_data))

train data info
train data size 47000
# of positive 23879
# of negative 23121


# Sample training data 

In [11]:
import random
sample_size = 100
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

100 random tweets .... 
(['طبيعي', 'كل', 'أجوبة', 'امتحان', 'العربي', 'تتشابه', 'اييشش', 'فييه', 'يا', 'واد', '😑'], 'neg')
(['بس', 'هو', 'طعمه', 'احلي', 'من', 'ابو', 'جنيه', '😂😂💖💖'], 'pos')
(['غيمه', 'هادئة', 'تمطر', 'كلما', 'اشتد', 'حزنك', 'لتغسل', 'عنك', 'ذلك', 'الحزن', '💓', '#صباح_الخير'], 'pos')
(['صباح', 'الورد', '🌷'], 'pos')
(['ذا', 'الكفار', 'ما', 'بقى', 'شيء', 'الا', 'سووه', '😐'], 'neg')
(['أجمل', 'تعب', 'والله', '😂'], 'pos')
(['لو', 'مخلينها', 'بالبيت', 'احسن', 'لها', 'يومهم', 'بيخلونها', 'برا!', 'فيه', 'ناس', 'جحلط', 'وربي', 'وعايشين', 'عيشة', 'الطبقيه', 'بشكل', 'غلط!!'], 'neg')
(['حكمة', 'بعد', 'العصر', ':', 'مهما', 'كانت', 'الادلة', 'قوية', 'اجحد', 'يا', 'وحش', '😎'], 'pos')
(['عش', 'محاطا', 'بكل', 'الأشياء', 'المنسوبه', 'للرقه', '🌸'], 'pos')
(['ربما', 'الحياة', 'لا', 'تعطينا', 'كل', 'ما', 'نريد', 'لكن', 'القناعة', 'تعطينا', 'كل', 'الحياة', '🌸'], 'pos')
(['⠀', '⠀', '⠀', '⠀', '⠀', 'إذا', 'عثرت', 'على', 'شخص', 'يتقبل', 'افكارك', 'الغريبة', '،', 'سوف', 'يكون', 'بمثابة', 'كنز', 

# Test data info

In [12]:
print('test data info')
test_data = pos_test_data + neg_test_data
print('test data size', len(train_data))
print('# of positive', len(pos_test_data))
print('# of negative', len(neg_test_data))

test data info
test data size 47000
# of positive 5970
# of negative 5781


# merging all features ...

In [13]:
print('merging all features ... ')
all_features = pos_train_feat + neg_train_feat + \
               pos_test_feat + pos_test_feat
print('len(all_features):', len(all_features))

merging all features ... 
len(all_features): 770508


# Sample features 

In [14]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

100 sample features ...
['لن', 'وانت', 'هاني', 'قيمة', 'لنا', '🌹', 'ماله', 'الناس', 'و', 'كل', 'الدوري', 'خلك', 'العالم', 'الزله', 'جدها', 'اخذتني', '🔴ارسلي', 'مثل', 'ثالث', '-', 'ليه', 'رقيتان', ':-', 'مالي', 'كل', 'الوحيد', 'إنما', 'الحادية', 'خليل', 'المتبعثر', 'لها', 'اسرع', 'كان', 'بمفردي', '/0', 'منها', '♪', 'التاني', 'بعض', 'الكتابة', '!!', 'الاول', 'ي', '.', 'ليبيا:', 'الرجفه', 'ساعة', 'لغايته', '💙', 'عليها', 'والقريب', '🔨', 'توجيه', 'تحاول', 'ومن', 'بالصحة', 'دول', 'لايموتون', '❤', '┅━❀', 'المرتاح', 'بطريقة', 'السلامة!', 'حليب', 'نبض…', 'يأتي', 'انا', 'الل…', 'ينتصر', 'فريقي', '🌳', 'شكرا', '#ساعه_استجابه', 'قلبك', 'وموعدنا', 'إن', 'لأمر', 'محمد', '💔', 'روعه', 'الأساسية', 'طاقتي', 'وأبتعد', 'شهدائن…', 'أصبح', 'يتغيرون', 'انا', '💚', 'ثانية،', '⠀┈┉━◈♔♚♔◈━┅┄', '×', 'مستمر', 'من', 'كنتم', 'صباح', '😏', 'كلمة', 'الخير', 'يازعماء', 'هذول']


# compute frequencies

In [15]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

# Sample Frequency

In [16]:
print('sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

sample frequencies
[('هذولي', 5), ('تغننا', 1), ('🌴وبارك', 1), ('️⃣يعطى..؟!!', 1), ('هنالك', 9), ('الضروري', 3), ('أنبضها', 2), ('اتكرمت', 1), ('مرتين،', 1), ('❀', 154), ('للوضع', 1), ('غابه', 2), ('ﻳﺮ', 1), ('🙃🙄', 1), ('أحياء..!', 1), ('ولاخلصوو', 2), ('الدورعلى', 1), ('#يابو_دالين❤️', 1), ('سبقه', 1), ('أههخ', 1), ('والقديم', 1), ('بتسقط', 1), ('إنعواجه', 2), ('وزودوها', 1), ('أوك', 1), ('ماتصحى', 2), ('👱🏻\u200d♀️:', 3), ('شهرا', 2), ('😁🙊', 1), ('ونسى', 1)]
freq of word في is 9550
freq of word فى is 220
freq of word من is 12655


# Compute Threshold

In [17]:
print('size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

size of training data: 47000
min document frequency: 47
max document frequency: 46060


# Selecting Features 

In [18]:
# remove features that have frequency below/above the threshold
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

1961 are kept out of 770508


# Sample of selected features 

In [19]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

100 sample of selected features:
['اليوتيوب', 'نورا', '[', 'ليله', 'الجاهل', '😷', 'نسألك', '🙄', 'تكون', 'بصوت', 'الدولية', 'وهي', 'بستحق', 'X', 'القمر', 'بالهبوط.', 'أخذ', 'ادري', 'شوي', 'وأنتم', 'أنا', '🌟', 'حرفيا', 'وشولوه', 'اغنية', 'وحروف', 'انتظرك', 'الدهر', 'للحين', 'نعم', 'الرسول', '👌', 'Luv', 'لكنه', 'حينما', 'بسم', 'الحي', 'الصدارة', 'وصلت', 'مبروك', 'باليوم', 'وياك', 'واليوم', '😀', 'وبعدين', 'الفيفا', 'شهور', 'ينام', 'بحاجة', '⠀⠀', 'البشير', 'فين', 'إنت', 'سنوات', 'حياتك', 'توقعك', 'بينك', 'اشهر', 'أين', 'عشان', 'بدي', 'منها', 'هكذا', 'وجه', 'العالم', 'الاول', 'يحدث', 'الفترة', 'وربي', 'ملگية♛', 'سؤال', 'مكان', 'صبح', 'المسيار', 'البعض', 'وتلفظ', 'أمي', 'عشق', 'نتيجة', 'التغريده', 'أشياء', 'انت', 'اما', 'الدوري', '⇣', '😇', 'مثلا', 'شي', 'الحديث', 'نفسه', 'ذمتك', 'أحسن', 'ابد', 'خبر', 'اكثر', 'عرفت', 'الحزن', 'دقيقة', 'نبينا', 'رد']


# generating features for training documents ...

In [20]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

# training ...

In [21]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done')

training is done


# Most informative features 

In [22]:
classifier.show_most_informative_features(40)

Most Informative Features
               has(موثق) = True              pos : neg    =    238.5 : 1.0
                  has(😭) = True              neg : pos    =    202.0 : 1.0
                  has(😢) = True              neg : pos    =    171.3 : 1.0
            has(المسيار) = True              pos : neg    =    170.1 : 1.0
              has(وصلوا) = True              pos : neg    =    166.9 : 1.0
                  has(😳) = True              neg : pos    =    164.2 : 1.0
             has(الشروط) = True              pos : neg    =    151.4 : 1.0
              has(وتابع) = True              pos : neg    =    143.9 : 1.0
               has(ببكي) = True              neg : pos    =    143.6 : 1.0
                  has(🥀) = True              neg : pos    =    132.4 : 1.0
              has(السحب) = True              pos : neg    =    118.4 : 1.0
                  has(💐) = True              pos : neg    =    116.5 : 1.0
             has(العروس) = True              neg : pos    =    113.3 : 1.0

# generating features for test documents ...

In [23]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

# classify test instances 

In [24]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Results 

In [25]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))

accuracy:  0.8913283975831844
pos precision:  0.9198425478618716
pos recall: 0.8611390284757119
neg precision:  0.8654657578708211
neg recall: 0.9225047569624633
positive f-score: 0.8895233151656718
negative f-score: 0.8930754416813196
