In [None]:
# %pip install snowballstemmer

In [None]:
import string
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from snowballstemmer import stemmer
ar_stemmer = stemmer("arabic")
import tashaphyne.arabic_const as arabconst


**Read the tweets and show a sample of them**

In [None]:
# Load the raw tweets; only the 'tweet' column is used by the rest of the notebook.
tweets_data = pd.read_csv('tweets.csv',encoding = "utf-8")
# Take an explicit copy: later cells add columns to `tweets` and drop rows from it,
# which on a bare column selection triggers pandas' SettingWithCopyWarning.
tweets = tweets_data[['tweet']].copy()
tweets.head()

**Reading positive data and negative data**

In [None]:
# Load the positive and negative word lists and keep only the two columns
# used below: the word itself and its polarity value.
positive_data = pd.read_csv('positive.csv' ,encoding = "utf-8")
positive = positive_data[['word', 'polarity']]
negative_data = pd.read_csv('negative.csv' ,encoding = "utf-8")
negative = negative_data[['word', 'polarity']]
# Preview the first rows of the positive list (the cell's last expression is displayed).
print("positive is ")
positive.head()

**Preprocessing the text**

In [None]:
# Delete unnecessary characters (punctuation, symbols, ...) from a string.
def remove_chars(text, del_chars):
    """Return `text` with every character that appears in `del_chars` removed.

    Args:
        text: The string to filter.
        del_chars: A string whose individual characters are all deleted.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(ch): None for ch in del_chars}
    return text.translate(deletion_table)

# Collapse characters that are repeated three or more times in a row.
def remove_repeating_char(text):
    """Replace any run of 3+ identical characters with a single occurrence.

    Runs of exactly two characters are left untouched.
    """
    run_of_three_or_more = re.compile(r'(.)\1{2,}')
    return run_of_three_or_more.sub(lambda match: match.group(1), text)



# --- Start of newly added helper functions ---

def delete_urls(text):
    """Remove URL-like tokens from `text`.

    Three alternatives are tried in order at each position: an http(s) URL,
    a token starting with ``www.``, and any whitespace-free token that
    contains a dot between word boundaries.
    """
    url_alternatives = (
        r'https?://\S+',    # explicit scheme
        r'www\.\S+',        # bare www. prefix
        r'\b\S+\.\S+\b',    # any dotted token
    )
    return re.sub('|'.join(url_alternatives), '', text, flags=re.MULTILINE)

def delete_character_duplicates(text):
    """Collapse runs of 3+ identical characters from the Arabic block (U+0600-U+06FF).

    Characters outside that block are never collapsed.
    """
    arabic_run = re.compile(r'([\u0600-\u06FF])\1{2,}', flags=re.MULTILINE)
    return arabic_run.sub(lambda match: match.group(1), text)

def delete_all_numbers(data):
    """Return `data` with every digit character (regex ``\\d``) removed."""
    # Splitting on each single digit and re-joining drops the digits
    # while leaving every other character in place.
    non_digit_pieces = re.split(r'\d', data, flags=re.MULTILINE)
    return ''.join(non_digit_pieces)

def delete_dates_and_times(text):
    """Remove timestamps written exactly as ``YYYY-MM-DD HH:MM:SS`` from `text`."""
    timestamp = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', flags=re.MULTILINE)
    return timestamp.sub('', text)

def replace_hamza(text):
    """Rewrite the hamza carriers ؤ and ئ as a bare hamza ء."""
    hamza_table = str.maketrans("ؤئ", "ءء")
    return text.translate(hamza_table)

def replace_alif(text):
    """Rewrite the alif variants أ, إ and آ as a plain alif ا."""
    alif_table = str.maketrans("أإآ", "ااا")
    return text.translate(alif_table)

def delete_tatweel(text):
    """Remove every occurrence of the character(s) in `arabconst.TATWEEL` from `text`."""
    # Wrap the constant in a character class so each of its characters is matched individually.
    tatweel_class = u'[%s]' % arabconst.TATWEEL
    tatweel_pattern = re.compile(tatweel_class, flags=re.MULTILINE)
    return tatweel_pattern.sub('', text)

def delete_tashkeel(text):
    """Remove everything matched by `arabconst.HARAKAT_PAT` from `text`."""
    harakat_pattern = arabconst.HARAKAT_PAT
    without_harakat = harakat_pattern.sub('', text)
    return without_harakat

def delete_duplicated_spaces(text):
    """Collapse every run of two or more space characters into a single space.

    Only the plain space character is affected; tabs and newlines are left alone.
    """
    return re.sub(r" {2,}", " ", text, flags=re.MULTILINE)

def delete_emojies(data):
    """Remove emoji and pictographic symbols from `data`.

    The previous character class contained the single range U+24C2-U+1F251.
    That range also spans the Arabic Presentation Forms blocks
    (U+FB50-U+FDFF and U+FE70-U+FEFF), so Arabic ligatures such as
    U+FEFB or U+FDFA were deleted as if they were emoji. The range is now
    split around those two blocks; everything else it covered is still removed.

    Args:
        data: The string to clean.

    Returns:
        `data` without the matched characters.
    """
    emojies_regex = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0000FB4F"  # original wide range, part 1 (stops before Presentation Forms-A)
        u"\U0000FE00-\U0000FE6F"  # original wide range, part 2 (between Forms-A and Forms-B)
        u"\U0000FEFF"             # zero-width no-break space / BOM, still removed as before
        u"\U0000FF00-\U0001F251"  # original wide range, part 3 (after Presentation Forms-B)
        u"\U0001f926-\U0001f937"
        u"\u200d"                 # zero-width joiner used inside emoji sequences
        u"\u2640-\u2642"          # gender signs
        "]+", flags=re.UNICODE)
    return emojies_regex.sub('', data)
# --- End of newly added helper functions ---



#function to clean text
def cleaningText(text):
    """Normalise a raw tweet or lexicon entry into words separated by single spaces.

    Steps, in order: drop URLs and ``YYYY-MM-DD HH:MM:SS`` timestamps, drop
    digits, drop Arabic/English punctuation, turn newlines into spaces, strip
    diacritics, unify alif variants, strip tatweel, collapse characters
    repeated three or more times, drop emojis, collapse runs of spaces and
    finally trim surrounding spaces.

    Args:
        text: The raw string. A non-string value (such as None or NaN) is
            treated as empty instead of raising inside the regex calls.

    Returns:
        The cleaned string ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ''

    # URLs and timestamps must go FIRST: once ':', '/', '.', '-' and the digits
    # are stripped below, neither pattern can match any more and residue such
    # as "httpstcoabc" would stay in the text. Only scheme/www URLs are removed
    # here; delete_urls() also deletes every dotted token, which before
    # punctuation removal would swallow words joined by a full stop.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = delete_dates_and_times(text)

    # delete arabic and english numbers
    text = re.sub(r'[0-9]+ ', '', text)
    text = re.sub(r'[0-9\u0660-\u0669\u06F0-\u06F9]+', '', text)

    # define arabic punctuations
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    # define English punctuations
    english_punctuations = string.punctuation
    # merge English and Arabic punctuations and delete them all
    punctuations_list = arabic_punctuations + english_punctuations
    text = remove_chars(text, punctuations_list)

    # replace \n with spaces
    text = text.replace('\n', ' ')

    # letter-level normalisation
    text = delete_tashkeel(text)
    text = replace_alif(text)
    text = delete_tatweel(text)
    # delete repeated chars
    text = remove_repeating_char(text)

    # catch any digit characters not covered by the explicit ranges above
    text = delete_all_numbers(text)

    # Emojis go before whitespace normalisation so the gaps they leave behind
    # are collapsed and trimmed as well.
    text = delete_emojies(text)
    text = delete_duplicated_spaces(text)
    return text.strip(' ')

# Tokenizer: split a text into a list of words.
def tokenizingText(text):
    """Return the tokens produced by NLTK's `word_tokenize` for `text`."""
    return word_tokenize(text)


#function to delete Arabic stopwords
_ARABIC_STOPWORDS = None  # built on first use, then reused for every call


def _arabic_stopwords():
    """Return the Arabic stop words in both raw and cleaningText-normalised spelling."""
    global _ARABIC_STOPWORDS
    if _ARABIC_STOPWORDS is None:
        raw_words = stopwords.words('arabic')
        # Tokens reaching filteringText have been through cleaningText, which
        # rewrites أ/إ/آ to ا and strips diacritics. A stop word kept only in
        # its raw spelling can then never equal a cleaned token, so the same
        # normalisation is applied to the list. Raw spellings are kept too, so
        # everything that was filtered before is still filtered.
        normalised_words = (cleaningText(word) for word in raw_words)
        _ARABIC_STOPWORDS = frozenset(raw_words).union(
            word for word in normalised_words if word
        )
    return _ARABIC_STOPWORDS


def filteringText(tokens_list):
    """Remove Arabic stop words from a list of tokens.

    Args:
        tokens_list: Iterable of word strings.

    Returns:
        A new list with the stop words removed, original order preserved.
    """
    # The stop-word set is cached: this function runs once per row via .apply,
    # and rebuilding the set each time was pure overhead.
    listStopwords = _arabic_stopwords()
    return [token for token in tokens_list if token not in listStopwords]

# Stem every token of a list.
def stemmingText(tokens_list):
    """Return a list holding `ar_stemmer.stemWord(token)` for each token, in order."""
    stemmed_tokens = []
    for token in tokens_list:
        stemmed_tokens.append(ar_stemmer.stemWord(token))
    return stemmed_tokens

# Join a list of words back into one sentence.
def toSentence(words_list):
    """Return the words of `words_list` joined by single spaces."""
    return ' '.join(words_list)



**Examples of preprocessing texts**

In [None]:
# Example of stemming:
stem = ar_stemmer.stemWord(u"رائعون")
# The label has to name the word that was actually stemmed above
# (it previously printed a different, misspelled word).
print ("stemming of رائعون: ")
print (stem)

# Example of cleaning text: run one sentence through every pipeline stage
# and print the intermediate result after each step.
text= "!أنا أحب الذهاب ااإأالى   الحديقة، كل يووووم 9 صباحاً، مع رفاااااقي هؤلاء "
print("original text:",text)
text=cleaningText(text)
print("Text after cleaning: ",text)
tokens_list=tokenizingText(text)
print("Words list after tokenaization: ",tokens_list)
tokens_list=filteringText(tokens_list)
print("Words list after filtering: ",tokens_list)
tokens_list=stemmingText(tokens_list)
print("Words list after stemming: ",tokens_list)
sentence=toSentence(tokens_list)
print("the result text: ",sentence)

**Clean the tweets, then store both the cleaned text and the tokenized, stemmed tokens**

In [None]:
# Clean every tweet, then tokenize -> drop stop words -> stem in one chained pass.
tweets['tweet_clean'] = tweets['tweet'].apply(cleaningText)
tweets['tweet_preprocessed'] = (
    tweets['tweet_clean']
    .apply(tokenizingText)
    .apply(filteringText)
    .apply(stemmingText)
)


In [None]:
# Show the first raw tweets, followed by blank lines as a visual separator.
print("Examples of tweets before preprocessing")
print(tweets['tweet'].head(),end="\n\n\n")

# Show the token lists produced for the same tweets
# (the cell's last expression is displayed).
print("Examples of tweets after preprocessing")
tweets['tweet_preprocessed'].head()

In [None]:
# Keep only the first tweet for each distinct cleaned text.
tweets = tweets.drop_duplicates(subset='tweet_clean')
# Write the de-duplicated tweets to a CSV file (header row, no index column).
tweets.to_csv(r'tweet_clean.csv', encoding="utf-8", index=False, header=True, index_label=None)

**Clean the positive words and save them to a CSV file**

In [None]:





# Work on an explicit copy so the column assignments below never touch a view.
positive = positive.copy()

# Clean each lexicon word.
positive['word_clean'] = positive['word'].apply(cleaningText)

# Tokenize -> drop stop words -> stem, then join the tokens back into a
# string so the column is hashable for drop_duplicates.
positive['word_preprocessed'] = (
    positive['word_clean']
    .apply(tokenizingText)
    .apply(filteringText)
    .apply(stemmingText)
    .apply(lambda tokens: ' '.join(tokens))
)

# Keep one row per stemmed form, turn empty strings into NaN and drop rows
# whose cleaned word ended up empty.
positive = positive.drop_duplicates(subset='word_preprocessed')
nan_value = float("NaN")
positive = positive.replace("", nan_value)
positive = positive.dropna(subset=['word_clean'])

# Save to CSV
positive.to_csv(r'positive_clean.csv', encoding="utf-8", index=False, header=True, index_label=None)


In [None]:
# Preview the first ten rows of the cleaned positive lexicon.
print("Positive words after preprocessing")
positive.head(10)

**Clean the negative words and save them to a CSV file**

In [None]:

# Work on an explicit copy so the column assignments below never touch a view.
negative = negative.copy()

# Clean each lexicon word, then discard the original 'word' column.
negative['word_clean'] = negative['word'].apply(cleaningText)
negative = negative.drop(columns=['word'])

# Tokenize -> drop stop words -> stem, then join the tokens back into a
# string so the column is hashable for drop_duplicates.
negative['word_preprocessed'] = (
    negative['word_clean']
    .apply(tokenizingText)
    .apply(filteringText)
    .apply(stemmingText)
    .apply(lambda tokens: ' '.join(tokens))
)

# Keep one row per stemmed form, turn empty strings into NaN and drop rows
# whose cleaned word ended up empty.
negative = negative.drop_duplicates(subset='word_preprocessed')
nan_value = float("NaN")
negative = negative.replace("", nan_value)
negative = negative.dropna(subset=['word_clean'])

# Saving to CSV
negative.to_csv(r'negative_clean.csv', encoding="utf-8", index=False, header=True, index_label=None)


In [None]:
# Preview the first ten rows of the cleaned negative lexicon.
negative.head(10)

In [None]:
def _load_polarity_lexicon(csv_path):
    """Read a cleaned lexicon CSV and build its word -> polarity dictionary.

    Every cleaned word ('word_clean') becomes a key. When the file also has a
    'word_preprocessed' column, each stemmed form is added as an extra key
    unless that key already exists. Tweets are scored on their stemmed tokens
    ('tweet_preprocessed'), so a dictionary holding only unstemmed words
    misses every word whose stem differs from its cleaned spelling.

    Args:
        csv_path: Path of the cleaned lexicon CSV file.

    Returns:
        (lexicon_frame, polarity_by_word)
    """
    lexicon = pd.read_csv(csv_path, encoding='utf-8')
    polarity_by_word = {}
    for word, polarity in zip(lexicon['word_clean'], lexicon['polarity']):
        polarity_by_word[word.strip()] = int(polarity)
    if 'word_preprocessed' in lexicon.columns:
        # Rows whose stemmed form is empty are read back as NaN; skip them.
        stemmed = lexicon[['word_preprocessed', 'polarity']].dropna()
        for stem, polarity in zip(stemmed['word_preprocessed'], stemmed['polarity']):
            # setdefault: an existing cleaned-word key keeps its own polarity.
            polarity_by_word.setdefault(str(stem).strip(), int(polarity))
    return lexicon, polarity_by_word


# positive word dictionary: keys are positive words, values are their polarity
positive_data, dict_positive = _load_polarity_lexicon('positive_clean.csv')
positive = positive_data[['word_clean', 'polarity']]

# negative word dictionary: keys are negative words, values are their polarity
negative_data, dict_negative = _load_polarity_lexicon('negative_clean.csv')
negative = negative_data[['word_clean', 'polarity']]

# spot-check one entry of each dictionary
print("The polarity of word ممتاز:")
print(dict_positive['ممتاز'])
print("The polarity of word تعيس:")
print(dict_negative['تعيس'])

In [None]:
# Compute the polarity of a list of words.
def sentiment_analysis_dict_arabic(words_list):
    """Score a list of words against the positive and negative dictionaries.

    Args:
        words_list: The words to score.

    Returns:
        (score, polarity): `score` is the sum of the polarity values of every
        word found in `dict_positive` plus those found in `dict_negative`;
        `polarity` is 'positive' for score > 0, 'negative' for score < 0 and
        'neutral' otherwise.
    """
    # First pass adds up the positive hits; the second continues from that total.
    score = sum((dict_positive[word] for word in words_list if word in dict_positive), 0)
    score = sum((dict_negative[word] for word in words_list if word in dict_negative), score)

    polarity = 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
    return score, polarity

In [None]:
# Compute the polarity of every tweet.
# Each call returns a (score, polarity) pair; zip(*...) regroups the pairs
# into one tuple of scores (results[0]) and one tuple of labels (results[1]).
results = tweets['tweet_preprocessed'].apply(sentiment_analysis_dict_arabic)
results = list(zip(*results))
tweets['polarity_score'] = results[0]
tweets['polarity'] = results[1]

# Write the results to a file.
tweets.to_csv(r'tweets_clean_polarity.csv', encoding='utf-8', index = False, header = True,index_label=None)

In [None]:
# Plot the share of each tweet polarity.
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize = (6, 6))
# Count the tweets of each polarity once, so sizes and labels are guaranteed
# to come from the same ordering.
polarity_counts = tweets['polarity'].value_counts()
x = list(polarity_counts)
# Chart labels
labels = list(polarity_counts.index)
# Pull out the first (largest) slice only. `explode` must have exactly one
# entry per slice, so it is sized from the data rather than hard-coded to
# three classes; ax.pie raises when the lengths differ.
explode = ([0.1] + [0] * (len(x) - 1)) if x else []
# Draw the chart
ax.pie(x = x, labels = labels, autopct = '%1.1f%%', explode = explode, textprops={'fontsize': 14})
# Chart title
ax.set_title('Tweets Polarities ', fontsize = 16, pad = 20)
# Show it
plt.show()

In [None]:
# Show the most positive tweets.
pd.set_option('display.max_colwidth', 3000)
is_positive = tweets['polarity'] == 'positive'
positive_tweets = (
    tweets[is_positive][['tweet_clean', 'polarity_score', 'polarity']]
    .sort_values(by='polarity_score', ascending=False)
    .reset_index(drop=True)
)
# Number the rows from 1 so the index reads as a rank.
positive_tweets.index += 1
positive_tweets[0:10]

In [None]:
# Show the most negative tweets.
pd.set_option('display.max_colwidth', 3000)
is_negative = tweets['polarity'] == 'negative'
negative_tweets = (
    tweets[is_negative][['tweet_clean', 'polarity_score', 'polarity']]
    .sort_values(by='polarity_score', ascending=True)
    .head(10)
    .reset_index(drop=True)
)
# Number the rows from 1 so the index reads as a rank.
negative_tweets.index += 1
negative_tweets[0:10]

In [None]:
# %pip install python-bidi

In [None]:

# Word cloud
from wordcloud import WordCloud
# Libraries for shaping Arabic text and applying right-to-left display order
import arabic_reshaper
from bidi.algorithm import get_display

# Take the first MAX_CLOUD_WORDS preprocessed words. The scan stops as soon as
# enough words are collected; previously `break` only left the inner loop, so
# every remaining tweet was still visited.
MAX_CLOUD_WORDS = 100
cloud_words = []
for tweet in tweets['tweet_preprocessed']:
    remaining = MAX_CLOUD_WORDS - len(cloud_words)
    if remaining <= 0:
        break
    cloud_words.extend(tweet[:remaining])
# Same text as before: every word preceded by one space.
list_words = ''.join(' ' + word for word in cloud_words)

# Prepare the Arabic text for rendering
reshaped_text = arabic_reshaper.reshape(list_words)
artext = get_display(reshaped_text)
# Word-cloud settings
wordcloud = WordCloud(font_path='DroidSansMono.ttf', width = 600, height = 400, background_color = 'black', min_font_size = 10).generate(artext)
fig, ax = plt.subplots(figsize = (8, 6))
# Cloud title
ax.set_title('Word Cloud of Tweets', fontsize = 18)
ax.grid(False)
ax.imshow((wordcloud))
fig.tight_layout(pad=0)
ax.axis('off')
plt.show()

In [None]:
# Collect the positive words and the negative words.
def words_with_sentiment(list_words):
    """Split `list_words` into positively and negatively scored words.

    A word's net score is its value in `dict_positive` (0 when absent) plus
    its value in `dict_negative` (0 when absent). Words with a net score of
    zero appear in neither list.

    Returns:
        (positive_words, negative_words), each in input order.
    """
    positive_words, negative_words = [], []
    for word in list_words:
        net_score = dict_positive.get(word, 0) + dict_negative.get(word, 0)
        if net_score > 0:
            positive_words.append(word)
        elif net_score < 0:
            negative_words.append(word)
    return positive_words, negative_words