In [None]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [None]:
# Load the dataset; later cells in this notebook read its 'spam', 'clean_text',
# 'num_emojis', 'num_links' and 'raw_text' columns.
df = pd.read_csv('combined_cleaned_data.csv')

In [None]:
# EDA plan:
# - histograms of the number of words, hashtags and emojis
# - word clouds
# - TF-IDF within and between


In [None]:
def plot_histogram(data_class1, data_class2, range, value, bins=10, class1_label='Class 1', class2_label='Class 2', save=False, fn='default'):
    """
    Draw two side-by-side histograms (one per class) that share a y-axis.

    Parameters:
        data_class1 (array-like): Values drawn in the left panel.
        data_class2 (array-like): Values drawn in the right panel.
        range: Forwarded unchanged as the ``range`` argument of ``hist`` for
            both panels (callers in this notebook pass ``(low, high)`` tuples).
        value (str): Text used as the x-axis label of both panels.
        bins (int): Number of bins for each histogram. Default is 10.
        class1_label (str): Title of the left panel; also passed to ``hist`` as
            the artist label. Default is 'Class 1'.
        class2_label (str): Title of the right panel; also passed to ``hist`` as
            the artist label. Default is 'Class 2'.
        save (bool): When true, the figure is written to ``f"{fn}.png"`` before
            it is shown. Default is False.
        fn (str): File name stem used when ``save`` is true. Default is 'default'.
    """
    fig, axs = plt.subplots(1, 2, sharey=True, tight_layout=True)

    # Pair each axis with the data and label it should show, then treat both panels uniformly.
    panels = (
        (axs[0], data_class1, class1_label),
        (axs[1], data_class2, class2_label),
    )
    for panel_ax, panel_data, panel_label in panels:
        panel_ax.hist(panel_data, bins=bins, range=range, label=panel_label)
        panel_ax.title.set_text(panel_label)
        panel_ax.set_xlabel(value)

    # Save before show(): the file is written while the figure is still the current one.
    if save:
        plt.savefig(f"{fn}.png")
    plt.show()


In [None]:
# 'clean_text' stores each tweet's token list serialized as a string (other cells
# in this notebook parse it with ast.literal_eval), so the string has to be parsed
# before len() yields a word count instead of a character count.
s_num_words = df.loc[df['spam'] == 1, 'clean_text'].map(lambda x: len(ast.literal_eval(x)))
ns_num_words = df.loc[df['spam'] == 0, 'clean_text'].map(lambda x: len(ast.literal_eval(x)))

In [None]:
# Compare the s_num_words / ns_num_words series side by side; written to hist_words.png.
plot_histogram(
    data_class1=s_num_words,
    data_class2=ns_num_words,
    range=(0, 200),
    value="number of words per tweet",
    bins=50,
    class1_label="Spam",
    class2_label="Not Spam",
    save=True,
    fn="hist_words",
)

In [None]:
# Emoji counts for spam vs. not-spam tweets; written to hist_emojis.png.
plot_histogram(
    data_class1=df.loc[df['spam'] == 1, 'num_emojis'],
    data_class2=df.loc[df['spam'] == 0, 'num_emojis'],
    range=(0, 20),
    value="number of emojis",
    bins=20,
    class1_label="Spam",
    class2_label="Not Spam",
    save=True,
    fn="hist_emojis",
)

In [None]:
# Link counts for spam vs. not-spam tweets; written to hist_links.png.
plot_histogram(
    data_class1=df.loc[df['spam'] == 1, 'num_links'],
    data_class2=df.loc[df['spam'] == 0, 'num_links'],
    range=(0, 5),
    value="number of links",
    bins=5,
    class1_label="Spam",
    class2_label="Not Spam",
    save=True,
    fn="hist_links",
)

In [None]:
# creating word clouds
import arabic_reshaper
from wordcloud import WordCloud

In [None]:
# Draw a reproducible random subset of not-spam tweets and parse their token lists.
import ast

NS_SAMPLE_SIZE = 30000  # upper bound on the number of not-spam tweets used
NS_SAMPLE_SEED = 42     # fixed seed so a re-run picks the same tweets

ns_clean_text = df.loc[df['spam'] == 0, 'clean_text']
# Cap n at the number of available rows: sampling without replacement raises
# ValueError when n is larger than the population.
x = ns_clean_text.sample(
    n=min(NS_SAMPLE_SIZE, len(ns_clean_text)),
    random_state=NS_SAMPLE_SEED,
).tolist()

# Each entry is a Python-literal string; show one before and after parsing.
print(x[0])
y = ast.literal_eval(x[0])
print(y)

# One parsed object per sampled tweet (the next cell flattens these into a single string).
words_2d = [ast.literal_eval(j) for j in x]
print(words_2d[1])

In [None]:
from bidi.algorithm import get_display
# Flatten the per-tweet word lists in a single pass. sum(words_2d, []) builds a
# new, longer list for every tweet, which is quadratic in the total word count.
joined_words = ' '.join(word for words in words_2d for word in words)



In [None]:
import re
def removeWeirdChars(text):
    """
    Return ``text`` with every character matched by the class below removed.

    The class is assembled from literal code points and code-point ranges
    (emoji blocks, assorted symbols, zero-width and directional control marks).
    """
    # NOTE(review): the U+24C2-U+1F251 piece is one contiguous range, so it also
    # matches everything in between (e.g. CJK and the Arabic presentation-form
    # blocks) - confirm that this is intended.
    char_class = "".join((
        "[",
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u'\U00010000-\U0010ffff',
        u"\u200d",
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\u3030",
        u"\ufe0f",
        u"\u2069",
        u"\u2066",
        u"\u200c",
        u"\u2068",
        u"\u2067",
        "]+",
    ))
    return re.sub(char_class, r'', text, flags=re.UNICODE)

# Strip the unwanted characters, then pass the text through arabic_reshaper and
# bidi's get_display before handing it to WordCloud.
text = get_display(arabic_reshaper.reshape(removeWeirdChars(joined_words)))
wordcloud = WordCloud(font_path='fonts/NotoNaskhArabic/NotoNaskhArabic-Regular.ttf').generate(text)


In [None]:
# Write the word cloud built in the previous cell to an image file.
wordcloud.to_file("ns_wordcloud.png")

In [None]:


def create_word_cloud(pd_series, filename, font_path='fonts/NotoNaskhArabic/NotoNaskhArabic-Regular.ttf'):
    """
    Render a word cloud from serialized token lists and write it to an image file.

    Parameters:
        pd_series (pandas Series): Each entry is a string that
            ``ast.literal_eval`` parses into a list of word strings.
        filename (str): Path of the image file to write.
        font_path (str): Font file handed to ``WordCloud``. Defaults to the
            Noto Naskh Arabic path this notebook uses elsewhere.

    Returns:
        None. The image is written to ``filename``.
    """
    # Parse and flatten in one pass; sum(list_of_lists, []) would re-copy the
    # growing result for every tweet.
    joined_words = ' '.join(
        word
        for serialized in pd_series.tolist()
        for word in ast.literal_eval(serialized)
    )

    # Reuse the notebook-level removeWeirdChars helper rather than a nested copy of it.
    text = arabic_reshaper.reshape(removeWeirdChars(joined_words))
    text = get_display(text)

    wordcloud = WordCloud(font_path=font_path).generate(text)
    wordcloud.to_file(filename)

    
    


In [None]:
# Word cloud over every spam tweet (no sampling in this call).
create_word_cloud(df[df['spam'] == 1]['clean_text'], "spam_wordcloud.png")


In [None]:
# Show full cell contents so the raw tweets are not truncated when displayed.
pd.set_option('display.max_colwidth', None)

# Spam rows whose 'clean_text' contains the given term. The `in` test runs on
# each 'clean_text' value as stored.
filtered_df = df[
    df['spam'].eq(1)
    & df['clean_text'].map(lambda words: 'سلمان' in words)
]
display(filtered_df['raw_text'])