In [None]:
import os
import pandas as pd
import itertools
import numpy as np
import string
import re
import matplotlib.pyplot as plt
from sklearn import preprocessing
import matplotlib
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,SpatialDropout1D
from keras.layers.embeddings import Embedding

import nltk

def removeDiacretics(news_list):
    """Return a new list with Arabic diacritics removed from each string.

    Every item of ``news_list`` is passed through a regex substitution that
    deletes the short-vowel/tanwin/shadda/sukun marks and the tatweel
    (kashida) elongation character. The input list is left untouched and
    the order of items is preserved.

    NOTE: a second definition with the same name appears further down in
    this file and replaces this one once the module has been executed.
    """
    # re.VERBOSE: whitespace and '#' comments inside the pattern are ignored,
    # so the pattern is simply an alternation of the nine characters.
    diacritics_re = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return [diacritics_re.sub('', item) for item in news_list]



def get_top_n_words(corpus, n=None):
    """
    Rank the vocabulary of a text corpus by total occurrence count.

    A CountVectorizer (default settings) is fitted on ``corpus``, the
    per-document counts are summed per term, and the resulting
    ``(word, count)`` pairs are returned sorted by count, highest first.
    The list is sliced to its first ``n`` entries; with ``n=None`` the
    whole ranking is returned. The sort is stable, so words with equal
    counts keep the order in which the vectorizer's vocabulary yields them.

    Example (order among ties may differ):

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vectorizer = CountVectorizer().fit(corpus)
    # Column-wise sum over all documents -> a 1 x vocabulary_size matrix.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (term, totals[0, column])
        for term, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def normalize_arabic(text):
    """Map common Arabic letter variants onto one canonical letter each.

    The substitutions are applied in the listed order: the alef forms
    become bare alef, alef maqsura becomes ya, hamza-on-waw and
    hamza-on-ya become a standalone hamza, ta marbuta becomes ha, and
    gaf becomes kaf. The normalized string is returned.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Pattern used by remove_diacritics(). The function previously read this name
# as a global although it was only ever bound as a local variable inside
# removeDiacretics(), so every call raised NameError.
# U+0640 is the tatweel (kashida); U+064B..U+0652 are, in order, fathatan,
# dammatan, kasratan, fatha, damma, kasra, shadda and sukun -- the same nine
# characters removeDiacretics() strips.
arabic_diacritics = re.compile("[\u0640\u064b-\u0652]")


def remove_diacritics(text):
    """Return ``text`` with Arabic diacritics and tatweel characters removed.

    Args:
        text: the string to clean.

    Returns:
        A copy of ``text`` with every character matched by the module-level
        ``arabic_diacritics`` pattern deleted.
    """
    return arabic_diacritics.sub('', text)

# Characters deleted by remove_punctuations(): ASCII punctuation plus the
# Arabic comma (U+060C), semicolon (U+061B) and question mark (U+061F).
# NOTE(review): the function referenced `punctuations_list` without it being
# defined anywhere visible, so it raised NameError on every call. This is a
# conservative default -- extend it if more symbols should be stripped.
punctuations_list = string.punctuation + "\u060c\u061b\u061f"


def remove_punctuations(text):
    """Return ``text`` with every character of ``punctuations_list`` removed.

    Args:
        text: the string to clean.

    Returns:
        A copy of ``text`` without punctuation; all other characters,
        including whitespace, are kept unchanged.
    """
    # The third maketrans argument lists characters to delete; no character
    # is remapped.
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    """Collapse each run of one repeated character to a single occurrence.

    The pattern's '.' does not match a newline, so consecutive newlines are
    left as they are.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)






# Arabic stop words from NLTK's stopwords corpus, stored as a set.
# normalizeDF() drops every token that appears in this set.
stop_words = set(stopwords.words('arabic'))

# The two triple-quoted blocks below are disabled scratch code kept as bare
# string literals: they are evaluated as expressions and discarded, so none
# of the statements inside them ever runs.
'''
word_tokens = word_tokenize(data)

print(word_tokens)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

print(word_tokens)

print(filtered_sentence)

'''




# Disabled loop-based variant of the same stop-word filtering.
'''
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

'''



def removeDiacretics(news_list):
    """Strip Arabic diacritics and tatweel from each string of an iterable.

    ``news_list`` is iterated once; every item has the vowel marks, tanwin,
    shadda, sukun and tatweel characters deleted. The cleaned strings are
    returned as a new list in the same order.
    """
    # Verbose-mode regex: the surrounding whitespace and the '#' comments are
    # not part of the match, only the nine alternated characters are.
    strip_marks = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE).sub

    return list(map(lambda entry: strip_marks('', entry), news_list))




def normalizeArabic(text):
    """Unify alef, alef-maqsura and seated-hamza variants in ``text``.

    Applied in order: every alef form (including alef wasla) becomes a bare
    alef, alef maqsura becomes ya, and hamza-on-waw / hamza-on-ya become a
    standalone hamza. Returns the rewritten string.
    """
    # Insertion order of the dict is the order in which the rules run.
    rules = {
        "[إأٱآا]": "ا",
        "ى": "ي",
        "ؤ": "ء",
        "ئ": "ء",
    }
    for variants, canonical in rules.items():
        text = re.sub(variants, canonical, text)
    return text


# Disabled one-off script, kept as a bare string literal (never executed).
# It walks FakeNews/data/<folder>/<file>, collects (file text, folder name)
# pairs and writes them to dataset.csv.
'''
fake = []
for folder in os.listdir("FakeNews/data"):
    for file in os.listdir("FakeNews/data/"+str(folder)):

        with open('FakeNews/data/'+folder+"/"+file, "r", encoding="utf8") as f:
            txt = f.read().rstrip()
            print(file)
            fake.append((txt,folder))

            #preprocessing

df = pd.DataFrame(fake)
print(df.shape)

df.to_csv("dataset.csv", sep=',',index=False)

'''



from nltk.stem import SnowballStemmer

# Arabic Snowball stemmer used by normalizeDF().
ar_stemmer = SnowballStemmer("arabic")

# The script reads the 'text' and 'label' columns of these files further down.
trainDF1 = pd.read_csv("dataset.csv", delimiter=",", encoding='utf-8')
trainDF2 = pd.read_csv("dataset2.csv", delimiter=",", encoding='utf-8')

# Combine both datasets into one frame. (A redundant third read of
# dataset.csv into trainDF was removed: it was overwritten immediately.)
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of the two inputs repeat, so a label no longer identifies one row.
trainDF = pd.concat([trainDF1, trainDF2], ignore_index=True)

# Number of rows per class, printed as a quick sanity check.
print(trainDF.loc[trainDF['label'] == 'trusted'].shape)
print(trainDF.loc[trainDF['label'] == 'untrusted'].shape)

def _normalize_text(text):
    """Clean one cell of the 'text' column; non-string values pass through."""
    if not isinstance(text, str):
        # e.g. NaN for a missing text -- left untouched.
        return text

    without_marks = removeDiacretics([text])[0]
    tokens = word_tokenize(without_marks)
    kept = (token for token in tokens if token not in stop_words)
    return ' '.join(ar_stemmer.stem(token) for token in kept)


def normalizeDF(df):
    """Normalize the 'text' column of ``df`` in place and return ``df``.

    For every string value the diacritics are removed, the text is tokenized
    with ``word_tokenize``, tokens found in ``stop_words`` are dropped, the
    remaining tokens are stemmed with ``ar_stemmer`` and joined back with
    single spaces. Non-string values are kept as they are.

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        The same DataFrame object, with its 'text' column replaced.
    """
    # The previous implementation used the iterrows() label as an iloc
    # position and wrote through `df.iloc[i].text = ...`. That reads the
    # wrong rows when index labels are not 0..n-1 (e.g. after pd.concat) and
    # assigns to a temporary row object, so the result may never reach `df`.
    # Assigning a plain list is purely positional and always updates `df`.
    df['text'] = [_normalize_text(value) for value in df['text']]
    return df






# Normalize the text column and save the cleaned dataset to to_train.csv.
# normalizeDF() returns the frame it was given, so filtered_df and trainDF
# are the same object.
filtered_df = normalizeDF(trainDF)

filtered_df.to_csv('to_train.csv', sep=',')

print("filtered")

# One-hot labels for the full dataset.
# NOTE(review): Y is reassigned from train_y further down before any live
# code reads this value; only the commented-out split below used it.
Y = pd.get_dummies(trainDF['label']).values



# train_x, test_x, train_y, test_y = train_test_split(trainDF['text'], Y)
# Split with train_test_split's defaults: no test_size and no random_state
# are passed, so the split differs between runs.
train_x, test_x, train_y, test_y = train_test_split(trainDF['text'], trainDF['label'])

# Disabled label-encoding step, kept as a bare string literal (never executed).
'''
# label encode the target variable
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
print(train_y.shape)
test_y = encoder.fit_transform(test_y)
'''


print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)



# Only referenced inside the disabled LSTM experiment further down.
max_fatures = 500


# word level tf-idf
# Word-level TF-IDF limited to 5000 features. astype('U') converts every
# value to a unicode string before vectorizing.
# NOTE(review): the vectorizer is fitted on the whole dataset, i.e. including
# the rows that ended up in test_x -- confirm this is intended.
tfidf_vect = TfidfVectorizer(analyzer='word',max_features=5000)
tfidf_vect.fit(trainDF['text'].values.astype('U'))
xtrain_tfidf = tfidf_vect.transform(train_x.values.astype('U')).toarray()
xvalid_tfidf = tfidf_vect.transform(test_x.values.astype('U')).toarray()

print(xtrain_tfidf.shape)
print(xvalid_tfidf.shape)



# Bag-of-words (raw term counts) limited to 5000 features.
# NOTE(review): like the TF-IDF vectorizer, this one is fitted on the whole
# dataset, including the rows in test_x -- confirm this is intended.

count_vec = CountVectorizer(analyzer='word', max_features=5000)
# fit() returns the vectorizer itself; count_train is kept for compatibility.
count_train = count_vec.fit(trainDF['text'].values.astype('U'))
bag_of_words_train = count_vec.transform(train_x.values.astype('U')).toarray()
# Fixed: the test matrix was previously built from train_x, which made it an
# exact copy of the training matrix.
bag_of_words_test = count_vec.transform(test_x.values.astype('U')).toarray()



print(bag_of_words_train.shape)
print(bag_of_words_test.shape)




# Disabled LSTM experiment, kept as a bare string literal (never executed).
# It is the only place that refers to max_fatures.
'''
Y = pd.get_dummies(train_y)


embed_dim = 64
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = xtrain_tfidf.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())


batch_size = 32
model.fit(xtrain_tfidf, Y, epochs=7, batch_size=batch_size)




'''

import keras

# One-hot encode the labels for the softmax / categorical cross-entropy head.
Y = pd.get_dummies(train_y)
Y_t = pd.get_dummies(test_y)

# Conv1D expects (samples, steps, channels): each bag-of-words vector is fed
# as a sequence of n_features steps with one channel. The sizes are taken
# from the matrix itself instead of the former hard-coded 7398 x 5000, which
# only matched one particular dataset and split.
n_samples, n_features = bag_of_words_train.shape

model = Sequential()

model.add(Conv1D(128, 5, padding='same',
                 input_shape=(n_features, 1)))
model.add(Activation('relu'))
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(MaxPooling1D(pool_size=8))
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Flatten())
# Two output units: one per column of the one-hot label matrix.
model.add(Dense(2))
model.add(Activation('softmax'))
# RMSprop is the optimizer's class name; the lower-case `rmsprop` spelling
# used before is only an alias in some Keras releases.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# The last 20% of the training samples are held out for validation.
cnnhistory = model.fit(
    bag_of_words_train.reshape((n_samples, n_features, 1)),
    Y,
    batch_size=32,
    epochs=20,
    validation_split=0.20,
)

model.save('cnnmodel.h5')




  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


(0, 2)
(0, 2)
filtered
(7398,) (7398,)
(2466,) (2466,)
(7398, 5000)
(2466, 5000)
