In [None]:
import fasttext
import fasttext.util
# Load the fastText binary model from the working directory. `ft` is the
# notebook-level model that sent_vectorizer() queries for word vectors.
# NOTE(review): judging by the filename this is presumably the pretrained
# 300-dimensional Persian model -- confirm the file is present before running.
ft = fasttext.load_model('cc.fa.300.bin')

In [None]:
import pandas as pd
import numpy as np
import re
import persian

In [None]:
# Read only spreadsheet column G from the workbook. Later cells access the
# data as `sharhpishnahad`, so that is assumed to be this column's header.
df = pd.read_excel(r'pishnahdkamel.XLS',usecols="G")

In [None]:
# Keep rows that have text, keep only the first occurrence of each distinct
# text, and renumber the rows 0..n-1 without retaining the old index.
df = (
    df[df['sharhpishnahad'].notna()]
    .drop_duplicates(subset='sharhpishnahad', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Load the stop-word list: whitespace-separated tokens in a UTF-8 text file.
# The context manager guarantees the file handle is closed, and the set gives
# O(1) membership tests in removing_stopwords().
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = set(stopwords_file.read().split())

In [None]:
def removing_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in the
    notebook-level ``stopwords`` collection removed.

    ``text`` is coerced with ``str()`` first; the surviving tokens are joined
    back together with single spaces.
    """
    kept = []
    for token in str(text).split():
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
def convert_char(text):
    """Pass ``str(text)`` through ``persian.convert_ar_characters`` one
    character at a time and return the concatenated result.
    """
    # Conversion is deliberately done per character (not on the whole string)
    # so the output matches what the character-wise loop has always produced.
    return ''.join(persian.convert_ar_characters(ch) for ch in str(text))

In [None]:
def sent_to_word(data):
    """Coerce ``data`` to ``str`` and return its whitespace-separated tokens
    as a list (empty list for empty or all-whitespace input).
    """
    return str(data).split()

In [None]:
def sent_vectorizer(sent, model=None):
    """Average the word vectors of a tokenised sentence.

    Parameters
    ----------
    sent : iterable of str
        Tokens of one sentence.
    model : object, optional
        Any object exposing ``get_word_vector(word)``. Defaults to the
        notebook-level fastText model ``ft``.

    Returns
    -------
    numpy.ndarray
        Sum of the retrieved word vectors divided by how many were retrieved.
        A zero-length array is returned when no vector could be retrieved
        (empty sentence or every lookup failed); later cells detect such
        sentences with ``len(vector) == 0``.
    """
    if model is None:
        model = ft  # notebook-level model loaded in an earlier cell

    total = None
    count = 0
    for word in sent:
        try:
            word_vec = model.get_word_vector(word)
        except Exception:
            # Best effort: skip tokens the model cannot embed, while letting
            # KeyboardInterrupt / SystemExit propagate (a bare ``except``
            # would swallow those too).
            continue
        total = word_vec if total is None else np.add(total, word_vec)
        count += 1

    if count == 0:
        # Explicit empty result instead of dividing an empty array by zero.
        return np.asarray([], dtype=float)
    return np.asarray(total) / count

In [None]:
# Tokenise every row: strip stop words, normalise characters with
# convert_char, strip stop words again (normalisation can turn a token into a
# listed stop word), then split into a token list.
sent_words = [
    sent_to_word(removing_stopwords(convert_char(removing_stopwords(raw_text))))
    for raw_text in df['sharhpishnahad']
]

In [None]:
# One averaged word-vector per tokenised sentence, in row order.
vec = [sent_vectorizer(tokens) for tokens in sent_words]

In [None]:
# Positions (and the corresponding zero-length vectors) of sentences for which
# no embedding could be computed.
del_row_id = [pos for pos, sentence_vec in enumerate(vec) if len(sentence_vec) == 0]
del_row_value = [vec[pos] for pos in del_row_id]

In [None]:
# Remove the rows whose sentences produced no embedding (positions collected
# in del_row_id are translated to index labels before dropping).
rows_without_vector = df.index[del_row_id]
df = df.drop(index=rows_without_vector)

In [None]:
# Renumber the remaining rows 0..n-1, discarding the old index.
df = df.reset_index(drop=True)

In [None]:
# Keep only the sentence vectors that actually contain data. The criterion
# (len == 0) mirrors the one used to collect del_row_id, so new_vec stays
# aligned with the rows left in df.
# A list-membership test (`e not in del_row_value`) is avoided on purpose: it
# falls back to `==` between NumPy arrays, and comparing a non-empty vector
# with a zero-length one is a shape-mismatched comparison that raises on
# recent NumPy releases.
new_vec = [e for e in vec if len(e) != 0]

In [None]:
# Stack the per-sentence vectors row-wise into one 2-D array
# (one row per sentence) for use as autoencoder input.
vec_array = np.vstack(new_vec)

In [None]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.layers.advanced_activations import LeakyReLU
from keras.models import load_model as load_keras_model
from tensorflow.keras.layers import BatchNormalization

In [None]:
def _dense_bn_lrelu(tensor, units):
    """Apply Dense(units) -> BatchNormalization -> LeakyReLU to ``tensor``."""
    tensor = Dense(units)(tensor)
    tensor = BatchNormalization()(tensor)
    return LeakyReLU()(tensor)


# NOTE(review): ENCODING_DIM_INPUT and ENCODING_DIM_OUTPUT_1..3 are not
# defined in any cell shown here -- confirm they are set before this cell runs.
input_vec = Input(shape=(ENCODING_DIM_INPUT,))

# Encoder: three Dense/BatchNorm/LeakyReLU stages ending at the bottleneck.
encoded = input_vec
for units in (ENCODING_DIM_OUTPUT_1, ENCODING_DIM_OUTPUT_2, ENCODING_DIM_OUTPUT_3):
    encoded = _dense_bn_lrelu(encoded, units)

# Decoder: mirrors the encoder back up to the input width.
decoded = encoded
for units in (ENCODING_DIM_OUTPUT_2, ENCODING_DIM_OUTPUT_1, ENCODING_DIM_INPUT):
    decoded = _dense_bn_lrelu(decoded, units)

# Full reconstruction model, plus an encoder-only view sharing the same layers.
autoencoder = Model(inputs=input_vec, outputs=decoded)
encoder = Model(inputs=input_vec, outputs=encoded)

In [None]:
# Print the layer-by-layer architecture and parameter counts as a sanity check.
autoencoder.summary()

In [None]:
# Train the autoencoder to reconstruct its own input.
# The targets are averaged word embeddings: real-valued and not confined to
# [0, 1]. Binary cross-entropy assumes targets/predictions are probabilities,
# so mean squared error is used as the reconstruction loss instead.
# NOTE(review): EPOCHS and BATCH_SIZE are not defined in any cell shown here --
# confirm they are set before this cell runs.
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(vec_array, vec_array, epochs=EPOCHS, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Project every sentence vector through the encoder half of the autoencoder;
# these bottleneck outputs are what gets clustered below.
encoded_vec = encoder.predict(vec_array)

In [None]:
from sklearn.cluster import KMeans

In [None]:
import matplotlib.pyplot as plt
# Elbow analysis: fit KMeans for k = 2..9 and record the within-cluster sum of
# squares (inertia) for each k.
sse = {}
for k in range(2, 10):
    # random_state is pinned (same seed as the final clustering) so the curve
    # is reproducible across kernel restarts.
    kmeans_s = KMeans(n_clusters=k, max_iter=100, random_state=1).fit(encoded_vec)
    sse[k] = kmeans_s.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
plt.show()

In [None]:
# Final clustering of the encoded sentence vectors, seeded for reproducibility
# and restarted 50 times (n_init).
# NOTE(review): n_clusters=3 is presumably read off the elbow plot -- confirm.
kmeans = KMeans(n_clusters = 3, n_init = 50, random_state=1)
kmeans.fit(encoded_vec)

In [None]:
# Cluster assignment for each row of encoded_vec, in the same order.
labels = kmeans.labels_

In [None]:
# Attach each row's cluster assignment as a new `labels` column.
df['labels'] = kmeans.labels_

In [None]:
# Cluster sizes.
df['labels'].value_counts()

In [None]:
from sklearn import metrics
# Silhouette score of the clustering, computed on the encoded vectors with
# Euclidean distance.
metrics.silhouette_score(encoded_vec, labels, metric='euclidean')

In [None]:
# Write the labelled rows to an Excel workbook, omitting the index column.
df.to_excel('Barez_1_plus_AE.xlsx', index=False)