In [18]:
import numpy as np
import os
import pandas as pd
import re
import ftfy
import nltk
from nltk.corpus import stopwords
from nltk import PorterStemmer
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Conv1D, Dense, Input, LSTM, Embedding, Dropout, Activation, MaxPooling1D, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tokenizer
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import seaborn as sn
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import time
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import precision_recall_fscore_support
import seaborn
import matplotlib.pyplot as plt
from numpy import savetxt
import joblib 
from sklearn.svm import SVC
from tensorflow.keras.models import model_from_json
import scipy.sparse as sp
import joblib
import pickle

In [9]:
## 01. Incarcarea datelor

comments_df = '../Dataset/DFTrain_v1.csv'
dataset = pd.read_csv(comments_df,  header = 1, names=['Tweets', 'Labels'])
dataset = shuffle(dataset,random_state=1)

## 02. Tratarea null-urilor
dataset[pd.isnull(dataset)]  = 'NaN'
print(dataset.head())


                                                  Tweets   Labels
11818  The least you can do when you find out someone...  Anxiety
19159  if tv makes ME forget that people who have bip...  Bipolar
28110  - name: alex\n- email: astitchhx@gmail.com\n- ...      NaN
18877  also iâ€™m aware i need a haircut  my face loo...  Bipolar
19223  I've always believed in the power of sharing s...  Bipolar


In [10]:
## 03. Functia de curatare a textului din retelele de socializare cu stemming si eliminare Stop Words
def cleanComments(corpus):
    comm_cleaned = []
    for comm in corpus:
        comm = str(comm)
        comm = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(https:\\.*)|(www.*)|(http:\\.*)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", comm).split())
        
        comm = ftfy.fix_text(comm)
        comm = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", comm).split())

        stop_words = set(stopwords.words('english'))
        word_tokens = nltk.word_tokenize(comm)
        filtered_sentence = [w for w in word_tokens if not w in stop_words]
        
        comm = ' '.join(filtered_sentence)

        comm = PorterStemmer().stem(comm)

        comm_cleaned.append(comm)

    return comm_cleaned


In [11]:
def splitDataSet(dataset):
    ## Impartirea setului de date (70 % date de antrenare, 30 % date de testare)
    split_train = round(len(dataset) *0.7)
    split_validation = split_train + round(len(dataset) *0.3)
    
    ## Separarea si curatarea comentariilor
    x_train = cleanComments([x for x in dataset['Tweets'][:split_train]])
    y_train = [y for y in dataset['Labels'][:split_train]]

    x_val =cleanComments([x for x in dataset['Tweets'][split_train:split_validation]])
    y_val = [y for y in dataset['Labels'][split_train:split_validation]]
    
    return (x_train, y_train, x_val, y_val)

In [13]:
## 0.4 Definirea functiei de stabilire a simbolurilor folosite in corpusul de antrenare

def get_vocabulary_char_level(X_train_str):
    txt = ''
    for doc in X_train_str:
        for s in doc:
            txt += s
    return set(txt)

## 0.5 Definirea functiei de creare a tokenizer-ului, la nivel de caracter
def create_tokenizer_char_level(X_train_str, chars):

    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')

    tk.fit_on_texts(X_train_str)

    char_dict = {}
    for i, char in enumerate(chars):
        char_dict[char] = i + 1

    tk.word_index = char_dict

    tk.word_index[tk.oov_token] = len(char_dict.values()) + 1
    return tk

## 0.6 Definirea functiei de procesare a comentariilor folosind tokenizer-ul creat anterior
## si uniformizarea lungimii secventelor la 280 de caractere 
def preprocess_dataset(dataset, tk):
    sequences = tk.texts_to_sequences(dataset)

    proc_data = pad_sequences(sequences, maxlen=280, padding='post')
    
    proc_data = np.array(proc_data)

    return proc_data

## 0.7 Definirea functiei ce exporta intr-un fisier csv raportul de clasificare al modelelor
def classification_report_to_csv(ground_truth, predictions, full_path):

    labels = unique_labels(ground_truth, predictions)

    precision, recall, f_score, support = precision_recall_fscore_support(ground_truth, predictions, labels=labels, average=None)
    
    results_pd = pd.DataFrame({"class": labels, "precision": precision, "recall": recall, "f_score": f_score, "support": support})

    results_pd.to_csv(full_path, index=False)
    
 ## 0.8 Definirea  functiei de antrenare al modelului   
def model(model,title,X_train,y_train, X_test, y_test):
    classifier = model
    ## Monitorizarea timpului de executie pentru fiecare model antrenat    
    start = time.process_time()
    trainedmodel=classifier.fit(X_train, y_train)
    print(time.process_time() - start)
    
    y_pred = classifier.predict(X_test)
    ## Afisarea valorii acuratetei modelului
    print('Pentru modelul #', str(title) ,accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    ## Crearea unui fisier csv cu scorurile modelului - pentru partea de raportare   
    classification_report_to_csv(y_test, y_pred,str(title)+'_CR'+'.csv')

    return trainedmodel

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anca.hiliuta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [15]:
(x_train, y_train,  x_test, y_test) = splitDataSet(dataset)

In [19]:
chars = get_vocabulary_char_level(x_train)
tk = create_tokenizer_char_level(x_train, chars)  
maxlen = 280
vocab_size = len(tk.word_index)
print("vocabulary: ", tk.word_index)
print('Vocabular Size is ', vocab_size)
# Salvarea tokenizer-ului pentru a fi folosit ulterior in aplicatia web
data_dir = '../result'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
with open('../result/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tk, handle, protocol=pickle.HIGHEST_PROTOCOL)


vocabulary:  {'8': 1, 'p': 2, 'u': 3, 'A': 4, 'y': 5, 'o': 6, 'r': 7, '5': 8, '1': 9, 's': 10, 't': 11, 'z': 12, 'g': 13, 'c': 14, '7': 15, '3': 16, '4': 17, 'x': 18, '9': 19, 'm': 20, '2': 21, 'n': 22, 'w': 23, '0': 24, 'h': 25, 'q': 26, 'f': 27, 'i': 28, 'k': 29, 'l': 30, '6': 31, ' ': 32, 'v': 33, 'a': 34, 'j': 35, 'e': 36, 'b': 37, 'I': 38, 'd': 39, 'UNK': 40}
Vocabular Size is  40


In [20]:
# 09. Crearea Setului de date procesat - la nivel de caracter - pentru partea de DL
train_data = preprocess_dataset(x_train, tk)
test_data = preprocess_dataset(x_test, tk)


In [21]:
c = {'NaN':0, 'Depression':1, 'Bipolar':2, 'PTSD':3, 'Anxiety':4}
integer_mapping = {x: i for i,x in enumerate(c)}
y_train_one_hot= [integer_mapping[word] for word in y_train]
y_test_one_hot = [integer_mapping[word] for word in y_test]


In [22]:
## 10. Crearea unei functii de parsare a modelelor corespunzatoare invatarii automate

vec = TfidfVectorizer()
X = vec.fit_transform(x_train)
trainx = vec.transform(x_train)
testx = vec.transform(x_test)

In [24]:
## 11. RANDOM FOREST 
trainedRandomForest = model(RandomForestClassifier(n_estimators=280, n_jobs =3,criterion ='gini',class_weight='balanced_subsample', min_samples_split = 18,verbose = 1), 'Random forest', trainx,y_train_one_hot,testx,y_test_one_hot)

## 11.1 Salvarea modelului intr-un fisier pickle 
data_dir = '../result/Random Forest'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
joblib.dump(trainedRandomForest, '../result/Random Forest/trainedRandomForest.pkl')

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.1s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   19.1s
[Parallel(n_jobs=3)]: Done 280 out of 280 | elapsed:   30.4s finished
[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.1s


77.171875


[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 280 out of 280 | elapsed:    1.1s finished


Pentru modelul # Random forest 0.9026382602936306
              precision    recall  f1-score   support

           0       0.81      0.89      0.85      1568
           1       0.95      0.89      0.92      2298
           2       0.92      0.89      0.90      1544
           3       0.93      0.88      0.91      1543
           4       0.90      0.94      0.92      2106

    accuracy                           0.90      9059
   macro avg       0.90      0.90      0.90      9059
weighted avg       0.91      0.90      0.90      9059



['../result/Random Forest/trainedRandomForest.pkl']

In [25]:
## 11.2 RANDOM FOREST - antrenare cu diferiti parametri
trainedRandomForest_standard = model(RandomForestClassifier(), 'Random forest standard', trainx,y_train_one_hot,testx,y_test_one_hot)
trainedRandomForest_1 = model(RandomForestClassifier(n_estimators=280,criterion='gini',min_samples_split=18), 'Random forest 1', trainx,y_train_one_hot,testx,y_test_one_hot)

49.625
Pentru modelul # Random forest standard 0.9015343856937852
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1568
           1       0.91      0.90      0.91      2298
           2       0.93      0.89      0.91      1544
           3       0.94      0.89      0.91      1543
           4       0.91      0.94      0.93      2106

    accuracy                           0.90      9059
   macro avg       0.90      0.90      0.90      9059
weighted avg       0.90      0.90      0.90      9059

50.75
Pentru modelul # Random forest 1 0.9039629098134452
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      1568
           1       0.93      0.91      0.92      2298
           2       0.94      0.88      0.91      1544
           3       0.93      0.88      0.91      1543
           4       0.90      0.95      0.93      2106

    accuracy                           0.90      9059
   macr

In [26]:
## 11. ROCCHIO
%time
data_dir = '../result/Rocchio'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
trainedRocchio = model(NearestCentroid(),'Rocchio',trainx,y_train_one_hot,testx,y_test_one_hot)
joblib.dump(trainedRocchio, '../result/Rocchio/trainedRocchio.pkl') 

Wall time: 0 ns
0.09375
Pentru modelul # Rocchio 0.8577105640799205
              precision    recall  f1-score   support

           0       0.67      0.92      0.77      1568
           1       0.92      0.84      0.88      2298
           2       0.94      0.85      0.89      1544
           3       0.95      0.83      0.88      1543
           4       0.88      0.86      0.87      2106

    accuracy                           0.86      9059
   macro avg       0.87      0.86      0.86      9059
weighted avg       0.88      0.86      0.86      9059



['../result/Rocchio/trainedRocchio.pkl']

In [27]:
## 13. NAIVE BAYES MULTINOMIAL
%time
data_dir = '../result/Multinomial NB'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
trainedMultinomialNB = model(MultinomialNB(alpha=0.6,fit_prior=False),'MultiNomial NB',trainx,y_train_one_hot,testx,y_test_one_hot)
joblib.dump(trainedMultinomialNB, '../result/Multinomial NB/trainedMultinomialNB.pkl') 


Wall time: 0 ns
0.03125
Pentru modelul # MultiNomial NB 0.7335246715973065
              precision    recall  f1-score   support

           0       0.92      0.60      0.73      1568
           1       0.65      0.75      0.70      2298
           2       0.87      0.68      0.77      1544
           3       0.86      0.68      0.76      1543
           4       0.63      0.88      0.74      2106

    accuracy                           0.73      9059
   macro avg       0.79      0.72      0.74      9059
weighted avg       0.77      0.73      0.73      9059



['../result/Multinomial NB/trainedMultinomialNB.pkl']

In [28]:
## 13.1 NAIVE BAYES MULTINOMIAL Standard
%time
trainedMultinomialNB = model(MultinomialNB(),'MultiNomial NB standard',trainx,y_train_one_hot,testx,y_test_one_hot)

Wall time: 0 ns
0.046875
Pentru modelul # MultiNomial NB standard 0.6698311071862236
              precision    recall  f1-score   support

           0       0.95      0.47      0.63      1568
           1       0.55      0.80      0.65      2298
           2       0.93      0.55      0.69      1544
           3       0.91      0.50      0.64      1543
           4       0.59      0.89      0.71      2106

    accuracy                           0.67      9059
   macro avg       0.79      0.64      0.67      9059
weighted avg       0.75      0.67      0.67      9059



In [29]:
## 14. LINEAR SVC
%time
data_dir = '../result/SVC'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
trainedSVC = model(LinearSVC(multi_class= 'crammer_singer',max_iter= 10000,class_weight = 'balanced',dual = False), 'Linear SVC', trainx, y_train_one_hot, testx, y_test_one_hot)
joblib.dump(trainedSVC, '../result/SVC/trainedSVC.pkl') 

Wall time: 0 ns
5.78125
Pentru modelul # Linear SVC 0.9067225963130588
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1568
           1       0.94      0.91      0.92      2298
           2       0.91      0.89      0.90      1544
           3       0.91      0.89      0.90      1543
           4       0.92      0.93      0.93      2106

    accuracy                           0.91      9059
   macro avg       0.90      0.90      0.90      9059
weighted avg       0.91      0.91      0.91      9059



['../result/SVC/trainedSVC.pkl']

In [30]:
## 14.1 SVC standard
trainedSVC_standard = model(SVC(), 'SVC standard', trainx, y_train_one_hot, testx, y_test_one_hot)

122.96875
Pentru modelul # SVC standard 0.9039629098134452
              precision    recall  f1-score   support

           0       0.77      0.94      0.85      1568
           1       0.94      0.91      0.92      2298
           2       0.97      0.86      0.91      1544
           3       0.96      0.85      0.90      1543
           4       0.92      0.94      0.93      2106

    accuracy                           0.90      9059
   macro avg       0.91      0.90      0.90      9059
weighted avg       0.91      0.90      0.91      9059



In [31]:
## 15. XGB
%time
data_dir = '../result/XGB'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
trainedXGB = model(XGBClassifier(max_depth=10, n_estimators=100, nthread= 3), 'XGB', trainx,y_train_one_hot,testx,y_test_one_hot)
joblib.dump(trainedXGB, '../result/XGB/trainedXGB.pkl') 

Wall time: 0 ns




279.828125
Pentru modelul # XGB 0.9216249034109725
              precision    recall  f1-score   support

           0       0.80      0.95      0.86      1568
           1       0.97      0.93      0.95      2298
           2       0.93      0.89      0.91      1544
           3       0.94      0.88      0.91      1543
           4       0.96      0.95      0.96      2106

    accuracy                           0.92      9059
   macro avg       0.92      0.92      0.92      9059
weighted avg       0.93      0.92      0.92      9059



['../result/XGB/trainedXGB.pkl']

In [32]:
## 15.1 XGB STandard
trainedXGB_standard = model(XGBClassifier(), 'XGB standard', trainx,y_train_one_hot,testx,y_test_one_hot)

194.125
Pentru modelul # XGB standard 0.9211833535710343
              precision    recall  f1-score   support

           0       0.78      0.97      0.87      1568
           1       0.97      0.92      0.95      2298
           2       0.95      0.89      0.92      1544
           3       0.94      0.87      0.90      1543
           4       0.96      0.95      0.95      2106

    accuracy                           0.92      9059
   macro avg       0.92      0.92      0.92      9059
weighted avg       0.93      0.92      0.92      9059



In [34]:
## 16. Initializarea matricei de embedding 
def load_embedding_weights(tk, vocab_size):
    embedding_weights = []

    embedding_weights.append(np.zeros(vocab_size))

    for char, i in tk.word_index.items():
        onehot = np.zeros(vocab_size)
        onehot[i - 1] = 1
        embedding_weights.append(onehot)

    embedding_weights = np.array(embedding_weights)

    return embedding_weights

embedding_size = vocab_size

embedding_weights = load_embedding_weights(tk, vocab_size)

print("embedding weights shape: ", embedding_weights.shape)
print("embedding weights: \n", embedding_weights)

input_size = 280
num_of_classes = 5
dropout_p = 0.2
optimizer = 'adam'
loss = "sparse_categorical_crossentropy"

embedding weights shape:  (41, 40)
embedding weights: 
 [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [35]:
conv_layers = [[128, 7, 3], [128, 5, -1], [128, 3, -1], [128, 3, 3]]
def build_model():
    ## 17. Crearea modelului folosind constructorul Sequential
    model = Sequential()

    ## 17.1 Primul strat al retelei convolutionale: Embedding Layer
    model.add(Embedding(vocab_size + 1, embedding_size, input_length=input_size, weights=[embedding_weights]))

    ## 17.2 Adaugarea celor 4 straturi convolutionale si a celor 2 straturi de agregare
    for filter_num, filter_size, pooling_size in conv_layers:
        model.add(Conv1D(filter_num, filter_size, activation='relu'))
        if pooling_size != -1:
            model.add(MaxPooling1D(pool_size=pooling_size,strides = 3))

    ## 17.3 Stratul dintre cel convolutional si cel dens este de tip flatten
    model.add(Flatten())

    ## 17.4 Adaugarea stratului de neuroni complet conectati
    model.add(Dense(256, activation='relu'))

    ## 17.5 Pentru a evita supra-ajutarea modelului, am eliminat aleator 20% neuroni
    model.add(Dropout(dropout_p))

    ## 17.6 Adaugarea ultimului strat de tip dens, ce calculeaza probabilitatea claselor cu ajutorul functiei de activare softmax 
    model.add(Dense(5, activation='softmax'))

    ## 17.7 Compilarea modelului cu optimizatorul "adam", functia de minimizare a erorii de "sparse_categorical_crossentropy"
    model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    ## 17.8 Afisarea detaliilor retelei create
    model.summary()

    return model

In [36]:
## 18. Antrenarea si evaluarea modelului creat 
y_test_one_hot= np.array(y_test_one_hot)
y_train_one_hot = np.array(y_train_one_hot)

model_dp = build_model()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 280, 40)           1640      
                                                                 
 conv1d (Conv1D)             (None, 274, 128)          35968     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 91, 128)          0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 87, 128)           82048     
                                                                 
 conv1d_2 (Conv1D)           (None, 85, 128)           49280     
                                                                 
 conv1d_3 (Conv1D)           (None, 83, 128)           49280     
                                                        

In [37]:
## 19. Antrenarea retelei convolutionale si afisarea timpului de antrenare
start = time.process_time()

history = model_dp.fit(train_data, y_train_one_hot, validation_data=(test_data, y_test_one_hot), batch_size=64, epochs=5, verbose=False)

print(time.process_time() - start)



1236.53125


In [38]:
## 20. Salvarea retelei neuronale convolutionale antrenate intr-un fisier de tip json
data_dir = '../result/CNN'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
model_json = model_dp.to_json()
with open('../result/CNN/model_CNN.json', "w") as json_file:
    json_file.write(model_json)
    
## 21. Salvarea ponderilor retelei intr-un fisier de tip HDF5
model_dp.save_weights('../result/CNN/model_CNN.h5')
print("Modelul s-a salvat cu succes")


Modelul s-a salvat cu succes


In [39]:
## 22. Testarea retelei convolutionale
loss, accuracy = model_dp.evaluate(test_data, y_test_one_hot, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

predict_labels = model_dp.predict(test_data)

Testing Accuracy:  0.8934


In [40]:
 ## 23. Crearea raportului de sinteza al retelei 
final_label_prediction=[]

for j in range(0,len(predict_labels)):
    max= 0
    for i in range(0,len(predict_labels[j])):
        if predict_labels[j][i] > max:
            max = predict_labels[j][i]
            i_max= i
    final_label_prediction.append(i_max)

## 23.1 Crearea si salvarea raportului de clasificare al retelei neuronale
print(classification_report(y_test_one_hot, final_label_prediction))
classification_report_to_csv(y_test_one_hot, final_label_prediction,'CNN_CR.csv')

## 23.2 Crearea si stocarea intr-un fisier csv a matricei de confuzie al CNN
y_true = pd.Series(y_test_one_hot, name="Actual")
y_pred = pd.Series(final_label_prediction, name="Predicted")
df_confusion = pd.crosstab(y_true, y_pred)
print (df_confusion)
df_confusion.to_csv('../result/CNN/Matriceadeconfuzie.csv')


## 24. Incarcarea modelului json si recrearea modelului CNN
json_file = open('../result/CNN/model_CNN.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

## 24.1 Incarcarea ponderilor retelei convolutionale create
loaded_model.load_weights('../result/CNN/model_CNN.h5')
print("Incarcarea modelului de pe disk")
model_CNN = model_from_json(loaded_model_json)


              precision    recall  f1-score   support

           0       0.74      0.94      0.83      1568
           1       0.93      0.89      0.91      2298
           2       0.93      0.85      0.89      1544
           3       0.95      0.84      0.89      1543
           4       0.93      0.93      0.93      2106

    accuracy                           0.89      9059
   macro avg       0.90      0.89      0.89      9059
weighted avg       0.90      0.89      0.89      9059

Predicted     0     1     2     3     4
Actual                                 
0          1472    50    28     6    12
1           108  2044    31    26    89
2           162    36  1320     9    17
3           175    20    14  1296    38
4            60    45    19    21  1961
Incarcarea modelului de pe disk


In [41]:
## 25. Initializare parametrilor idf si a vocabularului
idfs = np.array(vec.idf_)
vocabulary = vec.vocabulary_

## 25.1 Crearea unei functii de initializare a acestor valori pe TF-IDF vectorize pentru a salva parametrii
class MyVectorizer(TfidfVectorizer):
    TfidfVectorizer.idf_ = idfs
    TfidfVectorizer.vocabulary_ = vocabulary
    
vectorizer = MyVectorizer(lowercase = False,  min_df = 2,  norm = 'l2',smooth_idf = True)

vectorizer._tfidf._idf_diag = sp.spdiags(idfs, diags = 0, m = len(idfs),n = len(idfs))

## 25.2 Salvarea in fisere de tip pickle a parametrii vectorizarii
with open ('../result/vocabulary_all.pkl', 'wb') as f:
    pickle.dump(vectorizer.vocabulary_,f)
with open ('../result/idfs_all.pkl', 'wb') as f:
    pickle.dump(vectorizer.idf_,f)


In [45]:
## 26. Introducerea comentariului de verificat
x_test_nou = ["My depression period is fine now, hope for the best :)"]


## 27. Testarea modelului CNN 
test_data_nou = preprocess_dataset(x_test_nou, tk)
predict_labels_nou = model_CNN.predict(test_data_nou)

prob_max=[]

for j in range(0,len(predict_labels_nou)):
    max= 0
    for i in range(0,len(predict_labels_nou[j])):
        if predict_labels_nou[j][i] > max:
            max = predict_labels_nou[j][i]
            i_max= i
    prob_max.append(i_max)
    
c = {'NaN':0, 'Depression':1, 'Bipolar':2, 'PTSD':3, 'Anxiety':4}
integer_mapping = {i: x for i,x in enumerate(c)}
final_prediction_CNN = [integer_mapping[word] for word in prob_max]



## 28. Initializarea obiectului de tip TF-IDF Vectorizer cu parametrii: vocabulary si idfs
vectorizer = TfidfVectorizer(decode_error='ignore')
vocabulary = pickle.load(open('../result/vocabulary_all.pkl', mode = 'rb'))
idfs = pickle.load(open('../result/idfs_all.pkl', mode = 'rb'))

vectorizer.vocabulary_ = vocabulary
vectorizer.idf_ = idfs

## 29. Transformarea datelor de intrare

testx= vec.transform(x_test_nou)

## 30. Incarcarea modelelor apriori antrenate
model_RF = joblib.load('../result/Random Forest/trainedRandomForest.pkl') 
model_XGB = joblib.load('../result/XGB/trainedXGB.pkl') 
model_SVC = joblib.load('../result/SVC/trainedSVC.pkl') 
model_Rocchio = joblib.load('../result/Rocchio/trainedRocchio.pkl') 
model_MNB = joblib.load('../result/Multinomial NB/trainedMultinomialNB.pkl') 

## 31. Afisarea predictiei
y_RF = model_RF.predict(testx)
y_XGB = model_XGB.predict(testx)
y_SVC = model_SVC.predict(testx)
y_ROCCHIO = model_Rocchio.predict(testx)
y_MNB = model_MNB.predict(testx)

## 32. Maparea indicilor numerici cu tulburarea afectiva
c = {'NaN':0, 'Depression':1, 'Bipolar':2, 'PTSD':3, 'Anxiety':4}
integer_mapping = {i: x for i,x in enumerate(c)}
final_prediction_RF = [integer_mapping[word] for word in y_RF]
final_prediction_XGB = [integer_mapping[word] for word in y_XGB]
final_prediction_SVC = [integer_mapping[word] for word in y_SVC]
final_prediction_ROCCHIO = [integer_mapping[word] for word in y_ROCCHIO]
final_prediction_MNB = [integer_mapping[word] for word in y_MNB]

for comms in x_test_nou:
    print('Comentariul: ', comms, ' este catalogat de \n')
    print(' Algoritmul Forestier Aleator: ',final_prediction_RF)
    print(' XGB: ',final_prediction_XGB)
    print(' SVC Liniar: ',final_prediction_SVC)
    print(' ROCCHIO: ',final_prediction_ROCCHIO)
    print(' Naive Bayes Multinomial: ',final_prediction_MNB)
    print(' CNN : ',final_prediction_CNN)

[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:    0.0s
[Parallel(n_jobs=3)]: Done 280 out of 280 | elapsed:    0.1s finished


Comentariul:  My depression period is fine now, hope for the best :)  este catalogat de 

 Algoritmul Forestier Aleator:  ['Depression']
 XGB:  ['Depression']
 SVC Liniar:  ['Depression']
 ROCCHIO:  ['Depression']
 Naive Bayes Multinomial:  ['Depression']
 CNN :  ['Depression']
