<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Prerequisites**

In [3]:
import pandas as pd
import numpy as np
from itertools import groupby
import re
from gensim.models import Doc2Vec 
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import balanced_accuracy_score

# **load dataset**

In [4]:
# --- inputs (read with pd.read_csv in the loading cell) ---
stop_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/stop_word.csv'
train_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/train_set_1.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/test_set_1.csv'

# --- outputs ---
# model_path is a prefix: the vector size is appended to it when the model is saved/loaded.
model_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/doc2vec.model'
# CSV the predicted labels are written to.
result_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/prediction_doc2vec.csv'

In [5]:
# Patterns are compiled once here instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_PUNCTUATION_RE = re.compile("[!()@.?؛:،-]")


def prepare_string(string, stop_words=None):
    """Extract the important words of one document.

    HTML tags and the punctuation marks ``! ( ) @ . ? ؛ : ، -`` are deleted,
    the text is split on whitespace, and stop words are dropped.

    Parameters
    ----------
    string : str
        Raw document text. Any non-string value (for example the float NaN
        that pandas produces for an empty CSV cell) yields an empty list
        instead of raising TypeError inside the regex calls.
    stop_words : iterable of str, optional
        Words to discard. When omitted, every cell of the notebook-level
        ``stop_word`` DataFrame is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not isinstance(string, str):
        return []
    if stop_words is None:
        # Flatten the DataFrame so each cell becomes one stop word.
        stop_words = stop_word.values.ravel()
    # A set gives constant-time membership tests; the original scanned the
    # whole numpy array once per token.
    stop_set = set(stop_words)

    text = _HTML_TAG_RE.sub('', string)
    text = _PUNCTUATION_RE.sub('', text)
    # str.split() with no argument already strips whitespace and never yields
    # empty strings, so no extra strip()/emptiness check is needed.
    return [token for token in text.split() if token not in stop_set]

# Short stop-word list; the file is read without a header row.
stop_word = pd.read_csv(stop_path, encoding='utf8', header=None)

# Training set: column 0 goes to id_docs, column 1 is the raw text handed to
# prepare_string, column 2 goes to labels.
train_data = pd.read_csv(train_path, encoding='utf8')
train_rows = train_data.values

id_docs = [row[0] for row in train_rows]
docs = [prepare_string(row[1]) for row in train_rows]
labels = [row[2] for row in train_rows]

# Number of kept tokens per document.
word_count = [len(tokens) for tokens in docs]

# Flat stream of every kept token across all training documents, and the
# de-duplicated vocabulary built from it.
sentences = [token for tokens in docs for token in tokens]
distinct_words = list(set(sentences))

# Split validation data: the last VALIDATION_SIZE training rows are held out.
VALIDATION_SIZE = 150

# Slice at an explicit index rather than with [-n:] / [:-n]: the results are the
# same for any positive size, but [:-0] would silently empty the training set
# if VALIDATION_SIZE were ever set to 0.
split_at = max(len(docs) - VALIDATION_SIZE, 0)

# The four lists are parallel (one entry per training row), so they share the
# same split point.
id_docs, id_docs_validation = id_docs[:split_at], id_docs[split_at:]
docs, docs_validation = docs[:split_at], docs[split_at:]
word_count, word_count_validation = word_count[:split_at], word_count[split_at:]
labels, labels_validation = labels[:split_at], labels[split_at:]


# Test set: column 0 goes to id_docs_test, column 1 is tokenised with
# prepare_string into docs_test.
test_data = pd.read_csv(test_path, encoding='utf8')
test_rows = test_data.values
id_docs_test = [row[0] for row in test_rows]
docs_test = [prepare_string(row[1]) for row in test_rows]

# Quick look at the first training document and the first test id.
print(docs[0])
print(id_docs_test[0])

# **doc2Vec Model**

In [16]:
def create_doc2Vec_model(docs, len_model):
    """Train a Doc2Vec model on `docs` and save it to disk.

    Each document is tagged with its position in `docs`. The model is saved
    to `model_path` with `len_model` appended to the file name.

    Parameters
    ----------
    docs : list of list of str
        Tokenised documents.
    len_model : int
        Passed to Doc2Vec as `vector_size`.

    Returns
    -------
    None
    """
    print("start creating word2Vec model with length word = " + str(len_model))
    tagged = []
    for position, tokens in enumerate(docs):
        tagged.append(TaggedDocument(tokens, [position]))
    doc2vec = Doc2Vec(tagged, vector_size=len_model, window=3, min_count=1, workers=8)
    doc2vec.save(model_path + str(len_model))
    print("model creation finished")


def loadModel(len_model):
    """Load the Doc2Vec model stored at `model_path` + str(`len_model`)."""
    print("start loading models...")
    saved_path = model_path + str(len_model)
    return Doc2Vec.load(saved_path)


def evaluate_queries(queries, model, validation, train_labels=None, gold_labels=None):
    """Predict a label for every query from its most similar training document.

    For each query a vector is inferred with `model.infer_vector`, the most
    similar training-document tag is looked up, and the label stored at that
    tag's index in `train_labels` becomes the prediction.

    Parameters
    ----------
    queries : list of list of str
        Tokenised documents to label.
    model : Doc2Vec
        Trained model whose document tags are indices into `train_labels`.
    validation : bool
        When true, print each prediction next to its gold label and finally
        print the balanced accuracy.
    train_labels : sequence, optional
        Labels of the training documents. Defaults to the notebook-level
        `labels` list.
    gold_labels : sequence, optional
        Gold labels aligned with `queries`; only read when `validation` is
        true. Defaults to the notebook-level `labels_validation` list.

    Returns
    -------
    list
        One predicted label per query, in query order.
    """
    if train_labels is None:
        train_labels = labels
    if validation and gold_labels is None:
        gold_labels = labels_validation

    # Prefer `model.dv`; fall back to the older `model.docvecs` attribute on
    # models that do not expose `dv`.
    doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs

    labels_prediction = []
    for index, query in enumerate(queries):
        query_vector = model.infer_vector(query)
        # Top similar document tags with their similarity; only the best
        # match is used for the prediction.
        sims = doc_vectors.most_similar([query_vector])
        labels_prediction.append(train_labels[int(sims[0][0])])
        if validation:
            # Index by the query position. The original used a counter that
            # was never incremented, so it always showed the first gold label.
            gold = gold_labels[index]
            print("result doc2vec retrieval : " + str(labels_prediction[-1])+"   gold result : " + str(gold))
    if validation:
        print(balanced_accuracy_score(gold_labels, labels_prediction))
    return labels_prediction

len_model = 400

# Train the model (this also saves it to disk), then load the saved copy.
create_doc2Vec_model(docs, len_model)
model = loadModel(len_model)
print("model with lentgh word = " + str(len_model))

# evaluation
# validation = True  -> score the held-out validation documents.
# validation = False -> label the test documents and write them to result_path.
validation = True
# Pick the documents that match the mode, so that in test mode the predictions
# line up one-to-one with id_docs_test.
queries = docs_validation if validation else docs_test
labels_prediction = evaluate_queries(queries, model, validation)
print("evaluation finished... at last !")
if not validation:
    data = {'id': id_docs_test, 'label': labels_prediction}
    df = pd.DataFrame(data)
    df.to_csv(result_path, index=False)


start creating word2Vec model with length word = 450
model creation finished
start loading models...
model with lentgh word = 450
0.125
evaluation finished... at last !
