<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount google drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **prerequisit**

In [30]:
import pandas as pd
import numpy as np
from itertools import groupby
import re
from gensim.models import Doc2Vec 
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import balanced_accuracy_score

# **load dataset**

In [32]:
train_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/train_set.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/test_set.csv'
result_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/prediction_doc2vec2.csv'
model_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/doc2vec.model'
stop_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/stop_word.csv'

In [33]:
# this function extract important word of each game
def prepare_string(string):
    result = []
    # remove html tags from string
    TAG_RE = re.compile(r'<[^>]+>')
    string = TAG_RE.sub('', string)
    # remove punctuation mark
    string = re.sub("[!()@.?؛:،-]",'', string)
    strings=string.split()
    # remove stop word
    for s in strings : 
      s = s.strip()
      if not s in stop_word.values and s!='' : 
        result.append(s)
    return result

# load short list of stop word 
stop_word = pd.read_csv(stop_path ,header = None, encoding = 'utf8')

# load train set
docs = []
id_docs = []
sentences = []
word_count = []
labels = []
train_data = pd.read_csv(train_path, encoding='utf8')
for d in train_data.values:
    id_docs.append(d[0])
    p_string = prepare_string(d[1])
    docs.append(p_string)
    sentences.extend(p_string)
    word_count.append(len(p_string))
    labels.append(d[2])
distinct_words = list(set(sentences))

# # split validation data
# id_docs_validation = id_docs[-150:]
# id_docs = id_docs[:-150]
# docs_validation = docs[-150:]
# docs = docs [:-150]
# word_count_validation = word_count[-150:]
# word_count = word_count[:-150]
# labels_validation = labels[-150:]
# labels = labels[:-150]


# load test set
docs_test = []
id_docs_test = []
test_data = pd.read_csv(test_path, encoding='utf8')
for d in test_data.values:
    id_docs_test.append(d[0])
    docs_test.append(prepare_string(d[1]))

print(docs[0])
print(id_docs_test[0])

['بازی', 'مین', 'برنامه', 'فکری', 'باید', 'مین', 'موجود', 'صفحه', 'بازی', 'کشف', 'قابلیت', 'برنامه', 'عبارتند', 'دارای', 'سطوح', 'دشواری', 'آسان', 'سخت', 'امکان', 'تعریف', 'بازی', 'سفارشی', 'نمایش', 'آمار', 'بازی', 'عملکرد', 'ذخیره', 'بازی', 'ناتمام', 'میتوانید', 'آینده', 'قسمت', 'بازی', 'ذخیره', 'مراجعه', 'بازی', 'ادامه', 'دهید', 'سایر', 'برنامه', 'بازی', 'متنوع', 'نیز', 'دیدن', 'سپاسگزارم']
40000


# **doc2Vec Model**

In [35]:
# this func creates Doc2VecModel of given docs
def create_doc2Vec_model(docs, len_model,save_model):
    print("start creating doc2Vec model with length word = " + str(len_model))
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(docs)]
    model = Doc2Vec(documents, window=3, min_count=1, workers=8, vector_size=len_model)
    if save_model:
      model.save(model_path+str(len_model))
    print("model creation finished")
    return


# this function loads the implemented word2vec model
def loadModel(len_model):
    print("start loading models...")
    return Doc2Vec.load(model_path+str(len_model))


# this func evaluates all given queries 
def evaluate_queries(queries, model,validation):
    labels_prediction = []
    temp = 0
    for query in queries:
      new_queries=model.infer_vector(query) 
      sims = model.docvecs.most_similar([new_queries]) #top 10 document tags and their cosine similarity
      # labels_prediction.append(labels[int(sims[0][0])])
      result = []
      for sim in sims :
        result.append(labels[int(sim[0])])
      group = groupby(result)
      final_tag =  max(group, key=lambda k: len(list(k[1])))
      labels_prediction.append(final_tag[0])
      # if validation :
      #   gold = labels_validation[temp]
      #   print("result doc2vec retrieval : " + str(labels_prediction[-1])+"   gold result : " + str(gold))
      # else :
      #   print("evaluate result "+str(id_docs_test[temp])+"  predicted label : "+str(labels_prediction[-1]))
      temp += 1
    if validation :
        print(balanced_accuracy_score(labels_validation,labels_prediction))
    return labels_prediction

len_model = 300

# create Model
create_doc2Vec_model(docs, len_model,True)

# load model
model = loadModel(len_model)
print("model with lentgh word = " + str(len_model))

# evaluation
validation = False
test_file = docs_test
if validation :
  test_file = docs_validation
labels_prediction = evaluate_queries(test_file, model,validation)
print("evaluation finished... at last !")
if not validation :
  data =  { 'id' : id_docs_test ,'label' : labels_prediction }
  df = pd.DataFrame(data)
  df.to_csv(result_path, index=False)


start creating doc2Vec model with length word = 300
model creation finished
start loading models...
model with lentgh word = 300
evaluation finished... at last !


find best param

In [29]:

len_models  = [200,250,300,325,350,375,400,450,500,550] 
for len_model in len_models :
  # create Model
  create_doc2Vec_model(docs, len_model,False)

  # # load model
  # model = loadModel(len_model)
  # print("model with lentgh word = " + str(len_model))

  # evaluation
  validation = True
  test_file = docs_test
  if validation :
    test_file = docs_validation
  labels_prediction = evaluate_queries(test_file, model,validation)

start creating doc2Vec model with length word = 200
model creation finished




0.5753633067970496
start creating doc2Vec model with length word = 250
model creation finished
0.600412707865068
start creating doc2Vec model with length word = 300
model creation finished
0.6496947506514673
start creating doc2Vec model with length word = 325
model creation finished
0.5749519421634385
start creating doc2Vec model with length word = 350
model creation finished
0.5707735595335106
start creating doc2Vec model with length word = 375
model creation finished
0.5476104658625187
start creating doc2Vec model with length word = 400
model creation finished
0.5841532642016877
start creating doc2Vec model with length word = 450
model creation finished
0.5480276138537907
start creating doc2Vec model with length word = 500
model creation finished
0.6029696981908165
start creating doc2Vec model with length word = 550
model creation finished
0.5471142164632417
