<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/word2vec_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**

In [1]:
# Mount Google Drive at /content/drive (uses the Colab-only `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Prerequisites**

In [1]:
import pandas as pd
import numpy as np
from itertools import groupby
import re
from gensim.models import Word2Vec
from sklearn.metrics import balanced_accuracy_score

# **load dataset**

In [13]:
# Locations of every file this notebook reads or writes, all inside the same
# Google Drive folder.
(
    train_path,   # training CSV
    test_path,    # test CSV
    result_path,  # CSV the predictions are written to
    model_path,   # prefix of the saved Word2Vec model (vector length is appended)
    stop_path,    # stop-word CSV
) = (
    '/content/drive/MyDrive/Colab Notebooks/labeling_application/train_set.csv',
    '/content/drive/MyDrive/Colab Notebooks/labeling_application/test_set.csv',
    '/content/drive/MyDrive/Colab Notebooks/labeling_application/prediction3.csv',
    '/content/drive/MyDrive/Colab Notebooks/labeling_application/word2vec.model',
    '/content/drive/MyDrive/Colab Notebooks/labeling_application/stop_word.csv',
)

In [15]:
# Patterns used by prepare_string, compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_PUNCTUATION_RE = re.compile("[!()@.?؛:،-]")


# this function extracts the important words of each game description
def prepare_string(string, stop_words=None):
    """Tokenise one raw description into a list of non-stop-word tokens.

    Steps: strip HTML tags, delete the punctuation characters
    ``! ( ) @ . ? ؛ : ، -``, split on whitespace, then drop every token that
    is a stop word.

    Parameters
    ----------
    string : str
        Raw text. Any non-string value (for example the NaN pandas produces
        for an empty CSV cell) yields an empty list instead of raising.
    stop_words : DataFrame, Series or iterable of str, optional
        Words to remove. Defaults to the module-level ``stop_word`` frame.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not isinstance(string, str):
        return []

    if stop_words is None:
        stop_words = stop_word  # module-level frame, looked up at call time
    if isinstance(stop_words, (pd.DataFrame, pd.Series)):
        stop_set = set(stop_words.values.ravel())
    else:
        stop_set = set(stop_words)

    cleaned = _PUNCTUATION_RE.sub('', _HTML_TAG_RE.sub('', string))
    # str.split() with no argument never yields empty or padded tokens, so no
    # extra strip()/emptiness check is needed.  Set membership replaces a scan
    # of the whole stop-word array for every token.
    return [token for token in cleaned.split() if token not in stop_set]

# Stop words come from a header-less CSV.
stop_word = pd.read_csv(stop_path, header=None, encoding='utf8')

# --- training set --------------------------------------------------------
# Each row is read positionally: column 0 is stored as the document id,
# column 1 is the raw text handed to prepare_string, column 2 is the label.
train_data = pd.read_csv(train_path, encoding='utf8')
train_rows = train_data.values
id_docs = [row[0] for row in train_rows]
docs = [prepare_string(row[1]) for row in train_rows]
labels = [row[2] for row in train_rows]
word_count = [len(tokens) for tokens in docs]
# Despite its name, `sentences` is the flat list of every token in the corpus.
sentences = [token for tokens in docs for token in tokens]
distinct_words = list(set(sentences))

# Optional hold-out split: un-comment to move the last 150 training documents
# into the *_validation lists (evaluate_queries reads labels_validation when
# it is called with validation=True).
# id_docs_validation = id_docs[-150:]
# id_docs = id_docs[:-150]
# docs_validation = docs[-150:]
# docs = docs [:-150]
# word_count_validation = word_count[-150:]
# word_count = word_count[:-150]
# labels_validation = labels[-150:]
# labels = labels[:-150]

# --- test set ------------------------------------------------------------
# Same positional layout as above, but only id and text are used.
test_data = pd.read_csv(test_path, encoding='utf8')
test_rows = test_data.values
id_docs_test = [row[0] for row in test_rows]
docs_test = [prepare_string(row[1]) for row in test_rows]

# **word2Vec Model**

In [None]:
# this func creates the word2Vec model of the given docs and saves it to disk
def create_word2Vec_model(docs, len_model):
    """Train a Word2Vec model on `docs` and save it.

    Parameters
    ----------
    docs : list of list of str
        Tokenised documents used as training sentences.
    len_model : int
        Dimensionality of the word vectors. It is also appended to the
        module-level `model_path` to form the file name the model is saved to.

    Returns
    -------
    None
        The trained model is only written to disk; use `loadModel` to get it.
    """
    import inspect  # local import: only needed to probe the installed gensim API

    print("start creating word2Vec model with length word = " + str(len_model))
    # gensim 4 renamed the constructor keyword `size` to `vector_size`.  Use
    # whichever one the installed Word2Vec accepts so this runs on both.
    ctor_params = inspect.signature(Word2Vec.__init__).parameters
    dim_kwarg = "vector_size" if "vector_size" in ctor_params else "size"
    model = Word2Vec(docs, window=3, min_count=1, workers=8, **{dim_kwarg: len_model})
    model.save(model_path + str(len_model))
    print("model creation finished")
    return


# this function loads a word2vec model that was saved earlier
def loadModel(len_model):
    """Load and return the Word2Vec model saved for vector length `len_model`.

    The file name is the module-level `model_path` with `len_model` appended.
    """
    print("start loading models...")
    saved_model_file = "{}{}".format(model_path, len_model)
    loaded_model = Word2Vec.load(saved_model_file)
    return loaded_model


# this func scores a query against every training doc and returns the docs sorted by similarity
def findSimilar(query_terms, model):
    """Rank the training documents by similarity to a tokenised query.

    Reads the module-level `id_docs` (document ids) and `new_docs` (the
    vocabulary-filtered tokens of each document, in the same order).

    Parameters
    ----------
    query_terms : list of str
        Tokens of the query document.
    model : Word2Vec
        Trained model; only `model.wv` is used.

    Returns
    -------
    list of (doc_id, similarity)
        One entry per non-empty training document, most similar first.
    """
    # `term in model.wv` is the membership test that exists on both gensim 3
    # and gensim 4 (the `wv.vocab` attribute was removed in gensim 4).
    known_terms = [term for term in query_terms if term in model.wv]
    # NOTE(review): if no query term is in the vocabulary, n_similarity is
    # expected to raise on the first comparison, as before — confirm and
    # decide whether the caller wants a fallback.
    results = []
    for doc_id, doc_terms in zip(id_docs, new_docs):
        if not doc_terms:
            # A document with no in-vocabulary tokens cannot be compared;
            # skip it rather than letting it abort every query.
            continue
        similarity = model.wv.n_similarity(known_terms, doc_terms)
        results.append((doc_id, similarity))
    return sorted(results, key=lambda t: t[1], reverse=True)


# Write one chunk of (id, label) rows to `result_path`.
def _save_predictions(ids, predicted, first_chunk):
    frame = pd.DataFrame({'id': ids, 'label': predicted})
    if first_chunk:
        # First chunk (re)creates the file and writes the header row.
        frame.to_csv(result_path, index=False)
    else:
        frame.to_csv(result_path, mode='a', header=None, index=False)


# this func predicts a label for every given query (nearest training doc) and reports/saves the result
def evaluate_queries(queries, model, validation, dist_save=20):
    """Label each query with the label of its most similar training document.

    Reads the module-level `id_docs`, `labels`, `id_docs_test`,
    `result_path` and, when `validation` is true, `labels_validation`
    (in this notebook that list is only defined by the commented-out
    validation split, which must be enabled first).

    An earlier variant took a majority vote over the ten nearest documents;
    only the single nearest neighbour is used here.

    Parameters
    ----------
    queries : list of list of str
        Tokenised documents to classify.
    model : Word2Vec
        Passed through to `findSimilar`.
    validation : bool
        True: print prediction vs. gold label for each query and the balanced
        accuracy at the end; nothing is written to disk.
        False: print each prediction and write `id,label` rows to
        `result_path`.
    dist_save : int, optional
        Number of predictions between two writes to `result_path`.

    Returns
    -------
    list
        Predicted label of every query, in input order.
    """
    # findSimilar returns document *ids*; map them to labels explicitly rather
    # than using the id as a position in `labels`, which is only correct when
    # the ids happen to be 0..n-1 in file order.
    label_by_id = dict(zip(id_docs, labels))
    labels_prediction = []
    saved = 0  # number of predictions already written to result_path

    for temp, query in enumerate(queries):
        res = findSimilar(query, model)
        labels_prediction.append(label_by_id[res[0][0]])

        if validation:
            gold = labels_validation[temp]
            print("result word2vec retrieval : " + str(labels_prediction[-1]) + "   gold result : " + str(gold))
        else:
            print("evaluate result " + str(id_docs_test[temp]) + "  predicted label : " + str(labels_prediction[-1]))
            if temp % dist_save == 0:
                _save_predictions(id_docs_test[saved:temp + 1], labels_prediction[saved:], saved == 0)
                if saved:
                    print(">>>> save result from the begining to data number " + str(id_docs_test[temp]))
                saved = temp + 1

    if validation:
        print(balanced_accuracy_score(labels_validation, labels_prediction))
    elif saved < len(labels_prediction):
        # Flush the predictions made after the last periodic save; without
        # this the tail of the test set never reaches the result file.
        last = len(labels_prediction)
        _save_predictions(id_docs_test[saved:last], labels_prediction[saved:], saved == 0)
        print(">>>> save result from the begining to data number " + str(id_docs_test[last - 1]))
    return labels_prediction

len_model = 200

# create model (trains on the training docs and overwrites the saved file on every run)
create_word2Vec_model(docs, len_model)

# load model
model = loadModel(len_model)
print("model with lentgh word = " + str(len_model))

# make data ready: keep, for every training doc, only the words the model knows.
# `word in model.wv` is the membership test available on both gensim 3 and
# gensim 4 (`model.wv.vocab` was removed in gensim 4).
new_docs = [[word for word in doc if word in model.wv] for doc in docs]

# evaluation: validation=False labels the test set and writes the result file
validation = False
labels_prediction = evaluate_queries(docs_test, model, validation)
print("evaluation finished... at last !")



start creating word2Vec model with length word = 200
model creation finished
start loading models...
model with lentgh word = 200
evaluate result 40000  predicted label : 6
evaluate result 40001  predicted label : 1
evaluate result 40002  predicted label : 5
evaluate result 40003  predicted label : 9
evaluate result 40004  predicted label : 3
evaluate result 40005  predicted label : 2
evaluate result 40006  predicted label : 2
evaluate result 40007  predicted label : 9
evaluate result 40008  predicted label : 9
evaluate result 40009  predicted label : 3
evaluate result 40010  predicted label : 9
evaluate result 40011  predicted label : 4
evaluate result 40012  predicted label : 2
evaluate result 40013  predicted label : 9
evaluate result 40014  predicted label : 5
evaluate result 40015  predicted label : 3
evaluate result 40016  predicted label : 2
evaluate result 40017  predicted label : 8
evaluate result 40018  predicted label : 2
evaluate result 40019  predicted label : 9
evaluate r