<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/word2vec_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount google drive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **prerequisit**

In [5]:
import pandas as pd
import numpy as np
from itertools import groupby
import re
from gensim.models import Word2Vec
from sklearn.metrics import balanced_accuracy_score

# **load dataset**

In [16]:
train_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/train_set.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/test_set.csv'
result_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/prediction.csv'
model_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/word2vec.model'
stop_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application/stop_word.csv'

In [17]:
# this function extract important word of each game
def prepare_string(string):
    result = []
    # remove html tags from string
    TAG_RE = re.compile(r'<[^>]+>')
    string = TAG_RE.sub('', string)
    # remove punctuation mark
    string = re.sub("[!()@.?؛:،-]",'', string)
    strings=string.split()
    # remove stop word
    for s in strings : 
      if not s in stop_word.values and s!='' : 
        result.append(s)
    return result

# load short list of stop word 
stop_word = pd.read_csv(stop_path ,header = None, encoding = 'utf8')

# load train set
docs = []
id_docs = []
sentences = []
word_count = []
labels = []
train_data = pd.read_csv(train_path, encoding='utf8')
for d in train_data.values:
    id_docs.append(d[0])
    p_string = prepare_string(d[1])
    docs.append(p_string)
    sentences.extend(p_string)
    word_count.append(len(p_string))
    labels.append(d[2])
distinct_words = list(set(sentences))

# split validation data
id_docs_validation = id_docs[-150:]
id_docs = id_docs[:-150]
docs_validation = docs[-150:]
docs = docs [:-150]
word_count_validation = word_count[-150:]
word_count = word_count[:-150]
labels_validation = labels[-150:]
labels = labels[:-150]


# load test set
docs_test = []
id_docs_test = []
test_data = pd.read_csv(test_path, encoding='utf8')
for d in test_data.values:
    id_docs_test.append(d[0])
    docs_test.append(prepare_string(d[1]))

print(docs[0])
print("parsing files finished")

parsing files finished


# **word2Vec Model**

In [19]:
# this func creates word2VecModel of given docs
def create_word2Vec_model(docs, len_word):
    print("start creating word2Vec model with length word = " + str(len_word))
    model = Word2Vec(docs, window=3, min_count=1, workers=7, size=len_word)
    model.save(model_path)
    print("model creation finished")
    return


# this function loads the implemented word2vec model
def loadModel(len_model):
    print("start loading models...")
    return Word2Vec.load(model_path)


# this func is used to evaluate similarity between a query and all docs and return sorted docs
def findSimilar(query_terms, model):
    results = []
    counter=0
    new_query = []
    for q in query_terms :
        if q in model.wv.vocab : new_query.append(q)
    for doc in id_docs:
        new_doc = []
        for word in docs[counter]:
            if word in model.wv.vocab : new_doc.append(word)
        similarity = model.wv.n_similarity(new_query,new_doc)
        results.append((doc, similarity))
        counter+=1
    return sorted(results, key=lambda t: t[1], reverse=True)


# this func evaluates all given queries with given precisions (@k)
def evaluate_queries(queries, model,validation):
    labels_prediction = []
    temp = 0
    for query in queries:
        res = findSimilar(query, model)
        result = []
        for r in res[:10]:
            result.append(labels[r[0]])
        group = groupby(result)
        final_tag =  max(group, key=lambda k: len(list(k[1])))
        labels_prediction.append(final_tag[0])
        temp += 1
        if validation :
          gold = labels_validation[temp]
          print("result word2vec retrieval : " + str(final_tag[0])+"   gold result : " + str(gold))
    if validation :
        print(balanced_accuracy_score(labels_validation,labels_prediction))
    else :
        data =  { 'id' : id_docs_test ,
                  'label' : labels_prediction }
        df = pd.dataFrame(data)
        df.to_csv(result_path)
    return

len_model = 300

# # create Model
create_word2Vec_model(docs, len_model)

# load model
model = loadModel(len_model)
print("model with lentgh word = " + str(len_model))

# evaluation
evaluate_queries(docs_validation, model,True)
    


start creating word2Vec model with length word = 200
model creation finished
start loading models...
model with lentgh word = 200
result word2vec retrieval : 1   gold result : 1
result word2vec retrieval : 7   gold result : 7
result word2vec retrieval : 2   gold result : 5
result word2vec retrieval : 0   gold result : 7
result word2vec retrieval : 6   gold result : 7
result word2vec retrieval : 2   gold result : 2
result word2vec retrieval : 6   gold result : 1
result word2vec retrieval : 2   gold result : 2
result word2vec retrieval : 2   gold result : 2
result word2vec retrieval : 7   gold result : 7
result word2vec retrieval : 4   gold result : 2
result word2vec retrieval : 3   gold result : 3
result word2vec retrieval : 1   gold result : 1
result word2vec retrieval : 2   gold result : 2
result word2vec retrieval : 5   gold result : 5
result word2vec retrieval : 3   gold result : 3
result word2vec retrieval : 2   gold result : 2
result word2vec retrieval : 1   gold result : 1
result

