<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**

In [2]:
# Mount Google Drive at /content/drive (Colab only); every data path used
# by this notebook lives below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Prerequisites**

In [4]:
!pip install hickle



In [5]:
import nltk as nltk
import pandas as pd
import numpy as np
from itertools import groupby
import re
import pickle
import hickle as hkl
from sklearn.metrics import balanced_accuracy_score

# **load dataset**

In [2]:
# All input and output files live in one Drive folder; edit base_path to
# relocate the whole project instead of touching every path.
base_path = '/content/drive/MyDrive/Colab Notebooks/labeling_application'

train_path = base_path + '/train_set.csv'      # training CSV (id, text, label columns)
test_path = base_path + '/test_set_1.csv'      # test CSV (only used by the commented-out loader)
results_path = base_path + '/result.csv'
model_path = base_path + '/DataModel.txt'      # target of the commented-out pickle dump/load
TF_path = base_path + '/TFModel.txt'           # target of the commented-out hickle dump/load
stop_path = base_path + '/stop_word.csv'       # stop-word list read with pandas

In [6]:
def prepare_string(string):
    """Extract the significant words from the raw text of one game.

    HTML tags and a fixed set of punctuation marks are removed, the text is
    split on single spaces, and tokens that are empty or present in the
    module-level ``stop_word`` table are discarded.

    Returns the remaining tokens as a list, in their original order.
    """
    # strip html tags
    without_tags = re.compile(r'<[^>]+>').sub('', string)
    # strip punctuation marks
    cleaned = re.sub("[!()@.?؛:،-]", '', without_tags)
    # drop stop words and the empty tokens left by consecutive spaces
    return [token for token in cleaned.split(" ")
            if token not in stop_word.values and token != '']

# load short list of stop word
stop_word = pd.read_csv(stop_path, header=None, encoding='utf8')

# number of trailing training rows that are held out as the validation split
validation_size = 150

# load train set: column 0 is the document id, column 1 the raw text,
# column 2 the label
docs = []
id_docs = []
sentences = []
word_count = []
labels = []
train_data = pd.read_csv(train_path, encoding='utf8')
for d in train_data.values:
    id_docs.append(d[0])
    p_string = prepare_string(d[1])
    docs.append(p_string)
    sentences.extend(p_string)
    word_count.append(len(p_string))
    labels.append(d[2])
# the vocabulary is built before the split, so it also covers validation words
distinct_words = list(set(sentences))

# split validation data
# An explicit split index (instead of x[-n:] / x[:-n]) stays correct when
# validation_size is 0 or larger than the dataset.
split_at = max(len(docs) - validation_size, 0)
id_docs_validation = id_docs[split_at:]
id_docs = id_docs[:split_at]
docs_validation = docs[split_at:]
docs = docs[:split_at]
word_count_validation = word_count[split_at:]
word_count = word_count[:split_at]
labels_validation = labels[split_at:]
labels = labels[:split_at]

# # load test set
# docs_test = []
# id_docs_test = []
# test_data = pd.read_csv(test_path, encoding='utf8')
# for d in test_data.values:
#     id_docs_test.append(d[0])
#     docs_test.append(prepare_string(d[1]))

print("parsing files finished")

parsing files finished


# **TF-IDF**

In [None]:
def calculate_TF(TF_array, docs, distinct_words):
    """Add per-document term frequencies to ``TF_array`` in place.

    Args:
        TF_array: 2-D numeric array indexed as ``[word row, document column]``.
            Counts are added to whatever values it already holds.
        docs: sequence of documents, each a sequence of word tokens. The
            position of a document is its column in ``TF_array``.
        distinct_words: vocabulary (list or NumPy array of words). The
            position of a word is its row in ``TF_array``.

    Words that do not occur in ``distinct_words`` are ignored.

    Returns:
        None; the result is written into ``TF_array``.
    """
    # Build the word -> row(s) lookup once instead of scanning the whole
    # vocabulary with np.where for every token. A list of rows is kept per
    # word so a vocabulary with repeated entries updates all of its rows.
    rows_by_word = {}
    for row, word in enumerate(distinct_words):
        rows_by_word.setdefault(word, []).append(row)

    for doc_id, doc in enumerate(docs):
        for word in doc:
            rows = rows_by_word.get(word)
            if rows:
                TF_array[rows, doc_id] += 1
    return


def calculate_CF(CF, distinct, text):
    """Add the collection frequency of every vocabulary word to ``CF`` in place.

    Args:
        CF: array with one entry (row) per vocabulary word; the number of
            occurrences of ``distinct[i]`` over all documents is added to
            ``CF[i]``.
        distinct: sequence of vocabulary words aligned with the rows of ``CF``.
        text: sequence of documents, each a sequence of word tokens.

    Returns:
        None; the result is written into ``CF``.
    """
    # Count every token once, rather than rescanning all documents for each
    # vocabulary word.
    totals = {}
    for doc in text:
        for word in doc:
            totals[word] = totals.get(word, 0) + 1

    for i in range(len(CF)):
        CF[i] += totals.get(distinct[i], 0)



# np.transpose turns the vocabulary list into a NumPy array
distinct_words = np.transpose(distinct_words)

# CF holds one row per vocabulary word; TF_array holds one row per
# vocabulary word and one column per document id in id_docs
CF = np.zeros((len(distinct_words), 1))
TF_array = np.zeros((len(distinct_words), len(id_docs)))

calculate_CF(CF, distinct_words, docs)
print("calculating CF finished")

calculate_TF(TF_array, docs, distinct_words)
print("calculate TF finished")

# with open(model_path, 'wb') as f:
#     pickle.dump((id_docs, docs, distinct_words, word_count,labels , CF, id_docs_test, docs_test), f)
# hkl.dump(TF_array, TF_path)
# print("built pickle file")

# **Unigram Model**

In [None]:
# Unigram-model settings. Loading a previously saved data model is disabled
# (see the commented-out lines below), so the variables built by the earlier
# cells are used directly.
# u_coefficient is the smoothing weight read by sim_of_query_to_doc.
u_coefficient = 50
# with open(model_path, "rb") as f:
#     id_docs, docs, distinct_words, word_count ,labels, CF, id_docs_test, docs_test = pickle.load(f)
# TF_array = hkl.load(TF_path)
# total number of words over all documents listed in word_count
corpus_size = np.sum(word_count)
print("load files finished")


In [1]:
def _query_row_indices(query_terms):
    """Return the vocabulary row indices, in vocabulary order, of the words found in the query."""
    wanted = set(query_terms)
    return [idx for idx, word in enumerate(distinct_words) if word in wanted]


def createTFForQuery(query_terms):
    """Select the rows of ``TF_array`` that belong to the words of a query.

    Args:
        query_terms: iterable of word tokens.

    Returns:
        A list with one ``TF_array`` row per vocabulary word that occurs in
        ``query_terms``, in vocabulary order.
    """
    return [TF_array[idx, :] for idx in _query_row_indices(query_terms)]


def sim_of_query_to_doc(docID, tempTF, tempCF=None):
    """Score one document against a query.

    For every query word with a positive collection frequency the factor
    ``(tf + u_coefficient * cf / corpus_size) / (doc_length + u_coefficient)``
    is multiplied into the score.

    Args:
        docID: column of the document in the TF rows and its position in
            ``word_count``.
        tempTF: TF rows of the query words, as returned by ``createTFForQuery``.
        tempCF: collection-frequency entries aligned one-to-one with
            ``tempTF``. When omitted, the first ``len(tempTF)`` rows of the
            global ``CF`` are used, which is the previous behaviour; those rows
            only match the query words by coincidence, so callers should pass
            the aligned values.

    Returns:
        The product of the per-word factors, or 0 when no query word has a
        positive collection frequency.
    """
    if tempCF is None:
        tempCF = CF
    similarity = 1
    matched = False
    # NOTE(review): a product of many small factors can underflow to 0 for
    # long queries; summing logarithms would avoid that but changes the
    # returned value, so it is left as is — confirm with the callers.
    for i in range(len(tempTF)):
        if tempCF[i] > 0:
            matched = True
            arg = (tempTF[i][docID] + ((u_coefficient * tempCF[i]) / corpus_size)) / (word_count[docID] + u_coefficient)
            similarity *= arg
    if not matched:
        similarity = 0
    return similarity


def evalOneQuery(query_terms):
    """Rank all documents in ``id_docs`` against a query.

    Args:
        query_terms: iterable of word tokens.

    Returns:
        A list of ``(document id, similarity)`` tuples sorted by similarity,
        best match first.
    """
    rows = _query_row_indices(query_terms)
    tempTF = [TF_array[idx, :] for idx in rows]
    # Take CF from the same vocabulary rows as the TF subset so that every
    # query word is smoothed with its own collection frequency.
    tempCF = [CF[idx] for idx in rows]
    results = [(doc, sim_of_query_to_doc(position, tempTF, tempCF))
               for position, doc in enumerate(id_docs)]
    return sorted(results, key=lambda t: t[1], reverse=True)



def evaluate_queries(queries, validation, top_k=10):
    """Predict a label for every query by majority vote over its best matches.

    Each query is ranked against the training documents with
    ``evalOneQuery``; the most frequent label among the ``top_k`` best
    documents becomes the prediction. Ties go to the label whose first
    document is ranked highest.

    Args:
        queries: sequence of queries, each a list of word tokens.
        validation: when true, ``queries`` is taken to be aligned with
            ``labels_validation``; each prediction is printed next to its gold
            label and the balanced accuracy is printed at the end. Otherwise
            the predictions are written to ``prediction.csv`` together with
            ``id_docs_test``.
        top_k: number of best-ranked documents that take part in the vote.

    Returns:
        None.
    """
    # Resolve labels through the document id rather than using the id as a
    # list position, so ids need not be 0-based consecutive integers.
    label_by_id = dict(zip(id_docs, labels))
    labels_prediction = []
    for position, query in enumerate(queries):
        ranked = evalOneQuery(query)
        # Count labels over the whole top-k. itertools.groupby only groups
        # consecutive equal items, so it miscounts on an unsorted list.
        votes = {}
        for doc_id, _ in ranked[:top_k]:
            tag = label_by_id[doc_id]
            votes[tag] = votes.get(tag, 0) + 1
        final_tag = max(votes, key=votes.get)
        if validation:
            gold = labels_validation[position]
            print("result TF-IDF retrieval : " + str(final_tag) + "   gold result : " + str(gold))
        else:
            # no gold labels exist for the test set
            print("result TF-IDF retrieval : " + str(final_tag))
        labels_prediction.append(final_tag)
    if validation:
        print(balanced_accuracy_score(labels_validation, labels_prediction))
    else:
        data = {'id': id_docs_test,
                'label': labels_prediction}
        df = pd.DataFrame(data)
        df.to_csv('/content/drive/MyDrive/Colab Notebooks/labeling_application/prediction.csv')
    return

# score the held-out validation documents and print the balanced accuracy
evaluate_queries(docs_validation, validation=True)

NameError: ignored