<a href="https://colab.research.google.com/github/zahraDehghanian97/classify_text/blob/master/tf_idf%2Bvoting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount Google Drive**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used by this
# notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# **Prerequisites**

In [24]:
import re
from collections import Counter
from itertools import groupby

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

# **load dataset**

In [45]:
# Every input and output file of this notebook lives in one Drive folder;
# the individual paths are derived from it.
data_dir = '/content/drive/MyDrive/Colab Notebooks/labeling_application'

train_path = f'{data_dir}/train_set.csv'        # labelled training rows
test_path = f'{data_dir}/test_set.csv'          # rows to predict
result_path = f'{data_dir}/prediction_all.csv'  # where predictions are written
model_path = f'{data_dir}/tfidf.model'
stop_path = f'{data_dir}/stop_word.csv'         # stop-word list

In [17]:
# Clean one raw description and keep only its informative words.
def prepare_string(string, stop_words=None):
    """Strip HTML tags, punctuation marks and stop words from ``string``.

    Parameters
    ----------
    string : str
        Raw text. Any non-string value (for example NaN or None standing in
        for a missing description) is treated as empty text instead of
        raising inside ``re.sub``.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_word``
        DataFrame is used, so existing one-argument calls behave as before.

    Returns
    -------
    str
        The remaining words, each preceded by a single space (a non-empty
        result therefore starts with a space); ``""`` when nothing remains.
    """
    if not isinstance(string, str):
        return ""
    if stop_words is None:
        # Flatten the DataFrame so every cell counts as one stop word.
        stop_words = stop_word.values.ravel()
    # A set gives constant-time lookups instead of scanning the whole
    # stop-word array once per token.
    banned = set(stop_words)
    # remove html tags from string
    string = re.sub(r'<[^>]+>', '', string)
    # remove punctuation marks (the trailing '-' in the class is a literal hyphen)
    string = re.sub("[!()@.?؛:،-]", '', string)
    # split() already discards empty tokens and surrounding whitespace
    kept = [word for word in string.split() if word not in banned]
    return "".join(" " + word for word in kept)

# Load the short stop-word list into a DataFrame.
# header=None makes pandas treat the first row as data rather than as column
# names; prepare_string() checks each token against stop_word.values.
stop_word = pd.read_csv(stop_path ,header = None, encoding = 'utf8')

# load train set
# Rows are read positionally: column 0 feeds id_docs, column 1 is the raw text
# cleaned by prepare_string(), column 2 feeds labels.
docs = []        # cleaned text of each document
id_docs = []     # document ids
sentences = []   # every kept word of every document, in order
word_count = []  # number of kept words per document
labels = []      # target label of each document
train_data = pd.read_csv(train_path, encoding='utf8')
for d in train_data.values:
    id_docs.append(d[0])
    p_string = prepare_string(d[1])
    tokens = p_string.split()
    docs.append(p_string)
    # list.extend() on the string itself would add one entry per character,
    # and len() of the string counts characters; use the token list for both.
    sentences.extend(tokens)
    word_count.append(len(tokens))
    labels.append(d[2])
# distinct_words = list(set(sentences))

# Hold out the trailing 150 training rows as the validation split; the
# remaining leading rows stay in the training lists.
holdout = slice(-150, None)
kept = slice(None, -150)

# Each right-hand side is evaluated in full before the names are rebound,
# so both slices are taken from the complete, unsplit list.
id_docs_validation, id_docs = id_docs[holdout], id_docs[kept]
docs_validation, docs = docs[holdout], docs[kept]
word_count_validation, word_count = word_count[holdout], word_count[kept]
labels_validation, labels = labels[holdout], labels[kept]


# Read the test set: column 0 is the id, column 1 the raw text, which is
# cleaned with the same prepare_string() used for the training rows.
test_data = pd.read_csv(test_path, encoding='utf8')
test_rows = test_data.values
id_docs_test = [row[0] for row in test_rows]
docs_test = [prepare_string(row[1]) for row in test_rows]

# Quick sanity check: first cleaned training document and first test id.
print(docs[0])
print(id_docs_test[0])

 بازی مین برنامه فکری باید مین موجود صفحه بازی کشف قابلیت برنامه عبارتند دارای سطوح دشواری آسان سخت امکان تعریف بازی سفارشی نمایش آمار بازی عملکرد ذخیره بازی ناتمام میتوانید آینده قسمت بازی ذخیره مراجعه بازی ادامه دهید سایر برنامه بازی متنوع نیز دیدن سپاسگزارم
40000


# **tf-idf**

In [21]:
# Fit the TF-IDF vocabulary and weights on the training documents only, then
# map the validation and test documents with that same fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs)
X_validation, X_test_f = (
    vectorizer.transform(corpus) for corpus in (docs_validation, docs_test)
)


# **Model**

In [47]:
def benchmark(model, train_features=None, train_labels=None, eval_features=None):
    """Fit ``model`` and return its predictions for the evaluation set.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    train_features, train_labels, eval_features : optional
        Data to fit on and to predict for. Each one falls back to the
        notebook-level ``X_train`` / ``labels`` / ``X_test`` when omitted, so
        the existing ``benchmark(model)`` calls keep working unchanged.

    Returns
    -------
    Whatever ``model.predict`` returns for the evaluation features.
    """
    # The globals are only looked up when the matching argument is missing,
    # so the function can also be used without any notebook state.
    fit_X = X_train if train_features is None else train_features
    fit_y = labels if train_labels is None else train_labels
    eval_X = X_test if eval_features is None else eval_features
    model.fit(fit_X, fit_y)
    prediction = model.predict(eval_X)
    return prediction

# Switch between the two run modes:
#   validation = True  -> predict the held-out validation rows (to score them)
#   validation = False -> predict the real test set (to save the labels)
validation = False
X_test = X_validation if validation else X_test_f

# Fixed seed for every estimator that accepts one, so repeated runs of this
# cell produce the same predictions (and therefore the same vote).
RANDOM_STATE = 42

# One entry per voter, in the order their predictions are stored in `results`.
classifiers = [
    RidgeClassifier(tol=1e-2, solver="sag", random_state=RANDOM_STATE),
    Perceptron(max_iter=50, random_state=RANDOM_STATE),
    PassiveAggressiveClassifier(max_iter=50, random_state=RANDOM_STATE),
    KNeighborsClassifier(n_neighbors=10),
    RandomForestClassifier(random_state=RANDOM_STATE),
]

for penalty in ["l2", "l1"]:
    # Liblinear model; dual=False is what allows the "l1" penalty here
    classifiers.append(LinearSVC(penalty=penalty, dual=False, tol=1e-3,
                                 random_state=RANDOM_STATE))
    # SGD model with the same penalty
    classifiers.append(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty,
                                     random_state=RANDOM_STATE))

classifiers += [
    # SGD with Elastic Net penalty
    SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet",
                  random_state=RANDOM_STATE),
    # NearestCentroid without threshold
    NearestCentroid(),
    # Multinomial, Bernoulli and complement Naive Bayes classifiers
    MultinomialNB(alpha=.01),
    BernoulliNB(alpha=.01),
    ComplementNB(alpha=.1),
    # LinearSVC with L1-based feature selection
    Pipeline([
        ('feature_selection', SelectFromModel(
            LinearSVC(penalty="l1", dual=False, tol=1e-3,
                      random_state=RANDOM_STATE))),
        ('classification', LinearSVC(penalty="l2", random_state=RANDOM_STATE)),
    ]),
]

# Fit every classifier and collect one prediction array per classifier.
results = [benchmark(clf) for clf in classifiers]


  '"sag" solver requires many iterations to fit '


Find the most probable class of each sample by majority vote over the predictions of all classifiers.

In [48]:
# Transpose `results` (one prediction array per classifier) into one list of
# votes per sample. zip(*...) also copes with an empty `results` list.
new_results = [list(votes) for votes in zip(*results)]

# Majority vote per sample.
# itertools.groupby only groups *consecutive* equal items, so on unsorted
# votes it measures the longest run rather than the most frequent label.
# Counter counts every occurrence; on a tie, most_common() keeps the label
# that was encountered first.
labels_prediction = []
for votes in new_results:
    winner, _ = Counter(votes).most_common(1)[0]
    labels_prediction.append(winner)


Show the balanced accuracy (validation mode) or save the predicted labels to a CSV file (test mode).

In [51]:
if not validation:
    # Test mode: write one (id, label) row per test document to result_path.
    submission = pd.DataFrame({'id': id_docs_test, 'label': labels_prediction})
    submission.to_csv(result_path, index=False)
    print("done")
else:
    # Validation mode: score the voted labels against the held-out labels.
    print(balanced_accuracy_score(labels_validation, labels_prediction))

done
