In [1]:
import numpy as np
from TurkishStemmer import TurkishStemmer 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn.svm import LinearSVC
import time

In [2]:
# Tab-separated text/label files; the notebook fits on the train file and
# measures accuracy on the test file.
test_data_file_name = 'test_tweets.txt'
train_data_file_name = 'train_tweets.txt'
# Whitespace-separated stop-word list read by remove_stop_words().
stop_words_file_name = 'stop_words_tr_147.txt'

In [3]:
def get_data_class_pairs(file_name):
  """Read a tab-separated data file into a text -> class-label dictionary.

  Every line is expected to look like ``text<TAB>label``. The first field
  becomes the key and the second field the value; further fields are ignored.

  Args:
    file_name: path of the UTF-8 encoded file to read.

  Returns:
    dict mapping each text to its label, in file order. A text that occurs
    more than once keeps the label of its last occurrence.
  """
  data2class = dict()
  with open(file_name, 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of materialising all lines.
    for line in f:
      # Remove the line terminator so labels do not carry a trailing newline;
      # otherwise a last line without one would yield a different label string.
      fields = line.rstrip('\r\n').split('\t')
      if len(fields) < 2:
        # Blank or tab-less line: there is no text/label pair to record.
        continue
      data2class[fields[0]] = fields[1]
  return data2class

def get_corpus(docs):
  """Build a word -> column-index vocabulary from tokenised documents.

  Args:
    docs: iterable of documents, each an iterable of words.

  Returns:
    dict mapping every distinct word to an integer index. Indices start at 0
    and follow the order in which words are first encountered.
  """
  word2index = {}
  for doc in docs:
    for word in doc:
      # A new word gets the next free index; a known word keeps its index.
      word2index.setdefault(word, len(word2index))
  return word2index

def get_features(doc, corpus):
  """Turn one document's word counts into a feature list.

  Args:
    doc: dictionary mapping a word to its count in the document.
    corpus: iterable of vocabulary words (iterating a dict yields its keys).

  Returns:
    list with one entry per word in ``corpus``, in iteration order: the
    word's value from ``doc``, or 0 when the word is not in ``doc``.
  """
  return [doc[word] if word in doc else 0 for word in corpus]

def remove_stop_words(stop_words_file_name, words):
  """Filter out the words that appear in a stop-word file.

  Args:
    stop_words_file_name: path of a UTF-8 text file containing stop words
      separated by whitespace. The file content is lower-cased before use.
    words: iterable of words to filter. Words are compared as given (they
      are not lower-cased here).

  Returns:
    list of the words that are not stop words, in their original order.
  """
  with open(stop_words_file_name, 'r', encoding="utf-8") as myfile:
    # A set gives constant-time membership tests; split() with no argument
    # already discards leading/trailing whitespace.
    stop_words = set(myfile.read().lower().split())

  return [x for x in words if x not in stop_words]

def _stem_turkish_word(word):
  """Return the stem of a single word using one shared TurkishStemmer.

  The stemmer is created on the first call and stored on the function object,
  so it is not rebuilt for every word. An empty word is returned unchanged.
  """
  if len(word) == 0:
    return word
  stemmer = getattr(_stem_turkish_word, '_stemmer', None)
  if stemmer is None:
    stemmer = TurkishStemmer()
    _stem_turkish_word._stemmer = stemmer
  return stemmer.stem(word)


# Element-wise stemmer: takes an array-like of words and returns a numpy
# string array of their stems. otypes is given explicitly because np.vectorize
# otherwise infers the output type from the first element and raises
# ValueError when called on a size-0 input (e.g. an empty word list).
turkish_stemmer_vectorize = np.vectorize(_stem_turkish_word, otypes=[str])

def get_features_as_freq_dist(docs, corpus):
  """Build a document-by-word count matrix.

  Args:
    docs: sequence of documents, each an iterable of words.
    corpus: dictionary mapping a word to its column index.

  Returns:
    numpy array of shape (len(docs), len(corpus)), created with np.zeros,
    where entry [i, corpus[w]] is the number of times w occurs in docs[i].

  Raises:
    KeyError: if a document contains a word that is not in ``corpus``.
  """
  counts_matrix = np.zeros((len(docs), len(corpus)))
  for row, doc in enumerate(docs):
    # Tally the occurrences of each word in this document.
    word_counts = {}
    for word in doc:
      word_counts[word] = word_counts.get(word, 0) + 1
    # Write each distinct word's tally into its vocabulary column.
    for word, count in word_counts.items():
      counts_matrix[row, corpus[word]] = count

  return counts_matrix

def get_features_as_binary_freq_dist(docs, corpus):
  """Build a document-by-word presence matrix.

  Args:
    docs: sequence of documents, each an iterable of words.
    corpus: dictionary mapping a word to its column index.

  Returns:
    numpy array of shape (len(docs), len(corpus)), created with np.zeros,
    where entry [i, corpus[w]] is 1 if w occurs in docs[i] and 0 otherwise.

  Raises:
    KeyError: if a document contains a word that is not in ``corpus``.
  """
  presence = np.zeros((len(docs), len(corpus)))
  for row, doc in enumerate(docs):
    # Each distinct vocabulary column hit by this document is flagged once.
    for column in {corpus[word] for word in doc}:
      presence[row, column] = 1

  return presence

def get_cleaned_docs_from_file(file_name):
  """Load a labelled text file and return preprocessed documents and labels.

  Each text is split on whitespace, filtered with remove_stop_words() using
  the module-level ``stop_words_file_name``, and stemmed with
  turkish_stemmer_vectorize().

  Args:
    file_name: path of the data file, read with get_data_class_pairs().

  Returns:
    tuple ``(cleaned_docs, labels)``: ``cleaned_docs`` is a list with one
    numpy array of stemmed words per text, and ``labels`` is a numpy array of
    the corresponding class labels in the same order.
  """
  datas = get_data_class_pairs(file_name)
  cleaned_docs = []
  for text in datas:
    kept_words = remove_stop_words(stop_words_file_name, text.split())
    if kept_words:
      cleaned_docs.append(turkish_stemmer_vectorize(kept_words))
    else:
      # Guard: an np.vectorize wrapper created without explicit otypes raises
      # ValueError on a size-0 input, so a text with no remaining words
      # bypasses the stemmer and is stored as an empty string array.
      cleaned_docs.append(np.array([], dtype=str))

  return cleaned_docs, np.array(list(datas.values()))


In [4]:
%%time
# Load and preprocess the training set (cleaned_docs, y) and the test set
# (cleaned_docs2, y2).
cleaned_docs, y = get_cleaned_docs_from_file(train_data_file_name)
cleaned_docs2, y2 = get_cleaned_docs_from_file(test_data_file_name)
# The vocabulary is built from the training AND the test documents, so every
# word of either set has a feature column.
corpus = get_corpus(cleaned_docs + cleaned_docs2)

# Create the linear SVM classifier; it is fitted in the next cell.
clf = LinearSVC(random_state=0, tol=1e-5)


Wall time: 13.7 s


In [5]:
%%time
# Experiment 1: word-count features.
# Build the training matrix and fit the classifier on it.
features = get_features_as_freq_dist(cleaned_docs, corpus)
clf.fit(features, y)

# Build the test matrix with the same vocabulary and predict its labels.
features2 = get_features_as_freq_dist(cleaned_docs2, corpus)
predictions = clf.predict(features2)
# Accuracy: fraction of predictions equal to the test labels.
print(sum(predictions==y2) / len(predictions))

0.6527777777777778
Wall time: 19.1 s


In [6]:
%%time
# Experiment 2: binary word-presence features.
# Build the training matrix and fit a freshly created classifier on it.
bin_features = get_features_as_binary_freq_dist(cleaned_docs, corpus)
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(bin_features, y)

# Build the test matrix with the same vocabulary and predict its labels.
bin_features2 = get_features_as_binary_freq_dist(cleaned_docs2, corpus)
predictions = clf.predict(bin_features2)
# Accuracy: fraction of predictions equal to the test labels.
print(sum(predictions==y2) / len(predictions))

0.6516203703703703
Wall time: 18.3 s


In [7]:
%%time
# Experiment 3: the count features with the first 100 binary-presence columns
# appended (training set).
merged_features = np.concatenate((features,bin_features[:,:100]), axis=1)
# Rebind to 0 to drop these references to the large training matrices.
# After this, the cell cannot be re-run without re-running the cells that
# build `features` and `bin_features`.
features = 0
bin_features = 0
# Same column layout for the test set.
merged_features2 = np.concatenate((features2,bin_features2[:,:100]), axis=1)
# Rebind to 0 to drop these references to the large test matrices.
features2 = 0
bin_features2 = 0
# Refit the existing `clf` object on the merged features and report accuracy
# as the fraction of predictions equal to the test labels.
clf.fit(merged_features, y)
predictions = clf.predict(merged_features2)
print(sum(predictions==y2) / len(predictions))

0.6533564814814815
Wall time: 43.5 s


In [8]:
# Rebind the feature variables to 0 to drop any remaining references to the
# feature matrices (the merged-features cell performs the same rebinding).
bin_features = 0
bin_features2 = 0

features = 0
features2 = 0