In [21]:
from estnltk.text import Text
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

Imports data made by title_lemmas.py, which has converted all words in the title to their lemma form. Divides articles into two categories, articles with less than 1000 reads and articles with more than 1000 reads. This split was chosen, because this divides the articles into half minimizing class imbalance.

In [23]:
data = pd.read_csv("../data/postimees_lemma_title.txt", sep="\t", names=['id', 'datetime', 'title', 'share_count', 'comment_count', 'read_count', 'author'])
data = data[1:]
data = data.drop_duplicates(subset=['id'])
data = data.drop(columns=['id', 'datetime', 'author'])

data['comment_count'] = pd.to_numeric(data['comment_count'],errors='coerce')
data['read_count'] = pd.to_numeric(data['read_count'],errors='coerce')
data['share_count'] = pd.to_numeric(data['share_count'],errors='coerce')

def divide(row):
    if row.read_count < 1000:
        return 0
    else:
        return 1

data['category'] = data.apply(lambda i: divide(i), axis=1)

data.head()

Unnamed: 0,id,datetime,title,share_count,comment_count,read_count,author,category
0,6839008,2019-11-30T13:50:49+02:00,Tänavatantsijad tõid Leedu võistluselt Tartuss...,0.0,0,14,{'Lenel Karu'},0
1,6838978,2019-11-30T12:49:30+02:00,Tartu ärinõuandla andis välja Hea Tegu auhinnad,0.0,0,22,{'Tartu Postimees'},0
2,6838938,2019-11-30T11:08:41+02:00,Jalgrattur pööras sõiduautole ette ja sai viga,0.0,0,60,{'Tartu Postimees'},0
3,6838937,2019-11-30T11:04:54+02:00,Vales kohas vasakpööret sooritanud autojuht põ...,1.0,0,121,{'Tartu Postimees'},0
4,6838618,2019-11-29T19:04:38+02:00,Galerii: Toomas Asser sai ülikooli muuseumi pü...,73.0,0,1425,{'Tartu Postimees'},1


Sizes of categories

In [24]:
print(data[data['read_count'] < 1000].shape)
print(data[data['read_count'] >= 1000].shape)

(111458, 8)
(102563, 8)


In [25]:
titles = data.title.tolist()
titles = [str(i) for i in titles]

The text in the titles is turned into numerical feature vectors of length 1 to 3 (1-grams to 3-grams) and assigned a frequency.

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1,4))
#count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(titles)

Occurences of words are turned to frequencies of the word in title.

In [40]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [41]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

Data is divided into test and train and LinearSVC is used for the model.

In [42]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVR
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix


X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, data.category, test_size=0.3)

#clf = SGDRegressor(max_iter=10000).fit(X_train, y_train)
#clf = SGDClassifier().fit(X_train, y_train)
clf = LinearSVC()
#clf = LinearSVR().fit(X_train, y_train)
#clf = MultinomialNB(alpha=0.8).fit(X_train, y_train)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)
print(confusion)
print("Classification accuracy is: ", (confusion[0][0] + confusion[1][1]) / np.sum(confusion))

[[25471  7858]
 [ 9772 21106]]
Classification accuracy is:  0.7254193468001932


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

#mean_squared_error(y_test, y_pred)
accuracy_score(y_test, y_pred)
#roc_auc_score(y_test, y_pred)

Test it yourself by inserting Postimees titles into the list

In [None]:
titles = ['Rakverest pärit teatrimehed pälvisid stipendiumi', 
         'Erootilised jõulud - kuidas oma suhe pühade ajal uuele tasemele viia',
         'Vanamehe lapšelapš Priidik hakkas räppariks ja avaldas nubluga hiti']

lemma_titles = []
for title in titles:
    text = Text(str(title)).analyse('morphology')
    lemmas = text.morph_analysis.lemma
    new = ""
    for s in lemmas:
        new += str(s[0]) + " "
    lemma_titles.append(new)

X_new_counts = count_vect.transform(lemma_titles)
X_new = tfidf_transformer.transform(X_new_counts)

y_pred = clf.predict(X_new)

pd.DataFrame({'title': titles, 'predicted_category': y_pred})