In [1]:
import pandas as pd

In [2]:
# Read the raw news dataset from a CSV file given by a relative path.
DATA_PATH = 'turkish_data.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first two rows of the raw data.
df.head(2)

Unnamed: 0,category,text
0,siyaset,3 milyon ile ön seçim vaadi mhp nin 10 olağan...
1,siyaset,mesut_yılmaz yüce_divan da ceza alabilirdi pr...


In [4]:
# Report how many rows the raw dataset contains.
record_count = len(df)
print(f"Total number of records present in dataset: {record_count}")

Total number of records present in dataset: 4900


This dataset consists of news articles in different categories, such as 'dunya' (world), 'siyaset' (politics), 'ekonomi' (economy), etc.

In [5]:
# List the distinct labels found in the 'category' column.
print("Total different category in Datatset:")
category_labels = df['category'].unique()
print(category_labels)

Total different category in Datatset:
['siyaset ' 'dunya ' 'ekonomi ' 'kultur ' 'saglik ' 'spor ' 'teknoloji ']


In [6]:
# Uncomment the line below to install ktrain
#!pip install ktrain

In [7]:
# Import required packages
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import re

import nltk
nltk.download('words')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
stemmer = TurkishStemmer()


import ktrain
from ktrain import text
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package words to /home/ubuntu/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
def dataset_cleaning(df_data):
    """Return a copy of ``df_data`` without incomplete or repeated rows.

    Rows containing any missing value are dropped first, then exact duplicate
    rows are removed, and finally the index is renumbered from zero. The
    frame passed in is left untouched.
    """
    return (
        df_data
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )

def _turkish_stopwords():
    """Return the NLTK Turkish stop-word set, reading the corpus only on first use."""
    cached = getattr(_turkish_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("turkish"))
        _turkish_stopwords._cache = cached
    return cached


def text_cleaning(text):
    """Normalise one Turkish document for modelling.

    Steps, in order:
      1. lower-case the text using Turkish casing rules,
      2. remove links (anything starting with ``http`` up to the next whitespace),
      3. replace every character that is not a lower-case Turkish letter with a space
         (this drops digits, punctuation, underscores and the letters q, w and x),
      4. drop stop words and one-character tokens, and stem what remains.

    Args:
        text: the raw document as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces (an empty string if nothing survives).
    """
    stop = _turkish_stopwords()

    # Python's str.lower() is not locale aware: it maps "I" to "i" (Turkish needs "ı")
    # and "İ" to "i" followed by a combining dot (U+0307). Map both capitals by hand,
    # then remove any combining dot already attached to an "i" in the input.
    text = text.replace("İ", "i").replace("I", "ı").lower()
    text = text.replace("i\u0307", "i")

    # Links must be removed BEFORE the character filter below: once ':', '/' and '.'
    # have been turned into spaces the URL pattern can no longer match the address.
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub('[^abcçdefgğhıijklmnoöprsştuüvyz]', ' ', text)

    # Stop words are checked on the surface form, i.e. before stemming.
    return " ".join(
        stemmer.stem(word)
        for word in text.split()
        if word not in stop and len(word) > 1
    )

In [9]:
# Remove rows with missing values and exact duplicate rows from the raw data.
dataset = dataset_cleaning(df)

In [10]:
# (rows, columns) remaining after cleaning.
dataset.shape

(4540, 2)

In [11]:
# Fine-tune on a 25 percent subsample of the cleaned data to keep training time short.
# The sample is drawn from a freshly cleaned copy of `df` with a fixed seed, so that
# re-running this cell (or the whole notebook) always yields the same rows rather than
# a new random subset, or 25 percent of an already-shrunk `dataset`.
SAMPLE_FRACTION = 0.25
SAMPLE_SEED = 42
dataset = dataset_cleaning(df).sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

In [12]:
# Store the normalised form of every article in a new column, then preview the result.
dataset['cleaned_text'] = dataset['text'].apply(text_cleaning)
dataset.head(2)

Unnamed: 0,category,text,cleaned_text
1295,dunya,türkiye_nato dan patriot talebinde bulunmadı ...,türki nato dan patriot talep bulun nato genel ...
1627,ekonomi,internet bankacılığı kullananlar dikkat ! ist...,internet bankacılık kullanan dikkat istanbul b...


# Split Dataset into Training and Test

In [13]:
# Hold out 20% of the sampled articles; stratifying on the label keeps the category
# proportions the same in both partitions, and the fixed seed makes the split repeatable.
features = np.array(dataset['cleaned_text'])
labels = np.array(dataset['category'])
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
    stratify=dataset['category'],
)

# Model Tuning

Electra model ref: https://huggingface.co/dbmdz/electra-base-turkish-cased-discriminator
I am going to use ELECTRA instead of BERT because it has been reported to perform better, with higher GLUE scores, in several studies. ELECTRA also performs better when computing resources are limited, and it takes less training time than BERT.

In [14]:

# Hugging Face checkpoint that is fine-tuned below.
MODEL_NAME = 'dbmdz/electra-base-turkish-cased-discriminator'
category_names = dataset['category'].unique()

# ktrain preprocessor for the checkpoint (maxlen=500).
t = text.Transformer(MODEL_NAME, maxlen=500, classes=category_names)

# Encode both partitions; the held-out split is used as validation data.
train = t.preprocess_train(X_train, y_train)
val = t.preprocess_test(X_test, y_test)

# Build the classifier and wrap it in a learner.
# The variable name `leaner` is kept as-is because later cells refer to it.
model = t.get_classifier()
leaner = ktrain.get_learner(model, train_data=train, val_data=val, batch_size=20)

preprocessing train...
language: tr
train sequence lengths:
	mean : 233
	95percentile : 572
	99percentile : 848


Is Multi-Label? False
preprocessing test...
language: tr
test sequence lengths:
	mean : 257
	95percentile : 633
	99percentile : 1009


In [15]:
#leaner.lr_find()  # optional: run this to find a good learning rate

In [16]:
# Train with the one-cycle policy: peak learning rate 5e-5 for three epochs.
MAX_LR = 5e-5
EPOCHS = 3
leaner.fit_onecycle(MAX_LR, EPOCHS)



begin training using onecycle policy with max lr of 5e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f0b11551290>

In [22]:
#Saving fine-tuned model

In [19]:
# Pair the trained model with its preprocessor and write the predictor to disk.
PREDICTOR_PATH = "./ktrain/electra_text_classifier"
predictor = ktrain.get_predictor(leaner.model, preproc=t)
predictor.save(PREDICTOR_PATH)

In [27]:
# Pick one article (position 10 of the sampled frame) together with its true label.
SAMPLE_POSITION = 10
sample_row = dataset.iloc[SAMPLE_POSITION]
news_text = sample_row['cleaned_text']
actual_category = sample_row['category']

In [32]:
# Show the ground-truth label of the sample article.
actual_message = f"Actual category is {actual_category}"
print(actual_message)

Actual category is spor 


In [31]:
# Classify the sample article with the in-memory predictor.
sample_text = dataset['cleaned_text'].iloc[10]
predicted_category = predictor.predict(sample_text)
print(f"predicted category is {predicted_category}")

predicted category is spor 


In [23]:
#Reload saved model for prediction

In [24]:
# Load the predictor back from disk and classify the same sample article with it.
reloaded_predictor = ktrain.load_predictor('./ktrain/electra_text_classifier')
sample_text = dataset['cleaned_text'].iloc[10]
reloaded_predictor.predict(sample_text)

'spor '

In [38]:
# The largest class probability is reported as the confidence for the sample article.
class_probabilities = reloaded_predictor.predict_proba(dataset['cleaned_text'].iloc[10])
print(f"confidence score: {np.max(class_probabilities)}")

confidence score: 0.8901882767677307
