In [None]:
import nltk

# Download only the NLTK data packages this notebook asks for. A bare
# nltk.download() opens the interactive downloader, which waits for user
# input and therefore stalls an unattended "Restart Kernel -> Run All".
for resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

In [None]:
import sys
from datasets import load_dataset
import csv
import nltk 
from nltk.tokenize import RegexpTokenizer
from collections import Counter 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import numpy as np 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost 
from sklearn.metrics  import classification_report
from sklearn import metrics
import time
import re
import pandas as pd

In [None]:
# Raise the csv module's per-field size limit as far as the platform allows
# (presumably needed because long articles overflow the default limit).
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    # Where a C long is 32 bits (e.g. Windows) sys.maxsize does not fit,
    # so fall back to the largest 32-bit signed value.
    csv.field_size_limit(2**31 - 1)

# Dataset card: https://huggingface.co/datasets/interpress_news_category_tr_lite
dataset = load_dataset("interpress_news_category_tr_lite")

print(dataset)

In [None]:
# Integer class id -> readable category name.
category_name = {0: "kültürsanat", 1: "ekonomi", 2: "siyaset", 3: "eğitim", 4: "dünya", 5: "spor", 6: "teknoloji", 7: "magazin", 8: "sağlık", 9: "gündem"}
train = dataset['train'][0]


def _texts_and_labels(split):
    """Collect the article texts and readable category names of one dataset split."""
    texts, labels = [], []
    for row in split:
        texts.append(row['content'])
        labels.append(category_name[row['category']])
    return texts, labels


train_x, train_y = _texts_and_labels(dataset['train'])
test_x, test_y = _texts_and_labels(dataset['test'])

# One frame holding the train rows followed by the test rows.
df = pd.DataFrame({
    'content': train_x + test_x,
    'category': train_y + test_y,
})

df.head()

**Text Preprocessing: Remove Stop Words**


In [None]:
# Text preprocessing
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language):
    """Return the NLTK stop words of `language` as a frozenset, loading them once."""
    # Loaded lazily so the corpus only has to exist when preprocess() first runs.
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def _turkish_lower(text):
    """Lowercase `text` following the Turkish dotted/dotless "i" rules."""
    # str.lower() turns "I" into "i" and "İ" into "i" + U+0307 (combining dot);
    # Turkish needs "I" -> "ı" and "İ" -> "i", otherwise capitalised stop words
    # such as "İçin" never match the stop-word list.
    return text.replace("İ", "i").replace("I", "ı").lower()


def preprocess(text):

    """
    Function: split text into words, drop Turkish stop words and lemmatize the rest
    Args:
      text(str): the article (any other object is converted with str())
    Return:
      lem(list of str): the remaining lower-cased, lemmatized words in their original order
    """

    # Normalize and tokenize: split() breaks on any whitespace run, so repeated
    # spaces, tabs and newlines never produce empty tokens.
    tokens = _turkish_lower(str(text)).split()

    # Remove stop words (set membership instead of scanning a list per token)
    stop = _stopword_set("turkish")

    # Lemmatization
    # NOTE(review): WordNetLemmatizer targets English, so for Turkish tokens it is
    # expected to be mostly a no-op -- consider a Turkish stemmer/lemmatizer.
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop]

    return lem

In [None]:
# NLTK data packages; preprocess() needs the stop-word corpus and WordNet.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Number of leading rows kept for the rest of the notebook
# (presumably to bound preprocessing and training time).
N_ARTICLES = 10000

# .copy() makes the subset an independent frame, so adding a column below does
# not raise pandas' SettingWithCopyWarning about writing to a slice.
df = df.iloc[:N_ARTICLES].copy()
df["Preprocessed_Text"] = df['content'].apply(preprocess)
df.head(10)

**Text Exploration**

In [None]:
# Find the common words in each category
def find_common_words(df, category, top_n=10):

    """
    Function: find the most frequent words in the category and return them
    Args:
      df(dataframe): the dataframe of articles; must have a "category" column and a
        "Preprocessed_Text" column holding a list of words per article
      category(str): the category name
      top_n(int): how many words to return (default 10)
    Return:
      list of (word, count) tuples for the most frequent words in the category,
      most frequent first; an empty list when no article has that category
    """

    # Token lists of the articles that belong to the requested category
    token_lists = df.loc[df["category"] == category, "Preprocessed_Text"]

    # Count words frequency across all of those articles
    words_counter = Counter()
    for tokens in token_lists:
        words_counter.update(tokens)

    return words_counter.most_common(top_n)

In [None]:
# Report the most frequent words for every category present in the frame.
print("Most common words in each category")
category = list(df['category'].unique())
for c in category:
    print(f"{c}  News")
    # The extra newline leaves a blank line between categories.
    print(find_common_words(df, c), end="\n\n")

In [None]:
# Re-join every token list into a single space-separated string.
df['Preprocessed_Text2'] = df['Preprocessed_Text'].map(' '.join)
df.head()

In [None]:
# Determine data (joined preprocessed text) and target (category name)
X, y = df['Preprocessed_Text2'], df['category']

**Feature Extraction**

In [None]:
# Split data: 80% train / 20% test. The fixed seed keeps the split, and
# therefore every metric reported below, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Use TF-IDF
tf_vec = TfidfVectorizer()
# Fit on the training split only and vectorize it in the same pass.
train_features = tf_vec.fit_transform(X_train)
# The test split is transformed with the already-fitted vectorizer.
test_features = tf_vec.transform(X_test)

**Modeling**

In [None]:
# Train and evaluate model
def fit_eval_model(model, train_features, y_train, test_features, y_test):

    """
    Function: train a machine learning classifier and evaluate it on the test data.
    Args:
      model: machine learning classifier exposing fit(features, labels) and predict(features)
      train_features: train data extracted features
      y_train: train data labels
      test_features: test data extracted features
      y_test: test data labels
    Return:
      results(dictionary): 'train_time' (seconds spent in model.fit) and
        'classification_report' (the report computed from the test predictions)
    """

    # Time only the fit call; perf_counter is the clock meant for measuring intervals
    start = time.perf_counter()
    model.fit(train_features, y_train)
    train_time = time.perf_counter() - start

    # Only the test predictions are reported, so the training set is not re-predicted
    test_predicted = model.predict(test_features)

    results = {
        'train_time': train_time,
        'classification_report': classification_report(y_test, test_predicted),
    }

    return results
    

In [None]:
# Initialize the models (seeded wherever the estimator accepts a seed, so runs are repeatable)
sv = svm.SVC()
ab = AdaBoostClassifier(random_state = 1)
gb = GradientBoostingClassifier(random_state = 1)
# NOTE(review): y_train holds string category names; recent xgboost releases appear
# to require integer-encoded class labels -- confirm against the installed version.
xgb = xgboost.XGBClassifier(random_state = 1)
tree = DecisionTreeClassifier(random_state = 1)
nb = MultinomialNB()


# Fit and evaluate models; results are stored per model as soon as each one
# finishes, keyed by the estimator's class name.
results = {}
for clf in [sv, ab, gb, xgb, tree, nb]:
    results[type(clf).__name__] = fit_eval_model(clf, train_features, y_train, test_features, y_test)

In [None]:
# Print classifiers results: model name, then each metric, then a separator
for res, model_results in results.items():
    print(res, end="\n\n")
    for i, value in model_results.items():
        print(i, ':')
        print(value, end="\n\n")
    print('-----', end="\n\n")

In [None]:
def classify_article(artcl, model=None):

    """
    Function: predict the category of a single raw article.
    Args:
      artcl(str): the raw article text
      model: fitted classifier exposing predict(); when omitted, the notebook's
        MultinomialNB model (`nb`) is used
    Return:
      category: the label the model predicts for the article
    """

    if model is None:
        # Looked up at call time so the default always follows the current `nb`.
        model = nb

    # Text preprocessing (same steps as applied to the training articles)
    cleaned = ' '.join(preprocess(artcl))

    # Use TF_IDF: the fitted vectorizer expects an iterable of documents
    features = tf_vec.transform([cleaned])

    # predict returns one label per document; there is exactly one document
    category = model.predict(features)[0]

    return category

In [None]:
# Spot-check classify_article on four hand-picked articles; the trailing comment on
# each call is the category the author expects (Turkish label, English in parentheses).
print(classify_article("UEFA, Tottenham'daki koronavirüs vakaları nedeniyle oynanmayan Tottenham-Rennes maçında kararını verdi. UEFA, Rennes'i 3-0 hükmen galip ilan etti, Tottenham Avrupa'dan elendi.")) # expected: spor (sports)
print(classify_article("TL mevduatları için yeni düzenleme! Bakan ayrıntıları açıkladı")) # expected: ekonomi (economy)
print(classify_article("Lerzan Mutlu'dan Demet Akalın'a sert sözler: Sabah 5’te dedikoduya uyanır")) # expected: magazin (celebrity news)
print(classify_article("Gepgeniş monitör! LG Electronics in. Kasım ayında duyurusunu yaptığı dünyanın İlk 21:9 görüntü oranına sahip ultra geniş monitörü 29EA93 Türkiye de pazara sunuldu. 2013 IF Tasarım Ödülünde birincilik unvanı elde eden ve Hollanda da yayınlanan Hardvvare Info nun gümüş ödülüne değer görülen ürün. çoklu işlemler için kullanıcılara ideal çözüm sunuyor. LG nın ultra geniş monitörü tüm çalışma ortamlarına uygun olma özelliği taşıyor. Büyük ekran niteliği birden fazla doküman ve pencerenin aynı anda görüntülenmesine olanak tanıyor. Ayrıca sütunlar bir kerede açılabiliyor veya iki sayfalı dokümanlar yan yana okunabiliyor. Ürünün 21:9 görüntü oranı sinema ekranlarına çok yakın olma özelliği taşıyor. Sinematik ölçüler filmlerin aynen istendiği şekilde izlenmesini sağlıyor. IPS teknolojisini arkasına alan monitör, LG nin Cinema Screen Desıgn özelliğini de ekleyince film izleme ve oyun oynama için ideal hale geliyor. Kullanıcılar ayrıca MHL bağlantısı aracılığıyla akıllı telefonlarındakı oyunları monitörde oynayabiliyor. Türkiye de satışa sunulan LG 29EA93 monitörün fiyatı 1599 TL olarak belirlendi. MH[{Id:2]29EA93, the first ultra-wide monitor of the world with a 21:9 image ratio which LG announced in November is now available in Turkish market. The product, which holds winning prize in 2013 IF Design Awards and silver prize by Hardware Info published in Netherhlands, offers ideal solutions for multiple tasks. LG s ultrawide monitor is convenient for all work places. Its wide screen makes it possible to view more than one document. Besides, two coloumns can be opened at once and 2-page documents can be viewed in one screen. LG s new ultra-wide screens are really convenient for graphic designers. Through its hardware calibration, it facilitates multiple-graphic tasks with deep colour intensity. This piece of news is also available in IT PRO.")) # expected: teknoloji (technology)
