In [1]:
!pip install xlrd==1.2.0



# Importing the Libraries

In [2]:
import re
import os
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

from urduhack.preprocessing import *
from urduhack.tokenization import *
from urduhack.normalization import *

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

# Reading the train data from all the csv files in the dataset folder and merging into a single dataframe 

In [3]:
train_data = pd.read_csv("urdu-news-dataset-1M.csv", usecols = ['Headline', "Category"])

In [4]:
train_data.head()

Unnamed: 0,Headline,Category
0,عالمی بینک عسکریت پسندی سے متاثرہ خاندانوں کی ...,Business & Economics
1,مالی سال 2020 ریٹرن فائل کرنے والوں کی تعداد م...,Business & Economics
2,جاپان کو سندھ کے خصوصی اقتصادی زون میں سرمایہ ...,Business & Economics
3,برامدات 767 فیصد بڑھ کر ارب 16 کروڑ ڈالر سے زائد,Business & Economics
4,کے الیکٹرک کو اضافی بجلی گیس کی فراہمی کے قانو...,Business & Economics


In [5]:
train_data["Category"].value_counts()

Sports                                   454335
Entertainment                            253730
Business & Economics                     251169
Science & Technology                      79105
https://www.dawnnews.tv/news/1134162/         1
Name: Category, dtype: int64

In [6]:
train_data = train_data[(train_data["Category"] == "Sports") | (train_data["Category"] == "Entertainment") | (train_data["Category"] == "Business & Economics") | (train_data["Category"] == "Science & Technology")]

In [7]:
train_data["Category"].value_counts()

Sports                  454335
Entertainment           253730
Business & Economics    251169
Science & Technology     79105
Name: Category, dtype: int64

In [8]:
X = train_data["Headline"]
y = train_data["Category"]

from sklearn.model_selection import train_test_split
X_train, X_test , y_train, y_test = train_test_split(X,y, test_size= 0.2)

In [9]:
print(X_train.shape)
print(X_test.shape)

(830671,)
(207668,)


# Encoding the labels of the dataset into a numric form

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [11]:
np.unique(y_test)

array([0, 1, 2, 3])

# Creating and applying the cleaning function to clean the text of the while dataset

In [12]:
stopwords_list = ["اب","ابھی","اپنا","اپنے","اپنی","اٹھا","اس","اسے","اسی","اگر","ان","انہوں","انہی","انہیں","انھیں","او","اور","اے","ایسا","ایسے","ایسی","ایک","آ","آتا","آتے","آتی","آگے","آنا","آنے","آنی","آئے","آئی","آئیں","آیا","با","بڑا","بڑے","بڑی","بعد","بعض","بلکہ","بہت","بھی","بے","پاس","پر","پہلے","پھر","تا","تاکہ","تب","تجھ","تجھے","تک","تم","تمام","تمہارا","تمہارے","تمھارے","تمہاری","تمہیں","تمھیں","تھا","تھے","تھی","تھیں","تو","تیری","تیرے","جا","جاتا","جاتی","جاتے","جاتی","جانے","جانی","جاؤ","جائے","جائیں","جب","جس","جن","جنہوں","جنہیں","جو","جیسا","جیسے","جیسی","جیسوں","چاہیئے","چلا","چاہے","چونکہ","حالاں","حالانکہ","دو","دونوں","دوں","دے","دی","دیا","دیں","دیے","دیتا","دیتے","دیتی","دینا","دینے","دینی","دیئے","ڈالا","ڈالنا","ڈالنے","ڈالنی","ڈالے","ڈالی","ذرا","رکھا","رکھتا","رکھتے","رکھتی","رکھنا","رکھنے","رکھنی","رکھے","رکھی","رہ","رہا","رہتا","رہتے","رہتی","رہنا","رہنے","رہنی","رہو","رہے","رہی","رہیں","زیادہ","سا","سامنے","سب","سکتا","سو","سے","سی","شاید","صرف","طرح","طرف","عین","کا","کبھی","کچھ","کہہ","کر","کرتا","کرتے","کرتی","کرنا","کرنے","کرو","کروں","کرے","کریں","کس","کسے","کسی","کہ","کہا","کہے","کو","کون","کوئی","کے","کی","کیا","کیسے","کیوں","کیونکہ","کیے","کئے","گا","گویا","گے","گی","گیا","گئے","گئی","لا","لاتا","لاتے","ل","اتی","لانا","لانے","لانی","لایا","لائے","لائی","لگا","لگے","لگی","لگیں","لو","لے","لی","لیا","لیتا","لیتے","لیتی","لیکن","لیں","لیے","لئے","مجھ","مجھے","مگر","میرا","میرے","میری","میں","نا","نہ","نہایت","نہیں","نے","ہاں","ہر","ہم","ہمارا","ہمارے","ہماری","ہو","ہوا","ہوتا","ہوتے","ہوتی","ہوتیں","ہوں","ہونا","ہونگے","ہونے","ہونی","ہوئے","ہوئی","ہوئیں","ہے","ہی","ہیں","و","والا","والوں","والے","والی","وہ","وہاں","وہی","وہیں","یا","یعنی","یہ","یہاں","یہی","یہیں"]

In [13]:
def clean_text(text):
    text = normalize_whitespace(text)
    text = remove_punctuation(text)
    text = remove_accents(text)
    text = replace_urls(text)
    text = replace_emails(text)
    text = replace_currency_symbols(text)
    text = normalize_characters(text)
    text = normalize_combine_characters(text)
    text = english_characters_space(text)
    text = digits_space(text)
    text = text.lower()
    text = normalize(text)
    words = text.split()
    words = [word for word in words if not word in stopwords_list]
    text = " ".join(words)
    return text

In [15]:
X_train_map = map(clean_text, list(X_train.values))
X_test_map = map(clean_text, list(X_test.values))

In [16]:
X_train = list(X_train_map)
X_test = list(X_test_map)

# Vectorizing the dataset 

In [17]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

import pickle

filename = 'Vectorizer_News.sav'
pickle.dump(vectorizer, open(filename, 'wb'))

# Applying the random forest Classifier

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [19]:
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [20]:
y_pred = rfc.predict(X_test)

In [21]:
accuracy_score(y_test,y_pred)

0.8481

In [22]:
print(y_pred[:30])
print(y_test[:30])

[3 0 1 0 3 3 0 3 0 3 3 1 3 3 0 3 0 0 0 1 0 1 1 3 1 3 1 1 3 0]
[3 2 1 0 3 3 2 0 0 3 3 1 3 2 0 3 3 0 0 1 0 0 1 3 2 3 1 1 3 3]


In [23]:
import pickle

filename = 'RF_Model.sav'
pickle.dump(rfc, open(filename, 'wb'))

# Applying the SVC Classifier

In [24]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

SVC()

In [25]:
y_pred = svc.predict(X_test)

In [26]:
accuracy_score(y_test,y_pred)

0.8844

In [27]:
import pickle

filename = 'SVC_Model.sav'
pickle.dump(svc, open(filename, 'wb'))

# Applying the SGDC

In [28]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
SGDClassifier()
y_pred = sgd.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))

print(y_pred[:10])
print(y_test[:10])

import pickle

filename = 'SGD_Model.sav'
pickle.dump(sgd, open(filename, 'wb'))

Accuracy 0.8954
[3 0 1 0 3 3 0 0 0 3]
[3 2 1 0 3 3 2 0 0 3]


# Applying the decision tree classifier

In [29]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [30]:
y_pred = dtc.predict(X_test)
accuracy_score(y_test,y_pred)

0.8015

In [31]:
import pickle

filename = 'Decision_tree_Model.sav'
pickle.dump(dtc, open(filename, 'wb'))

# Testing the best model over a single text file

In [32]:
# Test over a single text file
text = "آسان کام ہوگا"
text = clean_text(text)
text = vectorizer.transform([text])
label = sgd.predict(text)

In [33]:
label

array([1])

In [34]:
le.classes_

array(['Business & Economics', 'Entertainment', 'Science & Technology',
       'Sports'], dtype=object)