In [1]:
import os
import glob
import pathlib
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import string
import nltk
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [2]:
# defining function that contains punctuation removal
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

## Extracting files


In [3]:
def extract_files(folder_path):
    csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
    dataframes = []

    for file in csv_files:
        df = pd.read_csv(file)
        dataframes.append(df)
    return dataframes


extracted = extract_files("Topic_modelling")

In [13]:
alldata = pd.DataFrame()
for listitems in extracted:
    alldata = pd.concat([alldata, listitems])

## Feature extraction

In [14]:
alldata.head()

Unnamed: 0,headlines,description,content,url,category
0,Nirmala Sitharaman to equal Morarji Desai’s re...,With the presentation of the interim budget on...,"Sitharaman, the first full-time woman finance ...",https://indianexpress.com/article/business/bud...,business
1,"‘Will densify network, want to be at least no....","'In terms of market share, we aim to double it...",The merger of Tata group’s budget airlines Air...,https://indianexpress.com/article/business/avi...,business
2,Air India group to induct an aircraft every si...,Air India currently has 117 operational aircra...,The Air India group plans to induct one aircra...,https://indianexpress.com/article/business/avi...,business
3,Red Sea woes: Exporters seek increased credit ...,Rising attacks forced shippers to consider the...,Indian exporters have asked the central govern...,https://indianexpress.com/article/business/red...,business
4,Air India group to induct a plane every 6 days...,"Apart from fleet expansion, 2024 will also see...",The Air India group plans to induct one aircra...,https://indianexpress.com/article/business/avi...,business


In [15]:
targets = alldata["category"]
feature = alldata["headlines"]

Remove punctuation, numbers and stopwords from the text. Then, apply stemming to the words.

In [16]:
feature = feature.apply(lambda x: remove_punctuation(x))
feature = feature.apply(lambda x: x.lower())

In [8]:
feature

0       nirmala sitharaman to equal morarji desai’s re...
1       ‘will densify network want to be at least no 2...
2       air india group to induct an aircraft every si...
3       red sea woes exporters seek increased credit a...
4       air india group to induct a plane every 6 days...
                              ...                        
1995    vivaldi’s privacy and customisabilityfocused b...
1996    from meta quest 3 to ray ban smart glasses her...
1997    samsung galaxy s24 rumoured to launch on janua...
1998    ‘we continuously iterate… building prototypes ...
1999    chatgpt users can now browse internet openai says
Name: headlines, Length: 10000, dtype: object

In [80]:
# Tokenization
import re
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

feature = feature.apply(lambda x: tokenize(x))

In [9]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

feature = sent_tokenize(feature)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abiro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: expected string or bytes-like object, got 'Series'

In [17]:
feature.head()

0    nirmala sitharaman to equal morarji desai’s re...
1    ‘will densify network want to be at least no 2...
2    air india group to induct an aircraft every si...
3    red sea woes exporters seek increased credit a...
4    air india group to induct a plane every 6 days...
Name: headlines, dtype: object

In [11]:
# Removing stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopwords]
    return text

feature = feature.apply(lambda x: remove_stopwords(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abiro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
feature.head()

0    [n, r, l,  , h, r, n,  ,  , e, q, u, l,  , r, ...
1    [‘, w, l, l,  , e, n, f,  , n, e, w, r, k,  , ...
2    [r,  , n,  , g, r, u, p,  ,  , n, u, c,  , n, ...
3    [r, e,  , e,  , w, e,  , e, x, p, r, e, r,  , ...
4    [r,  , n,  , g, r, u, p,  ,  , n, u, c,  ,  , ...
Name: headlines, dtype: object

In [18]:
# Lemmatization
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

feature = feature.apply(lambda x: lemmatizing(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abiro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
print(feature)

0       [n, i, r, m, a, l, a,  , s, i, t, h, a, r, a, ...
1       [‘, w, i, l, l,  , d, e, n, s, i, f, y,  , n, ...
2       [a, i, r,  , i, n, d, i, a,  , g, r, o, u, p, ...
3       [r, e, d,  , s, e, a,  , w, o, e, s,  , e, x, ...
4       [a, i, r,  , i, n, d, i, a,  , g, r, o, u, p, ...
                              ...                        
1995    [v, i, v, a, l, d, i, ’, s,  , p, r, i, v, a, ...
1996    [f, r, o, m,  , m, e, t, a,  , q, u, e, s, t, ...
1997    [s, a, m, s, u, n, g,  , g, a, l, a, x, y,  , ...
1998    [‘, w, e,  , c, o, n, t, i, n, u, o, u, s, l, ...
1999    [c, h, a, t, g, p, t,  , u, s, e, r, s,  , c, ...
Name: headlines, Length: 10000, dtype: object


In [21]:
feature

0       [n, i, r, m, a, l, a,  , s, i, t, h, a, r, a, ...
1       [‘, w, i, l, l,  , d, e, n, s, i, f, y,  , n, ...
2       [a, i, r,  , i, n, d, i, a,  , g, r, o, u, p, ...
3       [r, e, d,  , s, e, a,  , w, o, e, s,  , e, x, ...
4       [a, i, r,  , i, n, d, i, a,  , g, r, o, u, p, ...
                              ...                        
1995    [v, i, v, a, l, d, i, ’, s,  , p, r, i, v, a, ...
1996    [f, r, o, m,  , m, e, t, a,  , q, u, e, s, t, ...
1997    [s, a, m, s, u, n, g,  , g, a, l, a, x, y,  , ...
1998    [‘, w, e,  , c, o, n, t, i, n, u, o, u, s, l, ...
1999    [c, h, a, t, g, p, t,  , u, s, e, r, s,  , c, ...
Name: headlines, Length: 10000, dtype: object

### Using TfIDvectorizer

In [28]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier

import pickle

cv = KFold(n_splits=10, shuffle=True, random_state=42)

**Preparing TFID vectors for OneVsRest Model**

In [85]:
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(feature, targets, test_size = 0.3, random_state=42)

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(x_train_tf)
x_train_tfid = tfidf_vectorizer.transform(x_train_tf)
x_test_tfid = tfidf_vectorizer.transform(x_test_tf)

AttributeError: 'list' object has no attribute 'lower'

In [9]:
joblib.dump(tfidf_vectorizer, 'transformer_model.joblib')

['transformer_model.joblib']

In [10]:
tree = DecisionTreeClassifier()

In [11]:
parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.01, 0.001]
    }

ada_clf = AdaBoostClassifier(estimator=tree, algorithm="SAMME", random_state=42)


In [12]:
grid_search = GridSearchCV(ada_clf, param_grid=parameters, cv=cv)

In [13]:
grid_search.fit(x_train_tfid, y_train_tf)

In [14]:
ada_clf_tfid = grid_search.best_estimator_

In [15]:
ovr = OneVsRestClassifier(ada_clf_tfid)

In [16]:
ovr.fit(x_train_tfid, y_train_tf)

In [17]:
ypreds_ovr = ovr.predict(x_test_tfid)

scorestree = cross_val_score(ovr, x_train_tfid, y_train_tf)
print("Cross-validation scores Tree: ", np.round(np.mean(scorestree),3))

Cross-validation scores Tree:  0.818


In [18]:
joblib.dump(ovr, 'ovr_model.joblib')

['ovr_model.joblib']

In [19]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
cm_ovr = classification_report(y_test_tf, ypreds_ovr)
print(cm_ovr)

               precision    recall  f1-score   support

     business       0.97      0.68      0.80       626
    education       0.96      0.86      0.91       591
entertainment       0.97      0.74      0.84       610
       sports       0.88      0.84      0.86       584
   technology       0.56      0.97      0.71       589

     accuracy                           0.81      3000
    macro avg       0.87      0.82      0.82      3000
 weighted avg       0.87      0.81      0.82      3000

