In [1]:
import os
import glob
import pathlib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

## Extracting files


In [2]:
def extract_files(folder_path):
    """Read every ``*.csv`` file directly inside ``folder_path``.

    Files are read in sorted path order so the returned list (and anything
    built from it, such as a seeded train/test split) is the same on every
    run; ``glob.glob`` on its own returns matches in arbitrary order.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to search. Sub-directories are not searched.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file; an empty list when nothing matches.
    """
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    return [pd.read_csv(csv_file) for csv_file in csv_files]


# Load every CSV found in the "Topic_modelling" folder (path is relative to the working directory).
extracted = extract_files("Topic_modelling")

In [3]:
# Stack all per-file frames with a single concat call; growing a frame inside
# a loop re-copies the accumulated rows on every iteration. Each file's own
# row labels are kept (no ignore_index), exactly as the loop did.
# pd.concat raises on an empty list, so fall back to an empty frame.
alldata = pd.concat(extracted) if extracted else pd.DataFrame()

## Feature extraction

In [4]:
# Preview the first rows of the combined frame.
alldata.head()

Unnamed: 0,headlines,description,content,url,category
0,Nirmala Sitharaman to equal Morarji Desai’s re...,With the presentation of the interim budget on...,"Sitharaman, the first full-time woman finance ...",https://indianexpress.com/article/business/bud...,business
1,"‘Will densify network, want to be at least no....","'In terms of market share, we aim to double it...",The merger of Tata group’s budget airlines Air...,https://indianexpress.com/article/business/avi...,business
2,Air India group to induct an aircraft every si...,Air India currently has 117 operational aircra...,The Air India group plans to induct one aircra...,https://indianexpress.com/article/business/avi...,business
3,Red Sea woes: Exporters seek increased credit ...,Rising attacks forced shippers to consider the...,Indian exporters have asked the central govern...,https://indianexpress.com/article/business/red...,business
4,Air India group to induct a plane every 6 days...,"Apart from fleet expansion, 2024 will also see...",The Air India group plans to induct one aircra...,https://indianexpress.com/article/business/avi...,business


In [5]:
# Model input is the headline text; the label to predict is the category column.
feature = alldata["headlines"]
targets = alldata["category"]

In [6]:
# Show the headline Series (pandas truncates the middle rows when printing).
print(feature)

0       Nirmala Sitharaman to equal Morarji Desai’s re...
1       ‘Will densify network, want to be at least no....
2       Air India group to induct an aircraft every si...
3       Red Sea woes: Exporters seek increased credit ...
4       Air India group to induct a plane every 6 days...
                              ...                        
1995    Vivaldi’s privacy and customisability-focused ...
1996    From Meta Quest 3 to Ray Ban Smart Glasses, he...
1997    Samsung Galaxy S24 rumoured to launch on Janua...
1998    ‘We continuously iterate… building prototypes,...
1999    ChatGPT users can now browse internet, OpenAI ...
Name: headlines, Length: 10000, dtype: object


### Using TfidfVectorizer

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
# TfidfVectorizer lives in sklearn.feature_extraction.text. There is no
# `sklearn.preprocessing.text` module, so the previous import raised and the
# cell stopped before `import pickle` and `cv = ...` below could run.
from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

# Shared 10-fold splitter; shuffled with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

**Preparing TF-IDF vectors for the OneVsRest model**

In [8]:
# Hold out 30% of the headlines for testing; the seed keeps the split repeatable.
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(
    feature,
    targets,
    test_size=0.3,
    random_state=42,
)

# Learn the vocabulary and IDF weights from the training headlines only, then
# reuse the fitted vectorizer to project the test headlines into the same space.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
x_train_tfid = tfidf_vectorizer.fit_transform(x_train_tf)
x_test_tfid = tfidf_vectorizer.transform(x_test_tf)

NameError: name 'train_test_split' is not defined

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the category labels into integer codes. The encoder is fitted on the
# training labels and then applied unchanged to both splits.
le = LabelEncoder()
le.fit(y_train_tf)
y_train_tf = le.transform(y_train_tf)
y_test_tf = le.transform(y_test_tf)

In [None]:
# Persist the fitted vectorizer. The context manager closes (and flushes) the
# file even if pickling raises; a bare open() left the handle dangling.
with open('transformer_model.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [None]:
# Decision tree with default hyperparameters; passed to AdaBoostClassifier as its base estimator.
tree = DecisionTreeClassifier()

In [None]:
# Hyperparameter grid explored by the grid search: 3 x 3 = 9 combinations.
parameters = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.1, 0.01, 0.001],
)

# AdaBoost ensemble built on the decision tree defined above.
ada_clf = AdaBoostClassifier(
    estimator=tree,
    algorithm="SAMME",
    random_state=42,
)


In [None]:
# Exhaustive search over `parameters`, scored with the shared KFold splitter `cv`.
grid_search = GridSearchCV(
    estimator=ada_clf,
    param_grid=parameters,
    cv=cv,
)

In [None]:
# Run the grid search on the TF-IDF training matrix and the encoded training labels.
grid_search.fit(x_train_tfid, y_train_tf)

In [None]:
# Keep the AdaBoost configuration that scored best in the grid search.
ada_clf_tfid = grid_search.best_estimator_

In [None]:
# Wrap the tuned AdaBoost model in a one-vs-rest meta-classifier.
ovr = OneVsRestClassifier(estimator=ada_clf_tfid)

In [None]:
# Train the one-vs-rest model on the TF-IDF training matrix and encoded labels.
ovr.fit(x_train_tfid, y_train_tf)

In [None]:
# Hold-out predictions from the fitted one-vs-rest model.
ypreds_ovr = ovr.predict(x_test_tfid)

# cross_val_score is called without cv=, so it uses scikit-learn's default
# splitter rather than the `cv` KFold object.
scorestree = cross_val_score(ovr, x_train_tfid, y_train_tf)
mean_cv_score = np.round(np.mean(scorestree), 3)
print("Cross-validation scores Tree: ", mean_cv_score)

Cross-validation scores Tree:  0.818


In [None]:
# Persist the trained one-vs-rest model. The context manager closes (and
# flushes) the file even if pickling raises; a bare open() left the handle dangling.
with open("ovr_model.pkl", "wb") as model_file:
    pickle.dump(ovr, model_file)

In [None]:
# NOTE(review): ConfusionMatrixDisplay is imported but not used in the cells visible here.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
# Despite its name, cm_ovr holds the text classification report (per-class
# precision / recall / f1 / support), not a confusion matrix. Classes appear
# as the integer codes produced by the label encoder.
cm_ovr = classification_report(y_test_tf, ypreds_ovr)
print(cm_ovr)

              precision    recall  f1-score   support

           0       0.97      0.68      0.80       626
           1       0.96      0.86      0.91       591
           2       0.97      0.74      0.84       610
           3       0.88      0.84      0.86       584
           4       0.56      0.97      0.71       589

    accuracy                           0.81      3000
   macro avg       0.87      0.82      0.82      3000
weighted avg       0.87      0.81      0.82      3000

