In [1]:
import pandas as pd #Dataframe Manipulation library
import numpy as np #Data Manipulation library

#sklearn modules for Feature Extraction & Modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

#Libraries for Plotting 
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import joblib
import os
import glob

import re
import string

In [2]:
df = pd.read_json(os.path.join(os.getcwd(),'data','News_Category_Dataset_v3.json'), lines=True)
df = df.dropna()

In [3]:
df['text_combine'] = df['headline'] + " " + df['short_description']
df = df[['text_combine','category']]
df_train = df.iloc[:int(len(df)*0.8)]
df_test = df.iloc[int(len(df)*0.8):len(df)]

In [4]:
df_train.head()

Unnamed: 0,text_combine,category
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
3,The Funniest Tweets From Parents This Week (Se...,PARENTING
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS


In [5]:
import nltk

nltk.download('wordnet') # for download to this path
nltk.download('punkt')
nltk.data.path.append('corpora')
nltk.data.path.append('tokenizers')

try:
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    def lemmatize(text : str) -> str:
        return lemmatizer.lemmatize(text)
except Exception as e:
    print(f'failed to load WordNetLemmatizer {e}')
    def lemmatize(text : str) -> str:
        return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
try:
    from nltk.tokenize import word_tokenize
except Exception as e:
    print(f'error load nltk tokenize {e}')
    def word_tokenize(text:str)->list:
        return text.split()

In [7]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def list_to_text(l:list)->str:
    return ' '.join(l)

In [8]:
def preprocess_text(text : str) -> str:
    res = []
    text = text_cleaning(text)
    list_text = word_tokenize(text)
    for word in list_text:
        res.append(lemmatize(word))
    return list_to_text(res)

In [9]:
preprocess_text('Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.')

'over million american roll up sleeve for omicrontargeted covid booster health expert said it is too early to predict whether demand would match up with the million dos of the new booster the u ordered for the fall'

In [10]:
df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))


In [11]:
df_train.to_csv(os.path.join(os.getcwd(),'data','local_train_data_clean.csv'))

In [12]:
df_train.head()

Unnamed: 0,text_combine,category,text_combine_cleaned
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,over million american roll up sleeve for omicr...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,american airline flyer charged banned for life...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,of the funniest tweet about cat and dog this w...
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,the funniest tweet from parent this week sept ...
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,woman who called cop on black birdwatcher lose...


## feature engineering

### word2vec

In [13]:
import gensim.models
from time import time
from gensim import utils
import multiprocessing

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in df_train['text_combine_cleaned']:
            yield utils.simple_preprocess(line)

cores = multiprocessing.cpu_count() # Count the number of cores in a computer


In [14]:
sentences = MyCorpus()
model_w2v = gensim.models.Word2Vec(workers=cores-1)

In [15]:
t = time()

model_w2v.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.11 mins


In [16]:
t = time()

model_w2v.train(sentences, total_examples=model_w2v.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 1.17 mins


In [17]:
model_w2v.wv['million']

array([-1.67070889e+00,  2.97764629e-01, -3.87172191e-03, -4.15316433e-01,
       -2.21456575e+00, -1.45508075e+00, -2.70006227e+00, -3.54158354e+00,
        9.09223735e-01,  9.96697068e-01,  4.24754173e-01,  1.63320243e+00,
       -1.81076229e-01, -9.81446087e-01,  1.66050291e+00, -5.83251238e-01,
       -7.58398592e-01, -2.54763675e+00,  1.32229006e+00,  2.54999399e+00,
       -1.59519756e+00,  9.42233428e-02,  1.20004356e+00,  1.77716827e+00,
        2.40126562e+00,  2.42855096e+00, -2.75748777e+00,  2.74765402e-01,
       -1.85393643e+00,  1.62261510e+00, -1.60162663e+00,  6.00381136e-01,
       -1.54121566e+00, -6.10493183e-01,  4.96884555e-01,  2.60938287e+00,
       -2.46851373e+00,  8.68205070e-01,  2.25233245e+00, -2.17981029e+00,
       -1.02518928e+00, -3.68876696e-01, -2.83617884e-01, -9.72736120e-01,
       -1.72846043e+00, -2.68135995e-01, -3.52674460e+00, -1.60081840e+00,
       -1.12326443e-01, -6.68782532e-01, -3.44077969e+00,  1.97472858e+00,
       -2.56467968e-01,  

In [18]:
# model.wv['million']
model_w2v.wv.most_similar(positive=["million"])

[('billion', 0.8345927000045776),
 ('percent', 0.622978925704956),
 ('trillion', 0.6179170608520508),
 ('thousand', 0.6094152927398682),
 ('hundred', 0.596312403678894),
 ('multimillion', 0.5870281457901001),
 ('dollar', 0.5757449865341187),
 ('approximately', 0.5638946294784546),
 ('charity', 0.556996762752533),
 ('taxpayer', 0.5466832518577576)]

In [19]:
model_w2v.wv.similarity("million", 'money')

0.37195346

In [20]:
model_w2v.wv.doesnt_match(["million", "money", "europe"])

'europe'

In [21]:
if not os.path.exists(os.path.join(os.getcwd(),'saved_model')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model'))

if not os.path.exists(os.path.join(os.getcwd(),'saved_model','word2vec')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model','word2vec'))

model_w2v.save(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [22]:
loaded_model = gensim.models.Word2Vec.load(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [23]:
loaded_model.wv.most_similar(positive=["million"])

[('billion', 0.8345927000045776),
 ('percent', 0.622978925704956),
 ('trillion', 0.6179170608520508),
 ('thousand', 0.6094152927398682),
 ('hundred', 0.596312403678894),
 ('multimillion', 0.5870281457901001),
 ('dollar', 0.5757449865341187),
 ('approximately', 0.5638946294784546),
 ('charity', 0.556996762752533),
 ('taxpayer', 0.5466832518577576)]

### doc2vec

In [24]:
tagged_df_train = []
count = 0
for sentence in df_train['text_combine_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

In [25]:
model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=cores-1)

In [26]:
model_d2v.build_vocab(tagged_df_train)

In [27]:
print(f"Word 'million' appeared {model_d2v.wv.get_vecattr('million', 'count')} times in the training corpus.")

Word 'million' appeared 2593 times in the training corpus.


In [28]:
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)

In [29]:
vector = model_d2v.infer_vector(df_train['text_combine_cleaned'].iloc[0].split())

In [30]:
vector

array([ 0.19170862,  0.13979508, -0.22436477, -0.00709117, -0.05876046,
       -0.0101215 ,  0.13376978,  0.45984292, -0.27711463, -0.14896072,
        0.02321006, -0.14625786,  0.17133518,  0.22200453, -0.01037575,
       -0.20642774, -0.03902422, -0.19873174, -0.24897559, -0.12321483,
       -0.01739778,  0.01084521,  0.17079864,  0.06789946, -0.14071313,
       -0.03883867, -0.18509503, -0.13101707, -0.03815425, -0.12378145,
        0.04625451,  0.02155207, -0.04854417,  0.16249017,  0.0481233 ,
        0.07740457,  0.00474954,  0.04811189, -0.01997378,  0.11343069,
       -0.06328455, -0.02949234,  0.13160086,  0.11624507,  0.2014222 ,
       -0.11299323, -0.20481513, -0.19465661, -0.08726005,  0.37011474],
      dtype=float32)

In [31]:
ranks = []
second_ranks = []
for doc_id in range(len(tagged_df_train)):
    inferred_vector = model_d2v.infer_vector(tagged_df_train[doc_id].words)
    sims = model_d2v.dv.most_similar([inferred_vector], topn=len(model_d2v.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
    break

In [32]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(tagged_df_train[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
for label, index in [('MOST', 1), ('SECOND-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(tagged_df_train[sims[index][0]].words)))

Document (0): «over million american roll up sleeve for omicrontargeted covid booster health expert said it is too early to predict whether demand would match up with the million dos of the new booster the u ordered for the fall»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t11>:

MOST (50434, 0.7549108862876892): «drug price senior program may deliver blow to state budget state may face billion in new medicaid cost next year by michael ollove higher prescription drug price combined»


MEDIAN (110335, 0.3055989742279053): «doomed airasia flight black box found»

LEAST (85287, -0.5359851121902466): «way to fix baltimore police problem a few “ bad apple ” can change the culture of a whole police department honorable cop in baltimore feel pressured to honor»



In [33]:
print(df_train.iloc[0]['text_combine'])
print(df_train.iloc[58388]['text_combine'])
print(df_train.iloc[26343]['text_combine'])
print(df_train.iloc[83245]['text_combine'])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
Mylan Offers Discounts On EpiPen After Clinton Criticism The $600 list price of the drug will remain the same, but the company said it would increase the maximum copay assistance program to $300 from $100.
SEC Commissioners Rejected Settlement With ITT Tech CEO The U.S. Securities and Exchange Commission has rejected a proposed settlement of the SEC’s lawsuit against two former executives
Deepak Chopra Takes Shot At Donald Trump Spiritual leader and Super Genes author Deepak Chopra joined HuffPost Rise recently and said the biggest solution happening


### tfidf vectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
doc1="petrol cars cheaper diesel cars"
doc2="diesel cheaper petrol"
doc_corpus=[doc1,doc2]
for x in doc_corpus:
    print(x)

petrol cars cheaper diesel cars
diesel cheaper petrol


In [35]:
from collections import Counter

counter = Counter((doc1+' '+doc2).split())


In [36]:
columns_temp = list(counter.keys())
res_temp = []
count = 0
for sentence in doc_corpus:
    res_temp.append([])
    counter_temp = Counter(sentence.split())
    for key in counter:
        if key in counter_temp:
            res_temp[count].append(counter_temp[key])
        else:
            res_temp[count].append(0)
    count+=1

In [37]:
pd.DataFrame(res_temp,columns=columns_temp)

Unnamed: 0,petrol,cars,cheaper,diesel
0,1,2,1,1
1,1,0,1,1


In [38]:
vec=TfidfVectorizer(stop_words='english')
vec.fit(doc_corpus)
matrix = vec.transform(doc_corpus)
print("Feature Names n",vec.get_feature_names_out())

Feature Names n ['cars' 'cheaper' 'diesel' 'petrol']


In [39]:
print(matrix.toarray())

[[0.85135433 0.30287281 0.30287281 0.30287281]
 [0.         0.57735027 0.57735027 0.57735027]]


In [40]:
# n = Total number of documents available
# t = term for which idf value has to be calculated
# df(t) = Number of documents in which the term t appears

# idf(t) = log e [ (1+n) / ( 1 + df(t) ) ] + 1

# Here n=2 (no. Of docs)

# for d1

# idf(“cars”) = log e (3/2) +1 => 1.405465083
# idf(“cheaper”) = log e (3/3) + 1 => 1
# idf(“diesel”) = log e (3/3) + 1 => 1
# idf(“petrol”) = log e (3/3) + 1 => 1

# tf idf For d1

# tf-idf(“cars”) = tf(“cars”) x idf (“cars”) = 2 x 1.405465083 => 2.810930165
# tf-idf(“cheaper”) = tf(“cheaper”) x idf (“cheaper”) = 1 x 1 => 1
# tf-idf(“diesel”) = tf(“diesel”) x idf (“diesel”) = 1×1 => 1
# tf-idf(“petrol”) = tf(“petrol”) x idf (“petrol”) = 1×1 => 1

# normalize value d1

# 2.810930165 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.851354321
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) =>  0.302872811
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.302872811
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.302872811

In [41]:
vec.transform(['cars cars car']).toarray()

array([[1., 0., 0., 0.]])

In [42]:
# for d1

# idf(“cars”) = log e (3/2) +1 => 1.405465083

# tf idf For d1

# tf-idf(“cars”) = tf(“cars”) x idf (“cars”) = 2 x 1.405465083 => 2.810930165

# normalize value d1

# 2.810930165 / sqrt( 2.810930165^2) => 1

## Train

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [44]:
category_used = ('POLITICS', 'ENTERTAINMENT', 'WELLNESS', 'HEALTHY LIVING', 'QUEER VOICES', 'TRAVEL', 'BUSINESS', 'SPORTS', 'COMEDY')

In [45]:
df_train_selected = df_train[df_train['category'].isin(category_used)][['text_combine_cleaned','category']].reset_index(drop=True)

In [46]:
unique_col = df_train_selected['category'].unique()

In [47]:
map_col = {}

count = 0
for x in unique_col:
    map_col[count] = x
    count+=1

In [48]:
def to_category_id(x, unique_col):
    count = 0
    for col in unique_col:
        if str(x) == str(col):
            return count
        count+=1 

df_train_selected['category_id'] = df_train_selected['category'].apply(lambda x : to_category_id(x,unique_col))

In [49]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, max_features=10000,
                        stop_words='english')

# We transform each complaint into a vector
features = tfidf.fit_transform(df_train_selected['text_combine_cleaned']).toarray()

labels = df_train_selected['category_id']

In [50]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [51]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.383338
1,RandomForestClassifier,1,0.383383
2,RandomForestClassifier,2,0.383351
3,MultinomialNB,0,0.688245
4,MultinomialNB,1,0.712431
5,MultinomialNB,2,0.70106
6,LogisticRegression,0,0.735795
7,LogisticRegression,1,0.758561
8,LogisticRegression,2,0.733654


In [52]:
df_train_selected.head()

Unnamed: 0,text_combine_cleaned,category,category_id
0,of the funniest tweet about cat and dog this w...,COMEDY,0
1,maury will basestealing shortstop for dodger d...,SPORTS,1
2,golden globe returning to nbc in january after...,ENTERTAINMENT,2
3,biden say u force would defend taiwan if china...,POLITICS,3
4,‘ beautiful and sad at the same time ’ ukraini...,POLITICS,3


In [53]:
X = df_train_selected['text_combine_cleaned'] # Collection of documents
y = df_train_selected['category'] # Target or the labels we want to predict (i.e., the 13 different complaints of products)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state = 0)

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

model = LogisticRegression().fit(tfidf_vectorizer_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
test = ['eat apple loss weight']
model.predict(fitted_vectorizer.transform(test))

array(['HEALTHY LIVING'], dtype=object)

In [55]:
test = ['apple inc loss weight']
model.predict(fitted_vectorizer.transform(test))

array(['HEALTHY LIVING'], dtype=object)

In [56]:
from sklearn.metrics import classification_report

y_pred = model.predict(fitted_vectorizer.transform(X_test))
print(classification_report(y_test, y_pred, target_names=model.classes_))

                precision    recall  f1-score   support

      BUSINESS       0.75      0.53      0.62      1229
        COMEDY       0.70      0.39      0.50      1144
 ENTERTAINMENT       0.78      0.85      0.82      4217
HEALTHY LIVING       0.55      0.39      0.46      1693
      POLITICS       0.83      0.95      0.89      8790
  QUEER VOICES       0.87      0.67      0.76      1382
        SPORTS       0.85      0.69      0.76      1201
        TRAVEL       0.85      0.82      0.83      1485
      WELLNESS       0.61      0.72      0.66      2077

      accuracy                           0.78     23218
     macro avg       0.76      0.67      0.70     23218
  weighted avg       0.78      0.78      0.77     23218



In [57]:
tagged_df_train = []
count = 0
for sentence in df_train_selected['text_combine_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=cores-1)
model_d2v.build_vocab(tagged_df_train)
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)
vector = model_d2v.infer_vector(df_train_selected['text_combine_cleaned'].iloc[0].split())

In [58]:
import numpy as np

res_zero = model_d2v.infer_vector(df_train_selected['text_combine_cleaned'].iloc[0].split())
init_array = np.reshape(res_zero, (-1, 50))
count = 0
for word in df_train_selected['text_combine_cleaned']:
    # print(count)
    if count>0:
        res_ = model_d2v.infer_vector(word.split())
        init_array = np.vstack((init_array, res_))
    count+=1

In [59]:
len(df_train_selected['text_combine_cleaned'])

92869

In [72]:
y = df_train_selected['category'] 
X_train, X_test, y_train, y_test = train_test_split(init_array, y, 
                                                    test_size=0.25,
                                                    random_state = 0)
labels = df_train_selected['category_id']

In [73]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [74]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.383338
1,RandomForestClassifier,1,0.383383
2,RandomForestClassifier,2,0.383351
3,MultinomialNB,0,0.688245
4,MultinomialNB,1,0.712431
5,MultinomialNB,2,0.70106
6,LogisticRegression,0,0.735795
7,LogisticRegression,1,0.758561
8,LogisticRegression,2,0.733654


In [69]:
y = df_train_selected['category'] 
X_train, X_test, y_train, y_test = train_test_split(init_array, y, 
                                                    test_size=0.25,
                                                    random_state = 0)

model = LogisticRegression().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [71]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=model.classes_))

                precision    recall  f1-score   support

      BUSINESS       0.53      0.26      0.35      1229
        COMEDY       0.37      0.08      0.13      1144
 ENTERTAINMENT       0.60      0.74      0.66      4217
HEALTHY LIVING       0.42      0.16      0.23      1693
      POLITICS       0.70      0.89      0.78      8790
  QUEER VOICES       0.58      0.40      0.47      1382
        SPORTS       0.63      0.36      0.46      1201
        TRAVEL       0.72      0.66      0.69      1485
      WELLNESS       0.57      0.65      0.61      2077

      accuracy                           0.64     23218
     macro avg       0.57      0.47      0.49     23218
  weighted avg       0.61      0.64      0.61     23218

