In [94]:
import pandas as pd #Dataframe Manipulation library
import numpy as np #Data Manipulation library

#sklearn modules for Feature Extraction & Modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity

#Libraries for Plotting 
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import joblib
import os
import glob

import re
import string

In [95]:
try:
    df = pd.read_json(os.path.join(os.getcwd(),'data','News_Category_Dataset_v3.json'), lines=True)
except Exception as e:
    df = pd.read_csv(os.path.join(os.getcwd(),'data','category_news_selected.csv'))
df = df.dropna()

In [3]:
df['text_combine'] = df['headline'] + " " + df['short_description']
df = df[['text_combine','category']]
df_train = df.iloc[:int(len(df)*0.8)]
df_test = df.iloc[int(len(df)*0.8):len(df)]

In [4]:
df_train.head()

Unnamed: 0,text_combine,category
0,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY
1,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS
4,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS
5,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS
6,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT


In [5]:
import nltk

nltk.download('wordnet') # for download to this path
nltk.download('punkt')
nltk.data.path.append('corpora')
nltk.data.path.append('tokenizers')

try:
    from nltk.stem.wordnet import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    def lemmatize(text : str) -> str:
        return lemmatizer.lemmatize(text)
except Exception as e:
    print(f'failed to load WordNetLemmatizer {e}')
    def lemmatize(text : str) -> str:
        return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
try:
    from nltk.tokenize import word_tokenize
except Exception as e:
    print(f'error load nltk tokenize {e}')
    def word_tokenize(text:str)->list:
        return text.split()

In [7]:
def text_cleaning(text:str)->str:
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def list_to_text(l:list)->str:
    return ' '.join(l)

In [8]:
def preprocess_text(text : str) -> str:
    res = []
    text = text_cleaning(text)
    list_text = word_tokenize(text)
    for word in list_text:
        res.append(lemmatize(word))
    return list_to_text(res)

In [9]:
preprocess_text('Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.')

'over million american roll up sleeve for omicrontargeted covid booster health expert said it is too early to predict whether demand would match up with the million dos of the new booster the u ordered for the fall'

In [10]:
df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['text_combine_cleaned'] = df_train['text_combine'].apply(lambda x:preprocess_text(x))


In [11]:
df_train.to_csv(os.path.join(os.getcwd(),'data','local_train_data_clean.csv'))

In [12]:
df_train.head()

Unnamed: 0,text_combine,category,text_combine_cleaned
0,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,of the funniest tweet about cat and dog this w...
1,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,maury will basestealing shortstop for dodger d...
4,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,‘ beautiful and sad at the same time ’ ukraini...
5,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS,la vega ace win first wnba title chelsea gray ...
6,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT,james cameron say he clashed with studio befor...


In [91]:
df_train.shape

(68642, 3)

## feature engineering

### tfidf vectorizer

In [13]:
# We will check with different data and use this method again in training
from sklearn.feature_extraction.text import TfidfVectorizer
doc1="petrol cars cheaper diesel cars"
doc2="diesel cheaper petrol"
doc_corpus=[doc1,doc2]
for x in doc_corpus:
    print(x)

petrol cars cheaper diesel cars
diesel cheaper petrol


In [14]:
from collections import Counter

counter = Counter((doc1+' '+doc2).split())


In [15]:
columns_temp = list(counter.keys())
res_temp = []
count = 0
for sentence in doc_corpus:
    res_temp.append([])
    counter_temp = Counter(sentence.split())
    for key in counter:
        if key in counter_temp:
            res_temp[count].append(counter_temp[key])
        else:
            res_temp[count].append(0)
    count+=1

In [16]:
pd.DataFrame(res_temp,columns=columns_temp)

Unnamed: 0,petrol,cars,cheaper,diesel
0,1,2,1,1
1,1,0,1,1


In [17]:
vec=TfidfVectorizer(stop_words='english')
vec.fit(doc_corpus)
matrix = vec.transform(doc_corpus)
print("Feature Names n",vec.get_feature_names_out())

Feature Names n ['cars' 'cheaper' 'diesel' 'petrol']


In [18]:
print(matrix.toarray())

[[0.85135433 0.30287281 0.30287281 0.30287281]
 [0.         0.57735027 0.57735027 0.57735027]]


In [19]:
# n = Total number of documents available
# t = term for which idf value has to be calculated
# df(t) = Number of documents in which the term t appears

# idf(t) = log e [ (1+n) / ( 1 + df(t) ) ] + 1

# Here n=2 (no. Of docs)

# for d1

# idf(“cars”) = log e (3/2) +1 => 1.405465083
# idf(“cheaper”) = log e (3/3) + 1 => 1
# idf(“diesel”) = log e (3/3) + 1 => 1
# idf(“petrol”) = log e (3/3) + 1 => 1

# tf idf For d1

# tf-idf(“cars”) = tf(“cars”) x idf (“cars”) = 2 x 1.405465083 => 2.810930165
# tf-idf(“cheaper”) = tf(“cheaper”) x idf (“cheaper”) = 1 x 1 => 1
# tf-idf(“diesel”) = tf(“diesel”) x idf (“diesel”) = 1×1 => 1
# tf-idf(“petrol”) = tf(“petrol”) x idf (“petrol”) = 1×1 => 1

# normalize value d1

# 2.810930165 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.851354321
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) =>  0.302872811
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.302872811
# 1 / sqrt( 2.810930165^2 + 1^2 + 1^2 + 1^2) => 0.302872811

In [20]:
vec.transform(['cars cars car']).toarray()

array([[1., 0., 0., 0.]])

In [21]:
# for d1

# idf(“cars”) = log e (3/2) +1 => 1.405465083

# tf idf For d1

# tf-idf(“cars”) = tf(“cars”) x idf (“cars”) = 2 x 1.405465083 => 2.810930165

# normalize value d1

# 2.810930165 / sqrt( 2.810930165^2) => 1

### word2vec

In [22]:
import gensim.models
from time import time
from gensim import utils
import multiprocessing

class MyCorpus:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        for line in df_train['text_combine_cleaned']:
            yield utils.simple_preprocess(line)

cores = multiprocessing.cpu_count() # Count the number of cores in a computer


In [23]:
sentences = MyCorpus()
model_w2v = gensim.models.Word2Vec(workers=cores-1)

In [24]:
t = time()

model_w2v.build_vocab(sentences, progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.04 mins


In [25]:
t = time()

model_w2v.train(sentences, total_examples=model_w2v.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.57 mins


In [26]:
model_w2v.wv['million']

array([ 0.12731877,  0.7004593 , -0.38843068, -1.4380304 ,  0.1127122 ,
       -1.7582327 ,  0.9973596 , -0.92865074,  2.1139367 , -1.031768  ,
        0.07705914,  1.6947329 , -1.719214  , -0.62598634, -0.6141067 ,
        1.7429329 ,  2.257469  , -0.40298876,  1.7815447 ,  3.1612105 ,
       -1.7909504 ,  3.5075374 ,  0.8425165 , -1.4787859 ,  2.941333  ,
        0.6435059 , -0.4635814 , -0.86058694, -2.829369  , -0.0326717 ,
        0.85002637,  0.67172664, -1.8341613 ,  0.2758235 , -1.2075851 ,
       -0.2550808 , -1.7684009 ,  0.9830752 , -0.6362522 , -0.9478137 ,
        0.48691228,  1.582207  , -2.1133778 ,  0.08790875, -0.30527428,
       -0.666891  , -0.61124134,  1.7523847 ,  1.5287738 , -0.03888857,
        1.4808089 ,  0.36731514, -0.5734472 ,  0.05290028, -1.0429794 ,
        1.8851348 ,  0.3507833 ,  0.05946754, -0.9343644 ,  0.21388546,
        1.1243306 , -0.10084579, -2.9172482 , -0.13404709,  0.161958  ,
        0.81922656, -1.3036067 , -1.5698212 , -0.99485475,  0.78

In [27]:
# model.wv['million']
model_w2v.wv.most_similar(positive=["million"])

[('billion', 0.7381351590156555),
 ('dollar', 0.6780257225036621),
 ('percent', 0.6763569116592407),
 ('thousand', 0.6651930212974548),
 ('hundred', 0.6272211074829102),
 ('nearly', 0.601820707321167),
 ('taxpayer', 0.5976290702819824),
 ('charity', 0.5879110097885132),
 ('trillion', 0.5599369406700134),
 ('hike', 0.551967978477478)]

In [28]:
model_w2v.wv.similarity("million", 'money')

0.39496014

In [29]:
model_w2v.wv.doesnt_match(["million", "money", "europe"])

'europe'

In [30]:
if not os.path.exists(os.path.join(os.getcwd(),'saved_model')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model'))

if not os.path.exists(os.path.join(os.getcwd(),'saved_model','word2vec')):
    os.makedirs(os.path.join(os.getcwd(),'saved_model','word2vec'))

model_w2v.save(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [31]:
loaded_model = gensim.models.Word2Vec.load(os.path.join(os.getcwd(),'saved_model','word2vec','gensim-word2vec-model'))

In [32]:
loaded_model.wv.most_similar(positive=["million"])

[('billion', 0.7381351590156555),
 ('dollar', 0.6780257225036621),
 ('percent', 0.6763569116592407),
 ('thousand', 0.6651930212974548),
 ('hundred', 0.6272211074829102),
 ('nearly', 0.601820707321167),
 ('taxpayer', 0.5976290702819824),
 ('charity', 0.5879110097885132),
 ('trillion', 0.5599369406700134),
 ('hike', 0.551967978477478)]

### doc2vec

In [74]:
tagged_df_train = []
count = 0
for sentence in df_train['text_combine_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

In [75]:
model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=20, workers=cores-1)

In [76]:
model_d2v.build_vocab(tagged_df_train)

In [77]:
print(f"Word 'million' appeared {model_d2v.wv.get_vecattr('million', 'count')} times in the training corpus.")

Word 'million' appeared 1112 times in the training corpus.


In [78]:
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)

In [79]:
vector = model_d2v.infer_vector(df_train['text_combine_cleaned'].iloc[0].split())

In [80]:
vector

array([ 0.2339821 , -0.1220803 , -0.09300172,  0.26324868,  0.03976153,
       -0.0991462 ,  0.09826183,  0.13130124,  0.1791708 ,  0.2444231 ,
        0.09562145, -0.1548553 , -0.05314378,  0.153964  , -0.01433426,
       -0.08794248,  0.15181649, -0.15645322,  0.14349115, -0.10716932,
        0.12722124, -0.20974667,  0.18901801,  0.04673327, -0.08752485,
       -0.09929431,  0.18107665, -0.20287281, -0.08058515,  0.0653967 ,
        0.22349824,  0.03104123,  0.37100613,  0.0397216 , -0.0035311 ,
        0.14860617,  0.00680171,  0.06424663, -0.09308906, -0.04910458,
       -0.13124786, -0.02382417,  0.02896591,  0.0757371 , -0.04795402,
       -0.11993109,  0.07988014, -0.03982693, -0.1395995 ,  0.3883583 ,
       -0.15365298,  0.2327805 , -0.11685895, -0.0441923 ,  0.2789711 ,
       -0.07769088, -0.17566225,  0.14162639, -0.09826218,  0.00123879,
       -0.05805755, -0.02212106,  0.0028285 ,  0.01568701, -0.06692484,
       -0.04664307,  0.08141559,  0.15464234, -0.03596251, -0.33

In [105]:
ranks = []
second_ranks = []
for doc_id in range(len(tagged_df_train)):
    if doc_id == 4:
        inferred_vector = model_d2v.infer_vector(tagged_df_train[doc_id].words)
        sims = model_d2v.dv.most_similar([inferred_vector], topn=len(model_d2v.dv))
        rank = [docid for docid, sim in sims].index(doc_id)
        ranks.append(rank)

        second_ranks.append(sims[1])
        break

In [106]:
df_train.head()

Unnamed: 0,text_combine,category,text_combine_cleaned
0,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,of the funniest tweet about cat and dog this w...
1,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,maury will basestealing shortstop for dodger d...
4,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,‘ beautiful and sad at the same time ’ ukraini...
5,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS,la vega ace win first wnba title chelsea gray ...
6,James Cameron Says He 'Clashed' With Studio Be...,ENTERTAINMENT,james cameron say he clashed with studio befor...


In [109]:
print('{} Document ({}): «{}»\n'.format(df_train['category'].iloc[doc_id], doc_id, ' '.join(tagged_df_train[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_d2v)
for label, index in [('MOST', 1), ('SECOND-MOST', 2), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s %s: «%s»\n' % (df_train['category'].iloc[index] ,label, sims[index], ' '.join(tagged_df_train[sims[index][0]].words)))

ENTERTAINMENT Document (4): «james cameron say he clashed with studio before avatar release the avatar director said aspect of his movie are still competitive with everything that ’ s out there these day»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d200,n5,w5,mc2,s0.001,t7>:

SPORTS MOST (21919, 0.5122023820877075): «trump say he shocked by all the meanness out there the president admits the combativeness of his white house could be my fault»

POLITICS SECOND-MOST (45244, 0.4905182719230652): «this note left in robert griffin iii ’ s locker sure seems like a clue to his future it certainly look like rgiii is ready to get out of dc»

ENTERTAINMENT MEDIAN (37694, 0.1514284461736679): «before scotus blockade this gop senator wanted obama to hurry up and fill federal vacancy ron johnson said last year that president should fill vacancy within day»

COMEDY LEAST (40922, -0.22145676612854004): «the army tell it soldier to get some sleep it leader are looking to improve troop performance 

In [43]:
# print(df_train.iloc[0]['text_combine'])
# print(df_train.iloc[58388]['text_combine'])
# print(df_train.iloc[26343]['text_combine'])
# print(df_train.iloc[83245]['text_combine'])

## Train

### using tf idf vectorizer

#### use cv

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [54]:
category_used = ('POLITICS', 'ENTERTAINMENT', 'WELLNESS', 'HEALTHY LIVING', 'QUEER VOICES', 'TRAVEL', 'BUSINESS', 'SPORTS', 'COMEDY')
df_train_selected = df_train[df_train['category'].isin(category_used)][['text_combine_cleaned','category']].reset_index(drop=True)

In [55]:
unique_col = df_train_selected['category'].unique()

In [56]:
map_col = {}

count = 0
for x in unique_col:
    map_col[count] = x
    count+=1

In [57]:
def to_category_id(x, unique_col):
    count = 0
    for col in unique_col:
        if str(x) == str(col):
            return count
        count+=1 

df_train_selected['category_id'] = df_train_selected['category'].apply(lambda x : to_category_id(x,unique_col))

In [58]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, max_features=10000,
                        stop_words='english')

features = tfidf.fit_transform(df_train_selected['text_combine_cleaned']).toarray()

labels = df_train_selected['category_id']

In [59]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [60]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.432499
1,RandomForestClassifier,1,0.432455
2,RandomForestClassifier,2,0.432474
3,MultinomialNB,0,0.672698
4,MultinomialNB,1,0.703422
5,MultinomialNB,2,0.699607
6,LogisticRegression,0,0.740964
7,LogisticRegression,1,0.76605
8,LogisticRegression,2,0.7375


In [61]:
df_train_selected.head()

Unnamed: 0,text_combine_cleaned,category,category_id
0,of the funniest tweet about cat and dog this w...,COMEDY,0
1,maury will basestealing shortstop for dodger d...,SPORTS,1
2,‘ beautiful and sad at the same time ’ ukraini...,POLITICS,2
3,la vega ace win first wnba title chelsea gray ...,SPORTS,1
4,james cameron say he clashed with studio befor...,ENTERTAINMENT,3


#### use selected model

In [62]:
X = df_train_selected['text_combine_cleaned'] # Collection of documents
y = df_train_selected['category'] # Target or the labels we want to predict (i.e., the 13 different complaints of products)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state = 0)

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

fitted_vectorizer = tfidf.fit(X_train)
tfidf_vectorizer_vectors = fitted_vectorizer.transform(X_train)

model = LogisticRegression().fit(tfidf_vectorizer_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
test = ['eat apple loss weight']
model.predict(fitted_vectorizer.transform(test))

array(['HEALTHY LIVING'], dtype=object)

In [64]:
test = ['apple inc loss weight']
model.predict(fitted_vectorizer.transform(test))

array(['HEALTHY LIVING'], dtype=object)

In [65]:
from sklearn.metrics import classification_report

y_pred = model.predict(fitted_vectorizer.transform(X_test))
print(classification_report(y_test, y_pred, target_names=model.classes_))

                precision    recall  f1-score   support

      BUSINESS       0.78      0.56      0.66       870
        COMEDY       0.73      0.42      0.53       899
 ENTERTAINMENT       0.78      0.86      0.81      3165
HEALTHY LIVING       0.52      0.58      0.55      1237
      POLITICS       0.84      0.96      0.89      7408
  QUEER VOICES       0.85      0.62      0.72       988
        SPORTS       0.86      0.67      0.75       844
        TRAVEL       0.87      0.75      0.80       875
      WELLNESS       0.49      0.32      0.39       875

      accuracy                           0.79     17161
     macro avg       0.75      0.64      0.68     17161
  weighted avg       0.78      0.79      0.78     17161



### using doc2vec

#### select model with cv

In [112]:
tagged_df_train = []
count = 0
for sentence in df_train_selected['text_combine_cleaned']:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=cores-1)
model_d2v.build_vocab(tagged_df_train)
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)
vector = model_d2v.infer_vector(df_train_selected['text_combine_cleaned'].iloc[0].split())

In [113]:
import numpy as np

res_zero = model_d2v.infer_vector(df_train_selected['text_combine_cleaned'].iloc[0].split())
init_array = np.reshape(res_zero, (-1, 50))
count = 0
for word in df_train_selected['text_combine_cleaned']:
    # print(count)
    if count>0:
        res_ = model_d2v.infer_vector(word.split())
        init_array = np.vstack((init_array, res_))
    count+=1

In [114]:
len(df_train_selected['text_combine_cleaned'])

68642

In [115]:
y = df_train_selected['category'] 
X_train, X_test, y_train, y_test = train_test_split(init_array, y, 
                                                    test_size=0.25,
                                                    random_state = 0)
labels = df_train_selected['category_id']

In [116]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 3
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [117]:
cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.432499
1,RandomForestClassifier,1,0.432455
2,RandomForestClassifier,2,0.432474
3,MultinomialNB,0,0.672698
4,MultinomialNB,1,0.703422
5,MultinomialNB,2,0.699607
6,LogisticRegression,0,0.740964
7,LogisticRegression,1,0.76605
8,LogisticRegression,2,0.7375


#### use selected model

In [111]:
df_train_selected.head()

Unnamed: 0,text_combine_cleaned,category,category_id
0,of the funniest tweet about cat and dog this w...,COMEDY,0
1,maury will basestealing shortstop for dodger d...,SPORTS,1
2,‘ beautiful and sad at the same time ’ ukraini...,POLITICS,2
3,la vega ace win first wnba title chelsea gray ...,SPORTS,1
4,james cameron say he clashed with studio befor...,ENTERTAINMENT,3


In [118]:
y = df_train_selected['category'] 
X_train, X_test, y_train, y_test = train_test_split(df_train_selected['text_combine_cleaned'], y, 
                                                    test_size=0.25,
                                                    random_state = 0)
labels = df_train_selected['category_id']

In [119]:
tagged_df_train = []
count = 0
for sentence in X_train:
    tagged_df_train.append(gensim.models.doc2vec.TaggedDocument(sentence.split(), [count]))
    count+=1

model_d2v = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10, workers=cores-1)
model_d2v.build_vocab(tagged_df_train)
model_d2v.train(tagged_df_train, total_examples=model_d2v.corpus_count, epochs=model_d2v.epochs)
vector = model_d2v.infer_vector(df_train_selected['text_combine_cleaned'].iloc[0].split())

In [120]:
import numpy as np

res_zero = model_d2v.infer_vector(X_train.iloc[0].split())
X_train_array = np.reshape(res_zero, (-1, 50))
count = 0
for words in X_train:
    if count>0:
        res_ = model_d2v.infer_vector(words.split())
        X_train_array = np.vstack((X_train_array, res_))
    count+=1

In [121]:
model = LogisticRegression().fit(X_train_array, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [130]:
import numpy as np

res_zero = model_d2v.infer_vector(X_test.iloc[0].split())
X_test_array = np.reshape(res_zero, (-1, 50))
count = 0
for words in X_test:
    if count>0:
        res_ = model_d2v.infer_vector(words.split())
        X_test_array = np.vstack((X_test_array, res_))
    count+=1

In [131]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_array)
print(classification_report(y_test, y_pred, target_names=model.classes_))

                precision    recall  f1-score   support

      BUSINESS       0.57      0.29      0.38       870
        COMEDY       0.29      0.08      0.12       899
 ENTERTAINMENT       0.61      0.75      0.67      3165
HEALTHY LIVING       0.39      0.37      0.38      1237
      POLITICS       0.72      0.89      0.80      7408
  QUEER VOICES       0.55      0.33      0.41       988
        SPORTS       0.60      0.33      0.43       844
        TRAVEL       0.66      0.61      0.63       875
      WELLNESS       0.41      0.19      0.26       875

      accuracy                           0.64     17161
     macro avg       0.53      0.43      0.45     17161
  weighted avg       0.61      0.64      0.61     17161

