## Importing Libraries

In [41]:
## Importing libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Loading Data Set

In [2]:
## Loading Data in dataframe
df = pd.read_csv("abcnews-date-text.csv")

## Performing Data Wrangling

In [5]:
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103663 entries, 0 to 1103662
Data columns (total 2 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   publish_date   1103663 non-null  int64 
 1   headline_text  1103663 non-null  object
dtypes: int64(1), object(1)
memory usage: 16.8+ MB


In [8]:
## Checking count of data
df.groupby("publish_date").count()

Unnamed: 0_level_0,headline_text
publish_date,Unnamed: 1_level_1
20030219,198
20030220,250
20030221,250
20030222,126
20030223,136
...,...
20171227,70
20171228,73
20171229,89
20171230,61


In [10]:
## Count of null values
df.isnull().count()

publish_date     1103663
headline_text    1103663
dtype: int64

In [12]:
## Check if there is any null values
df.isnull().values.any()

False

In [11]:
## Getting the shape of dataframe
df.shape

(1103663, 2)

## Document Term Matrix

In [17]:
vectorizer = CountVectorizer(max_df=0.95,min_df=2,max_features=1000,ngram_range=(1,2),stop_words='english')

In [36]:
data_vec = vectorizer.fit_transform(df['headline_text'])

In [40]:
data_vec.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

## Creating LDA Model

In [42]:
## Creating the model
lda_model = LatentDirichletAllocation(n_components=20,               # Number of topics
                                    max_iter=10,               # Max learning iteration     
                                    learning_method='online',   
                                    random_state=40,           # Random state      
                                    batch_size=128,            # n docs in each learning iter
                                    evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                    n_jobs = -1               # Use all available CPUs
                               )


In [43]:
## Fitting the model on vectrorized data
lda = lda_model.fit_transform(data_vec)

In [44]:
print(lda_model)

LatentDirichletAllocation(learning_method='online', n_components=20, n_jobs=-1,
                          random_state=40)


In [45]:
print(lda_model.get_params())

{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 40, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


## Getting Top 5 topics

In [46]:
import mglearn as mg

In [49]:
sorting = np.argsort(lda_model.components_)[:,::-1]

In [50]:
features = np.array(vectorizer.get_feature_names())

In [53]:
mg.tools.print_topics(topics=range(5),feature_names=features,sorting=sorting,topics_per_chunk=5, n_words=25)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
australia     funding       report        man           new           
crash         budget        car           qld           south         
china         help          adelaide      murder        coast         
hour          power         hit           deal          gold          
sex           urged         indigenous    face          act           
country hour  mining        dead          mp            record        
final         abuse         centre        charges       change        
weather       opposition    fears         life          takes         
2015          return        residents     alleged       city          
fatal         michael       port          court         housing       
probe         million       federal       run           gold coast    
war           team          jobs          young         food          
korea 