# Read the News Analysis
## Newspapers and their online formats supply the public with the information we need to understand the events occurring in the world around us. From politics to sports, the news keeps us informed, in the loop, and ready to make decisions about how to act in a rapidly changing world.
## Given the vast number of news articles in circulation, identifying and organizing articles by topic is a useful activity. This can help you sift through the enormous amount of information out there so you can find the news relevant to your interests, or even allow you to build a news recommendation engine!
## In this project you will use term frequency-inverse document frequency (tf-idf) to analyze each article’s content and uncover the terms that best describe each article, providing quick insight into each article’s topic.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer , TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import re
import nltk
from nltk import pos_tag
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

/kaggle/input/news-articles/Articles.csv


In [2]:
# Load the news-articles dataset into a DataFrame.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# in the file as escapes; presumably it was chosen to get past a decode error
# with the default encoding -- confirm the file's real encoding (e.g. latin-1).
df = pd.read_csv('/kaggle/input/news-articles/Articles.csv' , encoding= 'unicode_escape')

In [3]:
# English stop words, held as a set: preprocess() tests every token for
# membership, and a set lookup is O(1) where a list would be scanned linearly.
stop_words = set(stopwords.words('english'))
# Shared WordNet lemmatizer used to reduce tokens to their base form.
normalizer = WordNetLemmatizer()

In [4]:
def get_part_of_speech(word):
    """Guess the most likely WordNet part-of-speech tag for ``word``.

    Every synset WordNet returns for the word is tallied by its
    part-of-speech tag, counting only "n", "v", "a" and "r"; any other tag
    is ignored.  The tag with the highest tally is returned.  Ties --
    including a word with no synsets at all -- resolve to the earliest tag
    in that order, so an unknown word comes back as "n".

    Args:
        word: The token to look up in WordNet.

    Returns:
        One of the strings "n", "v", "a" or "r".
    """
    # Seed the tags in priority order so equal counts break as n, v, a, r.
    tally = Counter({"n": 0, "v": 0, "a": 0, "r": 0})
    for synset in wordnet.synsets(word):
        tag = synset.pos()
        if tag in tally:
            tally[tag] += 1
    (best_tag, _count), = tally.most_common(1)
    return best_tag

In [5]:
def preprocess(article):
    """Normalize raw article text into a space-separated string of lemmas.

    Runs of non-word characters are collapsed to single spaces and the text
    is lower-cased before tokenizing.  Tokens that begin with a digit and
    tokens found in ``stop_words`` are dropped; every remaining token is
    lemmatized using the part of speech guessed by ``get_part_of_speech``.

    Args:
        article: The raw article text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lowered = re.sub(r'\W+', ' ', article).lower()
    lemmas = []
    for token in word_tokenize(lowered):
        # Skip tokens with a leading digit, then English stop words.
        if re.match(r'\d+', token) or token in stop_words:
            continue
        lemmas.append(normalizer.lemmatize(token, get_part_of_speech(token)))
    return " ".join(lemmas)

In [6]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [7]:
# Pull the article bodies out of the DataFrame as a plain Python list.
# Series.tolist() does this in one call instead of re-selecting the
# 'Article' column and indexing it with .iloc once per row.
articles = df['Article'].tolist()

In [8]:
# Clean, tokenize and lemmatize every article: one normalized string each.
# (The variable name's spelling is kept because later cells refer to it.)
preprcessed_articles = [preprocess(article) for article in articles]

In [9]:
# Bag-of-words vectorizer, constructed with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [10]:
# Learn the vocabulary from the preprocessed articles and count term
# occurrences.  The result is transposed later before being labelled with
# terms as rows and articles as columns.
counts = vectorizer.fit_transform(preprcessed_articles)

In [11]:
# Column labels for the tables below: "Article 1", "Article 2", ...
article_index = [f"Article {position}" for position in range(1, len(articles) + 1)]

# Row labels: the vocabulary learned by the fitted vectorizer.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that predate it.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

In [12]:
# Term-by-article table of raw counts (rows: vocabulary terms, columns: articles).
try:
    # toarray() yields a plain ndarray; todense() would return an np.matrix.
    df_word_counts = pd.DataFrame(counts.T.toarray(), index=feature_names, columns=article_index)
    print(df_word_counts.head(10))
except Exception as error:
    # Best-effort display: say why the table could not be built rather than
    # failing silently.
    print(f"Could not build the word-count table: {error!r}")

              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__          0          0          0          0          0   
a300                  0          0          0          0          0   
a320                  0          0          0          0          0   
a321                  0          0          0          0          0   
a330                  0          0          0          0          0   
a330s                 0          0          0          0          0   
a350                  0          0          0          0          0   
a350s                 0          0          0          0          0   
a380                  0          0          0          0          0   
aa                    0          0          0          0          0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__          0          0          0          0           0  ...   
a300                  0          0          0          0        

In [13]:
# tf-idf re-weighting for an existing count matrix; norm=None leaves the
# resulting scores un-normalized.
transformer = TfidfTransformer(norm = None)

In [14]:
# Convert the raw term counts into tf-idf scores.
tfidf_scores_transformed = transformer.fit_transform(counts)

In [15]:
# Term-by-article table of tf-idf scores computed from the count matrix.
try:
    # toarray() yields a plain ndarray; todense() would return an np.matrix.
    df_tf_idf = pd.DataFrame(tfidf_scores_transformed.T.toarray(), index=feature_names, columns=article_index)
    print(df_tf_idf.head(10))
except Exception as error:
    # Best-effort display: say why the table could not be built rather than
    # failing silently.
    print(f"Could not build the tf-idf table: {error!r}")

              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__        0.0        0.0        0.0        0.0        0.0   
a300                0.0        0.0        0.0        0.0        0.0   
a320                0.0        0.0        0.0        0.0        0.0   
a321                0.0        0.0        0.0        0.0        0.0   
a330                0.0        0.0        0.0        0.0        0.0   
a330s               0.0        0.0        0.0        0.0        0.0   
a350                0.0        0.0        0.0        0.0        0.0   
a350s               0.0        0.0        0.0        0.0        0.0   
a380                0.0        0.0        0.0        0.0        0.0   
aa                  0.0        0.0        0.0        0.0        0.0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__        0.0        0.0        0.0        0.0         0.0  ...   
a300                0.0        0.0        0.0        0.0        

In [16]:
# One-step alternative: TfidfVectorizer goes from text to tf-idf scores in a
# single estimator.  NOTE: this rebinds `vectorizer`, replacing the
# CountVectorizer created earlier.
vectorizer = TfidfVectorizer(norm = None)

In [17]:
# Compute tf-idf scores straight from the preprocessed article text.
tfidf_scores = vectorizer.fit_transform(preprcessed_articles)

In [18]:

# Term-by-article table of tf-idf scores from TfidfVectorizer; this is the
# df_tf_idf that the top-term lookup at the end of the notebook reads.
# NOTE(review): feature_names was taken from the earlier CountVectorizer, so
# the row labels are right only if both vectorizers learned the same
# vocabulary -- presumably true, as both are fitted on the same text with
# default tokenization; confirm.
try:
    # toarray() yields a plain ndarray; todense() would return an np.matrix.
    df_tf_idf = pd.DataFrame(tfidf_scores.T.toarray(), index=feature_names, columns=article_index)
    print(df_tf_idf.head(10))
except Exception as error:
    # Best-effort display: say why the table could not be built rather than
    # failing silently.
    print(f"Could not build the tf-idf table: {error!r}")


              Article 1  Article 2  Article 3  Article 4  Article 5  \
__cf_email__        0.0        0.0        0.0        0.0        0.0   
a300                0.0        0.0        0.0        0.0        0.0   
a320                0.0        0.0        0.0        0.0        0.0   
a321                0.0        0.0        0.0        0.0        0.0   
a330                0.0        0.0        0.0        0.0        0.0   
a330s               0.0        0.0        0.0        0.0        0.0   
a350                0.0        0.0        0.0        0.0        0.0   
a350s               0.0        0.0        0.0        0.0        0.0   
a380                0.0        0.0        0.0        0.0        0.0   
aa                  0.0        0.0        0.0        0.0        0.0   

              Article 6  Article 7  Article 8  Article 9  Article 10  ...  \
__cf_email__        0.0        0.0        0.0        0.0         0.0  ...   
a300                0.0        0.0        0.0        0.0        

In [19]:
# Sanity check: the two-step (CountVectorizer + TfidfTransformer) and the
# one-step (TfidfVectorizer) pipelines should yield numerically equal scores.
scores_match = np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense())
verdict = 'YES' if scores_match else 'No, something is wrong :('
print(pd.DataFrame({'Are the tf-idf scores the same?': [verdict]}))

  Are the tf-idf scores the same?
0                             YES


In [20]:
# For articles 1 through 19, print the term with the highest tf-idf score.
for article_number in range(1, 20):
    column = [f'Article {article_number}']
    top_term = df_tf_idf[column].idxmax()
    print("Topic of  , " , top_term)

Topic of  ,  Article 1    fare
dtype: object
Topic of  ,  Article 2    percent
dtype: object
Topic of  ,  Article 3    hong
dtype: object
Topic of  ,  Article 4    eurozone
dtype: object
Topic of  ,  Article 5    oil
dtype: object
Topic of  ,  Article 6    arabia
dtype: object
Topic of  ,  Article 7    kse
dtype: object
Topic of  ,  Article 8    ang
dtype: object
Topic of  ,  Article 9    sugar
dtype: object
Topic of  ,  Article 10    oil
dtype: object
Topic of  ,  Article 11    yen
dtype: object
Topic of  ,  Article 12    hong
dtype: object
Topic of  ,  Article 13    barrel
dtype: object
Topic of  ,  Article 14    petrol
dtype: object
Topic of  ,  Article 15    price
dtype: object
Topic of  ,  Article 16    petrol
dtype: object
Topic of  ,  Article 17    notification
dtype: object
Topic of  ,  Article 18    percent
dtype: object
Topic of  ,  Article 19    ecc
dtype: object
