### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text

# import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

### Viewing article sample

In [2]:
# View a sample article: show the first entry (index 0) of the imported
# `articles` collection as this cell's output.
articles[0]

'KARACHI: The Sindh government has decided to bring down public transport fares by 7 per cent due to massive reduction in petroleum product prices by the federal government, Geo News reported.Sources said reduction in fares will be applicable on public transport, rickshaw, taxi and other means of traveling. Meanwhile, Karachi Transport Ittehad (KTI) has refused to abide by the government decision.KTI President Irshad Bukhari said the commuters are charged the lowest fares in Karachi as compare to other parts of the country, adding that 80pc vehicles run on Compressed Natural Gas (CNG). Bukhari said Karachi transporters will cut fares when decrease in CNG prices will be made.'

### Viewing sample of preprocessed articles

In [3]:
# Run every raw article through the project's `preprocess_text` helper,
# keeping the results in the same order as `articles`.
processed_articles = list(map(preprocess_text, articles))

# Show the first preprocessed article as this cell's output.
processed_articles[0]

'karachi the sindh government have decide to bring down public transport fare by per cent due to massive reduction in petroleum product price by the federal government geo news report source say reduction in fare will be applicable on public transport rickshaw taxi and other mean of travel meanwhile karachi transport ittehad kti have refuse to abide by the government decision kti president irshad bukhari say the commuter be charge the low fare in karachi a compare to other part of the country add that vehicle run on compress natural gas cng bukhari say karachi transporter will cut fare when decrease in cng price will be make'

### Initialize and fit CountVectorizer

In [4]:
# Initialize a CountVectorizer with default settings, then learn the
# vocabulary from the preprocessed articles and count terms in a single call.
# `counts` is the resulting sparse matrix of term counts; later cells
# transpose it so that terms become rows and articles become columns.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

### Initialize a TfidfTransformer object

In [5]:
# Initialize a TfidfTransformer object.
# norm=None switches off scikit-learn's default per-document normalization,
# so the resulting values are un-normalized tf-idf scores.
transformer = TfidfTransformer(norm = None)

### Convert counts to tf-idf

In [6]:
# Convert the raw term counts to tf-idf scores: the transformer is fitted on
# `counts` and applied to the same matrix in one call.
tfidf_scores_transformed = transformer.fit_transform(counts)

### Initialize and fit TfidfVectorizer

In [7]:
# Initialize and fit a TfidfVectorizer directly on the preprocessed text,
# producing tf-idf scores without a separate counting step.
# norm=None mirrors the TfidfTransformer configuration used earlier so the
# two score matrices can be compared value-for-value.
# NOTE: this rebinds `vectorizer`, which previously referred to the
# CountVectorizer; any later cell reading `vectorizer` gets this object.
vectorizer = TfidfVectorizer(norm = None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

### Verify if tf-idf scores are equal

In [8]:
# Compare the two tf-idf matrices element-wise (within floating-point
# tolerance) after densifying them, then report the verdict as a one-row
# DataFrame so it prints in tabular form.
scores_match = np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense())
verdict = 'YES' if scores_match else 'No, something is wrong :('
print(pd.DataFrame({'Are the tf-idf scores the same?': [verdict]}))


  Are the tf-idf scores the same?
0                             YES


In [9]:
# Get the vocabulary of terms learned by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and fall back only on older releases.
# Errors are deliberately NOT swallowed here: a silent failure would leave
# `feature_names` undefined and make every later cell fail without a trace.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Build one column label per article: "Article 1", "Article 2", ...
article_index = [f"Article {i+1}" for i in range(len(articles))]

# Create a pandas DataFrame with word counts. `counts` is transposed so that
# each row is a term and each column is an article.
df_word_counts = pd.DataFrame(counts.T.todense(), index=feature_names, columns=article_index)
print(df_word_counts)

        Article 1  Article 2  Article 3  Article 4  Article 5  Article 6  \
abbasi          0          0          0          1          0          0   
abide           1          0          0          0          0          0   
about           0          0          0          0          0          0   
accord          0          0          1          0          0          0   
add             1          0          0          0          0          0   
...           ...        ...        ...        ...        ...        ...   
world           0          0          0          0          0          3   
would           0          0          0          1          0          0   
year            0          1          0          0          0          0   
yi              0          0          0          0          0          0   
yuan            0          0          0          0          0          0   

        Article 7  Article 8  Article 9  Article 10  
abbasi          0          0     

In [10]:
# Create a pandas DataFrame with the tf-idf scores produced by the
# TfidfTransformer. The score matrix is transposed so that rows are terms
# (`feature_names`) and columns are articles (`article_index`).
# No try/except here: if an earlier cell failed to define its inputs, the
# error should surface in this cell rather than being silently discarded.
df_tf_idf = pd.DataFrame(tfidf_scores_transformed.T.todense(), index=feature_names, columns=article_index)
print(df_tf_idf)

        Article 1  Article 2  Article 3  Article 4  Article 5  Article 6  \
abbasi   0.000000   0.000000   0.000000   2.704748        0.0   0.000000   
abide    2.704748   0.000000   0.000000   0.000000        0.0   0.000000   
about    0.000000   0.000000   0.000000   0.000000        0.0   0.000000   
accord   0.000000   0.000000   2.704748   0.000000        0.0   0.000000   
add      2.299283   0.000000   0.000000   0.000000        0.0   0.000000   
...           ...        ...        ...        ...        ...        ...   
world    0.000000   0.000000   0.000000   0.000000        0.0   8.114244   
would    0.000000   0.000000   0.000000   2.299283        0.0   0.000000   
year     0.000000   2.704748   0.000000   0.000000        0.0   0.000000   
yi       0.000000   0.000000   0.000000   0.000000        0.0   0.000000   
yuan     0.000000   0.000000   0.000000   0.000000        0.0   0.000000   

        Article 7  Article 8  Article 9  Article 10  
abbasi   0.000000        0.0   0.

In [11]:
# Create a pandas DataFrame with the tf-idf scores produced directly by the
# TfidfVectorizer, laid out with terms as rows and articles as columns.
# NOTE: this rebinds `df_tf_idf`; cells that read `df_tf_idf` afterwards see
# the TfidfVectorizer-based scores.
# No try/except here: a failure should be visible instead of leaving
# `df_tf_idf` stale or undefined for the cells that follow.
df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=article_index)
print(df_tf_idf)

        Article 1  Article 2  Article 3  Article 4  Article 5  Article 6  \
abbasi   0.000000   0.000000   0.000000   2.704748        0.0   0.000000   
abide    2.704748   0.000000   0.000000   0.000000        0.0   0.000000   
about    0.000000   0.000000   0.000000   0.000000        0.0   0.000000   
accord   0.000000   0.000000   2.704748   0.000000        0.0   0.000000   
add      2.299283   0.000000   0.000000   0.000000        0.0   0.000000   
...           ...        ...        ...        ...        ...        ...   
world    0.000000   0.000000   0.000000   0.000000        0.0   8.114244   
would    0.000000   0.000000   0.000000   2.299283        0.0   0.000000   
year     0.000000   2.704748   0.000000   0.000000        0.0   0.000000   
yi       0.000000   0.000000   0.000000   0.000000        0.0   0.000000   
yuan     0.000000   0.000000   0.000000   0.000000        0.0   0.000000   

        Article 7  Article 8  Article 9  Article 10  
abbasi   0.000000        0.0   0.

In [12]:
# Get the highest scoring tf-idf term for each article.
# Iterate over the DataFrame's own columns instead of a hard-coded
# range(1, 11), so the cell keeps working when the number of articles changes.
# Selecting with a one-element list keeps the result a one-column DataFrame,
# so `idxmax()` prints as a labelled Series (article label -> top term).
for article_label in df_tf_idf.columns:
    print(df_tf_idf[[article_label]].idxmax())

Article 1    fare
dtype: object
Article 2    hong
dtype: object
Article 3    sugar
dtype: object
Article 4    petrol
dtype: object
Article 5    engine
dtype: object
Article 6    australia
dtype: object
Article 7    car
dtype: object
Article 8    railway
dtype: object
Article 9    cabinet
dtype: object
Article 10    china
dtype: object
