##### import statements

In [2]:
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

##### read input csv data

In [3]:
df = pd.read_csv( "data/input/us_financial_news_data.csv")

##### first 5 rows of data

In [4]:
df.head()

Unnamed: 0,index,published_date,source_name,title
0,0,2017-12-07T06:58:00.000+02:00,wsj.com,This entrepreneur is ringing up sales restorin...
1,1,2017-12-07T22:36:00.000+02:00,cnbc.com,Mexican official disputes reports of tainted a...
2,2,2017-12-07T22:45:00.000+02:00,cnbc.com,Saudi prince has history of extravagant impuls...
3,3,2017-12-08T02:00:00.000+02:00,fortune.com,Golden Globes Predictions for Netflix's The Cr...
4,4,2017-12-08T02:00:00.000+02:00,fortune.com,Bitcoin: Peter Thiel's Founders Fund Goes Big ...


##### data description

In [5]:
df.describe()

Unnamed: 0,index
count,306242.0
mean,153120.5
std,88404.594906
min,0.0
25%,76560.25
50%,153120.5
75%,229680.75
max,306241.0


##### data sources

In [6]:
df[ "source_name"].unique()

array(['wsj.com', 'cnbc.com', 'fortune.com', 'reuters.com'], dtype=object)

##### count of rows of each news source

In [7]:
count_by_source = df.groupby( "source_name").count().iloc[:, 0]

count = count_by_source.sum()

for s in count_by_source.index:
	print( f"{ s}: { count_by_source[ s]} ({ ( count_by_source[ s] / count * 100):.2f}%)")

cnbc.com: 85197 (27.82%)
fortune.com: 5737 (1.87%)
reuters.com: 197514 (64.50%)
wsj.com: 17794 (5.81%)


##### detect missing data

In [8]:
df[ df.isnull().any( axis=1)]

Unnamed: 0,index,published_date,source_name,title
257440,257440,2018-05-05T19:02:00.000+03:00,cnbc.com,


##### missing data percentage

In [9]:
df_missing_percentage = df.isnull().sum() / df.shape[ 0] * 100

print( f"missing data percentage:\n{ df_missing_percentage}")

missing data percentage:
index             0.000000
published_date    0.000000
source_name       0.000000
title             0.000327
dtype: float64


##### filling missing data

In [4]:
df = df.fillna( "missing")

df.shape

(306242, 4)

#### Features

##### words frequency table

In [5]:
def create_words_frequency( features, feature_names):
	features_df = pd.DataFrame( features)
	features_df.columns = feature_names
	sorted_features = features_df.sum( axis=0).sort_values( ascending=False)
	sorted_faetures = sorted_features.reset_index()
	sorted_features.columns = [ "Top Words", "Counts"]
	return sorted_features

##### stop words

In [6]:
more_stop_words = [
    "ap1", "00", "000", "0", "561", "190", "09", "24", "2017","2018", "000 00", "2018",
		"jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec",
		"ag", "ap3", "000 00 ap3", "ap2", "00 ap2", "000 00 ap2", "10", "00 ap1",
		"000 00 ap1", "oct 2018", "000 000", "000 000 00", "october 2018", "10 2018",
		"11 2018", "november 2018", "12 2018", "december 2018"
]

my_stop_words = text.ENGLISH_STOP_WORDS.union( more_stop_words)

##### TF-IDF: Term Frequency–Inverse Document Frequency

> a measure of importance of a word to a document in a collection or corpus, adjusted for the fact that some words appear more frequently in general.

![Formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/dd4f8a91dd0d28a11c00c94a13a315a5b49a8070)

In [13]:
def create_tfidf( df, feature_column, max_feature_size):
	tfidf_vec = TfidfVectorizer(
		sublinear_tf=True,
		min_df=2,
		norm="l2",
		encoding="latin-1",
		ngram_range=( 1, 3),
		stop_words=list( my_stop_words),
		max_features=max_feature_size
	)

	features = tfidf_vec.fit_transform( df[ feature_column]).toarray()

	return features, tfidf_vec

In [14]:
features_tfidf, tfidf_vec = create_tfidf( df, feature_column="title", max_feature_size=5000)



In [15]:
features_tfidf_names = tfidf_vec.get_feature_names_out()

features_tfidf_names

array(['01', '02', '03', ..., 'zte', 'zuckerberg', 'zuma'], dtype=object)

In [16]:
tfidf_sorted_table = create_words_frequency( features_tfidf, features_tfidf_names)

tfidf_sorted_table.head( 20)

brief        8693.242813
says         5021.040697
announces    3783.467672
quarter      3085.344876
new          3005.418400
results      2962.650926
reports      2771.693482
trump        2735.298792
china        2561.690213
update       2413.968231
mln          2348.799917
share        2331.718249
year         2261.975542
profit       2242.533916
financial    2215.760903
earnings     2160.169661
ceo          2140.329696
million      2089.348005
group        2037.012636
shares       1733.949403
dtype: float64

##### BOW: Bag Of Words

> a model of text represented as an unordered collection of words, the frequency (occurrence) of each word is used as a feature for training a classifier.

In [7]:
def create_bow( df, feature_column, max_feature_size):
    counter_vec = CountVectorizer(
        encoding="latin-1",
        ngram_range=( 1, 3),
        stop_words=list( my_stop_words),
        max_features=max_feature_size
    )

    features = counter_vec.fit_transform( df[ feature_column]).toarray()

    return features, counter_vec

In [8]:
features_bow, bow_vec = create_bow( df, feature_column="title", max_feature_size=5000)



In [9]:
features_bow_names = bow_vec.get_feature_names_out()

features_bow_names

array(['01', '02', '03', ..., 'zte', 'zuckerberg', 'zuma'], dtype=object)

In [10]:
bow_sorted_table = create_words_frequency( features_bow, features_bow_names)

bow_sorted_table.head( 20)

brief        59666
says         25117
announces    18867
quarter      15846
results      14257
new          12753
reports      12736
mln          11039
share        10841
year         10624
trump        10540
china        10309
profit       10290
financial    10081
update        9944
earnings      9363
million       8698
group         8331
ceo           7521
pct           7388
dtype: int64