In [2]:
import pandas as pd
import numpy as np

News headlines and short description with classification

Making balanced dataset

In [22]:
news = pd.read_json('../data/news/News_Category_Dataset_v2.json', lines=True)
news.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


In [23]:
news.shape

(200853, 6)

In [24]:
news = news[news['short_description'].notna()]

In [25]:
news_cleaned = pd.DataFrame(columns=['text', 'finance'])

In [26]:
news_cleaned.shape

(0, 2)

Removing duplicates, beacuse they do not contain useful information for classification by category

In [27]:
vc = news['short_description'].value_counts()
to_drop = vc[vc > 1].index.tolist()
news = news[~news['short_description'].isin(to_drop)]

In [28]:
news.shape

(177438, 6)

Taking rows from other categories proprtionally so that |finance| == |non-finance|

In [29]:
other_categories = news['category'].value_counts()
finance_categories = other_categories.loc[['BUSINESS', 'MONEY']]
other_categories = other_categories.drop(['BUSINESS', 'MONEY'])
proportion = sum(finance_categories) / sum(other_categories)
int_vector = np.vectorize(int)
other_categories = np.round(other_categories * proportion).astype(int)

In [30]:
for index, num in other_categories.items():
    sample = news[news['category'] == index].sample(num)
    news_cleaned = news_cleaned.append(pd.DataFrame(list(zip(sample['short_description'], np.zeros(num))), columns=['text', 'finance']))

In [31]:
news_cleaned.shape

(6750, 2)

In [32]:
for index, num in finance_categories.items():
    sample = news[news['category'] == index]
    news_cleaned = news_cleaned.append(pd.DataFrame(list(zip(sample['short_description'], np.ones(num))), columns=['text', 'finance']))

In [33]:
import re
import string

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

for i, row in news_cleaned.iterrows():
    news_cleaned.loc[i, 'text'] = row['text'].translate(remove_punct_map).lower()

In [34]:
news_cleaned.iloc[0]['text']

'we are a generation of maximizers and its both a blessing and a curse sure we can identify what we like online on our phones and in store but liking a product simply isnt enough to make us buy it'

In [35]:
news_cleaned.to_csv('../data/news/news_cleaned_balanced.csv', index=False)

Making imbalanced dataset

In [44]:
news_cleaned = pd.DataFrame(columns=['text', 'finance'])

In [45]:
other_categories = news['category'].unique()
other_categories = other_categories[(other_categories != 'BUSINESS') & (other_categories !='MONEY')]
finance_categories = np.array(['BUSINESS', 'MONEY'])

In [46]:
for cat in other_categories:
    sample = news[news['category'] == cat].sample(frac=0.2)
    num = sample.shape[0]
    news_cleaned = news_cleaned.append(pd.DataFrame(list(zip(sample['short_description'], np.zeros(num))), columns=['text', 'finance']))

In [47]:
for cat in finance_categories:
    sample = news[news['category'] == cat]
    num = sample.shape[0]
    news_cleaned = news_cleaned.append(pd.DataFrame(list(zip(sample['short_description'], np.ones(num))), columns=['text', 'finance']))
    

In [49]:
news_cleaned.shape

(40888, 2)

In [50]:
for i, row in news_cleaned.iterrows():
    news_cleaned.loc[i, 'text'] = row['text'].translate(remove_punct_map).lower()

In [51]:
news_cleaned.iloc[0]['text']

'billionaire iac chairman barry diller made the comment to politico'

In [52]:
news_cleaned.to_csv('../data/news/news_cleaned.csv', index=False)

Preparing financial tweets dataset for cluster analysis and keywords extraction

In [53]:
stockerbot_data = pd.read_csv('../data/fin_news/stockerbot-export1.csv')
stockerbot_data.head()

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1.0197e+18,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1.01971e+18,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1.01971e+18,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1.01972e+18,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1.01972e+18,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


In [54]:
cleaned_fin_tweetes_data = pd.read_csv('../data/fin_news/tweet_sentiment.csv')

In [55]:
cleaned_fin_tweetes_data.head()

Unnamed: 0,cleaned_tweets,sentiment
0,video offic mind busi david solomon tell gs in...,0
1,price lumber lb f sinc hit ytd high maci turna...,0
2,say american dream dead,-1
3,barri silbert extrem optimist bitcoin predict ...,1
4,satellit avoid attack space junk circl earth paid,-1


In [56]:
cleaned_fin_tweetes_data = cleaned_fin_tweetes_data.drop(['sentiment'], axis=1)

In [57]:
cleaned_fin_tweetes_data.to_csv('../data/fin_news/fin_tweets_no_sent.csv')

In [58]:
stockerbot_data_cleaned = pd.DataFrame(stockerbot_data['text'], columns=['text'])

In [83]:
# finally i decided to write a function for cleaning a string from punctuation, numbers and links

remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
def clean_string(s):
    s = re.sub(r'\@\w+|\@', ' ', s)
    s = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', s)
    s = re.sub(r'[0-9]+', ' ', s)
    s = s.translate(remove_punct_map).lower()
    s = re.sub(' +', ' ', s)
    return s
    
    
print(clean_string('Rinkuskiai \'s beer sales fell by 6.5 per cent ... @fdtd hgg'))

rinkuskiai s beer sales fell by per cent hgg


In [84]:
stockerbot_data_cleaned['text'] = stockerbot_data_cleaned['text'].apply(clean_string)

In [85]:
stockerbot_data_cleaned.iloc[0]['text']

'video “i was in my office i was minding my own business” –david solomon tells gs interns how he learned he wa… '

In [86]:
stockerbot_data_cleaned.to_csv('../data/fin_news/stockerbot_data_cleaned.csv')

Financial tweets dataset

In [87]:
fin_tweets = pd.read_csv('../data/financial_tweets/all-data.csv', header=None, usecols=[0,1], names=['sentiment', 'text'], encoding = "ISO-8859-1")
fin_tweets

Unnamed: 0,sentiment,text
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...
...,...,...
4841,negative,LONDON MarketWatch -- Share prices ended lower...
4842,neutral,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,negative,Operating profit fell to EUR 35.4 mn from EUR ...
4844,negative,Net sales of the Paper segment decreased to EU...


In [88]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(['negative', 'neutral', 'positive'])
fin_tweets['sentiment'] = le.transform(fin_tweets['sentiment']) - 1
fin_tweets

Unnamed: 0,sentiment,text
0,0,"According to Gran , the company has no plans t..."
1,0,Technopolis plans to develop in stages an area...
2,-1,The international electronic industry company ...
3,1,With the new production plant the company woul...
4,1,According to the company 's updated strategy f...
...,...,...
4841,-1,LONDON MarketWatch -- Share prices ended lower...
4842,0,Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843,-1,Operating profit fell to EUR 35.4 mn from EUR ...
4844,-1,Net sales of the Paper segment decreased to EU...


In [89]:
fin_tweets['text'] = fin_tweets['text'].apply(clean_string)

In [90]:
fin_tweets

Unnamed: 0,sentiment,text
0,0,according to gran the company has no plans to ...
1,0,technopolis plans to develop in stages an area...
2,-1,the international electronic industry company ...
3,1,with the new production plant the company woul...
4,1,according to the company s updated strategy fo...
...,...,...
4841,-1,london marketwatch share prices ended lower in...
4842,0,rinkuskiai s beer sales fell by per cent to mi...
4843,-1,operating profit fell to eur mn from eur mn in...
4844,-1,net sales of the paper segment decreased to eu...


In [91]:
fin_tweets.to_csv('../data/financial_tweets/fin_tweets_text_setiment.csv')

In [92]:
pd.DataFrame(fin_tweets['text'], columns=['text']).to_csv('../data/financial_tweets/fin_tweets_text.csv')

Twitter and Reddit sentiment dataset

In [69]:
reddit_data = pd.read_csv('../data/twitter-reddit/reddit_data_sentiment.csv')
reddit_data

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37244,jesus,0
37245,kya bhai pure saal chutiya banaya modi aur jab...,1
37246,downvote karna tha par upvote hogaya,0
37247,haha nice,1


In [70]:
reddit_data['clean_comment'].to_frame('text').to_csv('../data/twitter-reddit/reddit_text.csv', index=False)

In [71]:
twitter_data = pd.read_csv('../data/twitter-reddit/twitter_data_sentiment.csv')
twitter_data

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [72]:
twitter_data['clean_text'].to_frame('text').to_csv('../data/twitter-reddit/twitter_text.csv')

Combining financial texts datasets in one for keyword extraction

In [93]:
fin_tweets_1 = pd.read_csv('../data/fin_news/stockerbot_data_cleaned.csv')
fin_tweets_2 = pd.read_csv('../data/financial_tweets/fin_tweets_text.csv')

fin_tweets_1 = fin_tweets_1.append(fin_tweets_2)

In [94]:
fin_tweets_1.to_csv('../data/prepared/financial_texts_keywords.csv')

Combining texts for cluster analysis

In [75]:
tw = pd.read_csv('../data/twitter-reddit/twitter_text.csv')
red = pd.read_csv('../data/twitter-reddit/reddit_text.csv')
cluster_data = tw.append(red)
cluster_data.shape

(200111, 2)

In [191]:
fin_tweets_1.shape

(33286, 2)

In [76]:
cluster_data = cluster_data.append(fin_tweets_1)
cluster_data.to_csv('../data/prepared/cluster_data.csv')

Removing duplicates from news dataset