# Preprocessing of data

In [161]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from tqdm import tqdm

In [38]:
# Locations of the raw input files, relative to the notebook's working directory.
# NEWS_FILE is the tweet export, STOCK_FILE the index quotes.
NEWS_FILE = '../../data/raw/investments_news_tweet.csv'
STOCK_FILE = '../../data/raw/cac40_2010_2021.csv'

In [39]:
# Load the raw quotes. Every column is read as a string (object dtype) and is
# converted to a proper type in the cells that follow.
stock_df = pd.read_csv(STOCK_FILE)

In [40]:
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3019 entries, 0 to 3018
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      3019 non-null   object
 1   Price     3019 non-null   object
 2   Open      3019 non-null   object
 3   High      3019 non-null   object
 4   Low       3019 non-null   object
 5   Vol.      3019 non-null   object
 6   Change %  3019 non-null   object
dtypes: object(7)
memory usage: 165.2+ KB


In [41]:
# Parse the textual dates (format '%b %d, %Y') into datetime64 with a single
# vectorised call. Passing the format to pd.to_datetime replaces the per-row
# strptime plus the second conversion, and a column that is already datetime64
# is passed through unchanged, so the cell can safely be re-run.
stock_df['Date'] = pd.to_datetime(stock_df['Date'], format='%b %d, %Y')

In [42]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84.11M,0.63%
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71.37M,1.33%
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75.94M,0.75%
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70.00M,-0.34%
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68.97M,0.16%
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131.78M,0.51%
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118.43M,0.18%
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110.09M,0.12%
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129.52M,-0.03%


In [43]:
# Multipliers for the magnitude suffix of an abbreviated volume string.
_VOLUME_MULTIPLIERS = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _volume_to_float(raw):
    """Convert an abbreviated volume such as '84.11M' into a plain float.

    Handles the K / M / B suffixes, thousands separators, values without a
    suffix, the '-' / empty placeholders (returned as NaN) and values that are
    already numeric (so re-running the cell is harmless).
    """
    if not isinstance(raw, str):
        return float(raw)
    text = raw.strip().replace(',', '')
    if text in ('', '-'):
        return float('nan')
    factor = _VOLUME_MULTIPLIERS.get(text[-1].upper())
    if factor is None:
        # No recognised suffix: the whole string is the number.
        return float(text)
    return float(text[:-1]) * factor


# Expand the abbreviated volumes instead of assuming every value ends in 'M'.
stock_df['Vol.'] = stock_df['Vol.'].map(_volume_to_float)

In [44]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63%
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33%
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75%
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34%
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16%
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51%
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18%
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12%
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03%


In [45]:
# Cut off the last character (the '%' sign) and keep the change as a float.
stock_df['Change %'] = stock_df['Change %'].map(lambda pct: float(pct[:-1]))

In [46]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03


In [47]:
# Remove the thousands separators from 'Price' and convert the text to float.
stock_df['Price'] = stock_df['Price'].map(lambda quote: float(quote.replace(',', '')))

In [48]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03


In [49]:
# Remove the thousands separators from 'Open' and convert the text to float.
stock_df['Open'] = stock_df['Open'].map(lambda quote: float(quote.replace(',', '')))

In [50]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03


In [51]:
# Remove the thousands separators from 'High' and convert the text to float.
stock_df['High'] = stock_df['High'].map(lambda quote: float(quote.replace(',', '')))

In [52]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03


In [53]:
# Remove the thousands separators from 'Low' and convert the text to float.
stock_df['Low'] = stock_df['Low'].map(lambda quote: float(quote.replace(',', '')))

In [54]:
stock_df

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2021-10-15,6727.52,6714.26,6746.81,6703.26,84110000.0,0.63
1,2021-10-14,6685.21,6651.18,6692.94,6618.79,71370000.0,1.33
2,2021-10-13,6597.38,6539.61,6606.36,6510.27,75940000.0,0.75
3,2021-10-12,6548.11,6495.15,6548.11,6491.58,70000000.0,-0.34
4,2021-10-11,6570.54,6546.07,6574.22,6522.55,68970000.0,0.16
...,...,...,...,...,...,...,...
3014,2010-01-08,4045.14,4042.64,4051.41,4013.34,131780000.0,0.51
3015,2010-01-07,4024.80,4005.96,4032.24,3983.90,118430000.0,0.18
3016,2010-01-06,4017.67,4014.45,4022.94,3996.80,110090000.0,0.12
3017,2010-01-05,4012.91,4012.43,4028.34,3993.33,129520000.0,-0.03


In [55]:
stock_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3019 entries, 0 to 3018
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      3019 non-null   datetime64[ns]
 1   Price     3019 non-null   float64       
 2   Open      3019 non-null   float64       
 3   High      3019 non-null   float64       
 4   Low       3019 non-null   float64       
 5   Vol.      3019 non-null   float64       
 6   Change %  3019 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory usage: 165.2 KB


In [56]:
# Persist the converted quotes. With the default index=True the RangeIndex is
# written too, as an unnamed first column of the CSV.
stock_df.to_csv('../../data/processed/stocks.csv')

## Tweets

In [124]:
# Load the raw tweet export.
# NOTE(review): the last run of this cell emitted a warning (its text is
# truncated in the saved output; presumably a mixed-dtype warning from
# read_csv) — confirm, and consider passing explicit dtypes.
news_df = pd.read_csv(NEWS_FILE)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [125]:
news_df.head()

Unnamed: 0.1,Unnamed: 0,_type,url,date,content,renderedContent,id,user,replyCount,retweetCount,...,media,retweetedTweet,quotedTweet,inReplyToTweetId,inReplyToUser,mentionedUsers,coordinates,place,hashtags,cashtags
0,0,snscrape.modules.twitter.Tweet,https://twitter.com/InvestirFr/status/14490615...,2021-10-15T17:16:09+00:00,40.000 nouveaux actionnaires individuels https...,40.000 nouveaux actionnaires individuels trib....,1449061547381334020,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,0,...,,,,,,,,,,
1,1,snscrape.modules.twitter.Tweet,https://twitter.com/InvestirFr/status/14490499...,2021-10-15T16:30:06+00:00,[Agenda] Retrouvez les grands rendez-vous de l...,[Agenda] Retrouvez les grands rendez-vous de l...,1449049956275429384,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,0,...,,,,,,,,,,
2,2,snscrape.modules.twitter.Tweet,https://twitter.com/InvestirFr/status/14490451...,2021-10-15T16:10:49+00:00,"Clôture : encourageant, le début de la saison ...","Clôture : encourageant, le début de la saison ...",1449045103218249736,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,1,...,,,,,,,,,,
3,3,snscrape.modules.twitter.Tweet,https://twitter.com/InvestirFr/status/14490197...,2021-10-15T14:30:15+00:00,"Après plusieurs semaines de rumeurs, la minist...","Après plusieurs semaines de rumeurs, la minist...",1449019797627363329,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,0,...,,,,,,,,,,
4,4,snscrape.modules.twitter.Tweet,https://twitter.com/InvestirFr/status/14490184...,2021-10-15T14:25:00+00:00,L’action Pizzorno Environnement grimpe désorma...,L’action Pizzorno Environnement grimpe désorma...,1449018475578593281,"{'_type': 'snscrape.modules.twitter.User', 'us...",0,0,...,,,,,,,,,,


In [126]:
news_df.columns

Index(['Unnamed: 0', '_type', 'url', 'date', 'content', 'renderedContent',
       'id', 'user', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'conversationId', 'lang', 'source', 'sourceUrl', 'sourceLabel',
       'outlinks', 'tcooutlinks', 'media', 'retweetedTweet', 'quotedTweet',
       'inReplyToTweetId', 'inReplyToUser', 'mentionedUsers', 'coordinates',
       'place', 'hashtags', 'cashtags'],
      dtype='object')

In [127]:
# Keep the timestamp, both text variants and the four engagement counters.
news_df = news_df[
    [
        'date',
        'content',
        'renderedContent',
        'replyCount',
        'retweetCount',
        'likeCount',
        'quoteCount',
    ]
]

In [128]:
news_df

Unnamed: 0,date,content,renderedContent,replyCount,retweetCount,likeCount,quoteCount
0,2021-10-15T17:16:09+00:00,40.000 nouveaux actionnaires individuels https...,40.000 nouveaux actionnaires individuels trib....,0,0,3,0
1,2021-10-15T16:30:06+00:00,[Agenda] Retrouvez les grands rendez-vous de l...,[Agenda] Retrouvez les grands rendez-vous de l...,0,0,0,0
2,2021-10-15T16:10:49+00:00,"Clôture : encourageant, le début de la saison ...","Clôture : encourageant, le début de la saison ...",0,1,2,0
3,2021-10-15T14:30:15+00:00,"Après plusieurs semaines de rumeurs, la minist...","Après plusieurs semaines de rumeurs, la minist...",0,0,3,0
4,2021-10-15T14:25:00+00:00,L’action Pizzorno Environnement grimpe désorma...,L’action Pizzorno Environnement grimpe désorma...,0,0,1,0
...,...,...,...,...,...,...,...
57992,2010-01-19T17:05:18+00:00,"Paris : Le Cac 40 termine en hausse de 0,81 % ...","Paris : Le Cac 40 termine en hausse de 0,81 % ...",0,0,0,0
57993,2010-01-19T16:36:04+00:00,Or : Léger repli de l'once: http://bit.ly/6h5mX0,Or : Léger repli de l'once: http://bit.ly/6h5mX0,0,0,0,0
57994,2010-01-19T15:05:29+00:00,Analyses et opinions : Nos engagements pour 20...,Analyses et opinions : Nos engagements pour 20...,0,0,0,0
57995,2010-01-19T14:34:52+00:00,Paris : Le Cac 40 reprend son souffle après un...,Paris : Le Cac 40 reprend son souffle après un...,0,0,0,0


In [129]:
# Multipliers for the magnitude suffix of an abbreviated volume string.
_VOLUME_SUFFIXES = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _parse_volume(raw):
    """Convert an abbreviated volume such as '84.11M' into a float (NaN for '-' or '')."""
    if not isinstance(raw, str):
        return float(raw)
    text = raw.strip().replace(',', '')
    if text in ('', '-'):
        return float('nan')
    factor = _VOLUME_SUFFIXES.get(text[-1].upper())
    if factor is None:
        # No recognised suffix: the whole string is the number.
        return float(text)
    return float(text[:-1]) * factor


def _parse_percent(raw):
    """Convert a percentage string such as '-0.34%' into a float (-0.34)."""
    if not isinstance(raw, str):
        return float(raw)
    return float(raw.strip().rstrip('%').replace(',', ''))


def _parse_number(raw):
    """Convert a number with thousands separators such as '6,727.52' into a float."""
    if not isinstance(raw, str):
        return float(raw)
    return float(raw.replace(',', ''))


def clean_stock_data(stock_df):
    """Return a typed copy of the raw quotes frame.

    Parameters
    ----------
    stock_df : pandas.DataFrame
        Frame with the columns 'Date', 'Price', 'Open', 'High', 'Low',
        'Vol.' and 'Change %', as read from the raw CSV (all strings).

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'Date' is datetime64 and the six other columns
        are floats. The input frame is left untouched, so the function can be
        called again on the same raw data.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a value cannot be parsed.
    """
    cleaned = stock_df.copy()

    # Dates are formatted as '%b %d, %Y'; one vectorised call is enough.
    cleaned['Date'] = pd.to_datetime(cleaned['Date'], format='%b %d, %Y')

    # Volumes carry a K / M / B magnitude suffix; do not assume it is always 'M'.
    cleaned['Vol.'] = cleaned['Vol.'].map(_parse_volume)

    cleaned['Change %'] = cleaned['Change %'].map(_parse_percent)

    for column in ('Price', 'Open', 'High', 'Low'):
        cleaned[column] = cleaned[column].map(_parse_number)

    return cleaned

In [130]:
def remove_unused_columns(news_data):
    """Return a copy of ``news_data`` limited to the timestamp, text and engagement columns.

    Parameters
    ----------
    news_data : pandas.DataFrame
        Tweet frame containing at least 'date', 'content', 'replyCount',
        'retweetCount', 'likeCount' and 'quoteCount'.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly those six columns, in that order.

    Raises
    ------
    KeyError
        If one of the columns is missing from ``news_data``.
    """
    columns_list = ['date', 'content', 'replyCount', 'retweetCount', 'likeCount', 'quoteCount']
    # .copy() detaches the subset from news_data so that later column
    # assignments on the result do not raise SettingWithCopyWarning.
    return news_data[columns_list].copy()

def clean(text):
    """Normalise a tweet's text.

    The value is converted with ``str()`` (so a missing value such as NaN
    becomes the text 'nan'), lower-cased, and then stripped of:

    * URLs ('http' followed by any run of non-space characters),
    * '...' and '..' runs,
    * parenthesised segments (replaced by a space),
    * one- or two-character words that are directly followed by '_'.

    Finally every run of whitespace is collapsed to a single space. A single
    leading or trailing space may remain in the result.

    Parameters
    ----------
    text : object
        The raw tweet text.

    Returns
    -------
    str
        The normalised text.
    """
    text = str(text).lower()
    # Raw strings keep '\S' and '\s' from being read as (invalid) string escapes.
    text = re.sub(r'http\S+', '', text)
    text = text.replace('...', '')
    text = text.replace('..', '')
    text = re.sub(r'\(([^)]+)\)', " ", text)
    text = re.sub(r' \w{1,2}_', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [136]:
# Narrow the tweet frame to the columns listed in remove_unused_columns
# (this also drops 'renderedContent').
news_df = remove_unused_columns(news_df)

In [137]:
# Preview of the cleaned tweet text.
# NOTE(review): the result is only displayed, never assigned, so
# news_df['content'] keeps the raw text (URLs, original casing) for the rest
# of the notebook — confirm whether that is intended.
news_df['content'].apply(clean)

0                40.000 nouveaux actionnaires individuels 
1        [agenda] retrouvez les grands rendez-vous de l...
2        clôture : encourageant, le début de la saison ...
3        après plusieurs semaines de rumeurs, la minist...
4        l’action pizzorno environnement grimpe désorma...
                               ...                        
57992    paris : le cac 40 termine en hausse de 0,81 % ...
57993                         or : léger repli de l'once: 
57994    analyses et opinions : nos engagements pour 20...
57995    paris : le cac 40 reprend son souffle après un...
57996          or : l'once peu changée au premier fixing: 
Name: content, Length: 57997, dtype: object

In [138]:
# Parse the timestamp strings into datetimes. assign() builds a new frame
# rather than writing into the column subset taken from the raw frame, which
# avoids pandas' SettingWithCopyWarning.
news_df = news_df.assign(date=pd.to_datetime(news_df['date']))

In [139]:
# Reduce every timestamp to its calendar date, which is the grouping key used
# when the tweets are merged per day.
news_df['date'] = news_df['date'].map(lambda stamp: stamp.date())

In [141]:
# Only the date and the text are needed from here on; the engagement counters are dropped.
news_df = news_df[['date', 'content']]

In [142]:
news_df

Unnamed: 0,date,content
0,2021-10-15,40.000 nouveaux actionnaires individuels https...
1,2021-10-15,[Agenda] Retrouvez les grands rendez-vous de l...
2,2021-10-15,"Clôture : encourageant, le début de la saison ..."
3,2021-10-15,"Après plusieurs semaines de rumeurs, la minist..."
4,2021-10-15,L’action Pizzorno Environnement grimpe désorma...
...,...,...
57992,2010-01-19,"Paris : Le Cac 40 termine en hausse de 0,81 % ..."
57993,2010-01-19,Or : Léger repli de l'once: http://bit.ly/6h5mX0
57994,2010-01-19,Analyses et opinions : Nos engagements pour 20...
57995,2010-01-19,Paris : Le Cac 40 reprend son souffle après un...


In [146]:
def _join_daily_tweets(daily_tweets):
    """Concatenate the tweets of one day into a single string, separated by '.'."""
    return '.'.join(daily_tweets.tolist())


# One row per calendar date, holding all of that day's tweets as one text.
news_date = news_df.groupby('date')['content'].agg(_join_daily_tweets)

In [147]:
news_date

date
2010-01-19    Paris Clôture : Le Cac 40 repasse au-dessus de...
2010-01-20    Paris Clôture : Les financières et les crainte...
2010-01-21    Paris Clôture : Nouvelle séance de consolidati...
2010-01-22    Rétro de la semaine : Le Cac 40 à la peine, pl...
2010-01-25    Paris Clôture : Paris au plus bas depuis plus ...
                                    ...                        
2021-10-11    Clôture : échaudés par la hausse des prix des ...
2021-10-12    Clôture : le scénario est identique depuis plu...
2021-10-13    Les informations selon lesquelles Volkswagen e...
2021-10-14    BOUYGUES : Bouygues : objet boursier mal ident...
2021-10-15    40.000 nouveaux actionnaires individuels https...
Name: content, Length: 3542, dtype: object

In [149]:
# Save the per-day text. The 'date' group keys form the index and are written
# as the first CSV column.
news_date.to_csv('../../data/processed/news_date.csv')

In [152]:
# Keep the 'Date' and 'Price' columns only, renaming 'Date' to 'date' so the
# column name matches the one used in the tweet frame.
stock_df = stock_df[['Date', 'Price']].rename(columns={'Date': 'date'})

In [153]:
stock_df

Unnamed: 0,date,Price
0,2021-10-15,6727.52
1,2021-10-14,6685.21
2,2021-10-13,6597.38
3,2021-10-12,6548.11
4,2021-10-11,6570.54
...,...,...
3014,2010-01-08,4045.14
3015,2010-01-07,4024.80
3016,2010-01-06,4017.67
3017,2010-01-05,4012.91


In [154]:
from vaderSentiment_fr.vaderSentiment import SentimentIntensityAnalyzer




In [159]:
# Extra finance vocabulary (word -> valence) merged into the analyzer's lexicon below.
# NOTE(review): the stock VADER lexicon uses valences of roughly -4..+4, so
#   +/-9 will presumably dominate every other word — confirm this is intended.
# NOTE(review): some entries look mis-signed or misspelled ('élevation',
#   'tremplin' and 'hikes' are all negative) — verify against the intended meaning.
# NOTE(review): the multi-word keys ('perte record', 'hausse du cac 40', ...)
#   presumably never match if the analyzer looks words up one token at a time — verify.
new_words = {'chute': -9, 'tomber': -9, 'laisser': -9, 'descendre': -9, 'monter': 9, 'hausse': 9, 'gain': 9, 'gagner': 9,
             'élevation': -9, 'baisse': -9, 'tremplin': -9, 'refuser': -9, 'déclin': -9, 'saut': 9, 'grimper': 9,
            'rabattre': -9, 'hikes': -9, 'jumps': 9, 'perdre': -9, 'profit': 9, 'perte': -9, 'détruit': -9, 'vendre': -9, 'acheter': 9,
             'récession': -9,
'affaiblissement': -9, 'perte record': -9, 'hausse record': 9, 'hausse du cac 40': 9, 'cac 40 chute': -9, 'euro chute': -9, 'montée de l euro': 9}


# Build the analyzer and extend its lexicon with the custom entries above
# (existing entries with the same key are overwritten by dict.update).
SIA = SentimentIntensityAnalyzer()

SIA.lexicon.update(new_words)

In [157]:
# Smoke test of the analyzer on one sample sentence. polarity_scores returns a
# mapping of scores; its 'compound' entry is the one used for classification.
phrase = "Une phrase très cool à analyser"

score = SIA.polarity_scores(phrase)

In [165]:
# Turn the per-day Series into a one-column ('content') DataFrame so that the
# score columns can be added to it.
# Not re-runnable: once news_date is a DataFrame it no longer has to_frame().
news_date = news_date.to_frame()

In [169]:
# Expose the per-day 'date' index as a regular column and switch to a 0..n-1
# RangeIndex. Resetting with drop=True alone throws the dates away on a fresh
# kernel; they only survive if they were copied into a column beforehand.
if 'date' in news_date.columns:
    # The dates already live in a column: only the index has to be replaced.
    news_date = news_date.reset_index(drop=True)
else:
    news_date = news_date.reset_index()[['content', 'date']]
news_date

Unnamed: 0,content,date
0,Paris Clôture : Le Cac 40 repasse au-dessus de...,2010-01-19
1,Paris Clôture : Les financières et les crainte...,2010-01-20
2,Paris Clôture : Nouvelle séance de consolidati...,2010-01-21
3,"Rétro de la semaine : Le Cac 40 à la peine, pl...",2010-01-22
4,Paris Clôture : Paris au plus bas depuis plus ...,2010-01-25
...,...,...
3537,Clôture : échaudés par la hausse des prix des ...,2021-10-11
3538,Clôture : le scénario est identique depuis plu...,2021-10-12
3539,Les informations selon lesquelles Volkswagen e...,2021-10-13
3540,BOUYGUES : Bouygues : objet boursier mal ident...,2021-10-14


In [170]:
# Score each day's text once and store the compound score. Iterating over the
# column itself avoids mixing index labels with positional iloc lookups, and
# passing the length lets tqdm show real progress.
daily_texts = news_date['content']
compound_scores = [
    SIA.polarity_scores(day_text)['compound']
    for day_text in tqdm(daily_texts, total=len(daily_texts))
]
news_date['score'] = compound_scores

# Non-negative compound score -> 1, negative -> -1. Kept as floats, which is
# presumably what the row-by-row writes produced — TODO confirm against the saved CSV.
news_date['sentiment'] = np.where(news_date['score'] >= 0, 1.0, -1.0)

3542it [00:39, 90.72it/s] 


In [172]:
# Save the scored frame. With the default index=True the row index is written
# as an unnamed first column of the CSV.
news_date.to_csv('../../data/processed/news_classified.csv')