In [314]:
import numpy as np
import pandas as pd
import datetime
from textblob import TextBlob
import nltk

In [38]:
raw_data = pd.read_csv('./data/NLP_Cryptocurrency_Dataset.csv', header = 1, index_col = 0)

  mask |= (ar1 == a)


In [232]:
data = raw_data.copy()

# Pre-processing

## Colonne des dates

- On remarque qu'on a des commentaires qui sont en-dehors de la zone que l'on souhaite prédire (01 mai 2017 au 01 mai 2018).

- On pourra donc se servir des commentaires qui sont en dehors de cette période afin d'entraîner le modèle qie l'on appliquera sur la période qui nous intéresse vraiment.

In [40]:
data

Unnamed: 0,datetime,id,status,activity,merit,post
0,2018-08-24 07:33:34,Kurisuu9,Jr. Member,38.0,0.0,https://cointelegraph.com/news/kaspersky-lab-n...
1,2018-08-22 21:33:03,sejem,Jr. Member,238.0,2.0,"Every few weeks this summer, bitcoin bulls hav..."
2,2018-08-22 22:14:54,DooMAD,Legendary,1736.0,1136.0,Are people really waiting for an ETF? Or does...
3,2018-08-22 22:32:31,Anarchist,Sr. Member,399.0,250.0,Most people don't care about an ETF and don't ...
4,2018-08-22 22:43:13,erickkyut,Hero Member,854.0,506.0,Everyone is waiting again for the ETF approval...
5,2018-08-22 23:31:27,xitrum,Member,308.0,10.0,"Quote from: sejem on August 22, 2018, 09:33:03..."
6,2018-08-22 23:42:35,vgk88,Full Member,420.0,124.0,I'm not surprised that ETF rejected. First you...
7,2018-08-23 01:05:58,flowers5,Member,172.0,10.0,The regulators do not have their regulations i...
8,2018-08-23 02:01:32,diceoption.com,Newbie,28.0,0.0,"Waiting again for the ETF approval, BTC will pum"
9,2018-08-23 02:49:53,reactorjuno,Member,210.0,29.0,I'm sorry but how could we have an Exchange Tr...


# Traitement des données texte

Avec les données textes, on cherchera à établir une matrice TF-IDF. Pour cela :

1- Tout mettre en minuscules,

2- Enlever les stopwords,

3- Enlever les signes de ponctuation,

4- *Lemmatiser* ou *stemmer* les résultats.

Tout ceci devrait nous permettre d'obtenir une matrice TF-IDF un peu plus exploitable.

## Minuscules

In [252]:
data.post = data.post.apply(lambda x: str(x).lower())
data.post

0          https???cointelegraph?com?news?kaspersky lab n...
1          every weeks summer? bitcoin bulls gotten ray h...
2          are people really waiting etf?  or everyone th...
3          most people ?care etf ?expect anything ? only ...
4          everyone waiting etf approval i think delay an...
5          quote ? sejem august 22? 2018? 09?33?03 pm  ev...
6          i?surprised etf rejected? first need eradicate...
7          the regulators regulations yet?  they continue...
8                             waiting etf approval? btc pum 
9          i?sorry could exchange trade fund market like ...
10         quote ? doomad august 22? 2018? 10?14?54 pm  a...
11         no matter many etfs scrutinized reviewed sec r...
12         i agree market becoming resilient etf bullshit...
13         it seems stupid wait etf? this delivery contra...
14         quote ? blockchaingod august 23? 2018? 06?55?3...
15         enough bullshit? we fine without etf approval?...
16         it almost cer

## Signes de ponctuation

## Stopwords
- Stopwords classiques proposés par nltk

- Rajouter les noms d'utilisateurs ?

- Rajouter les trucs qui ressemblent à des dates ? (mois, années, heure, "AM" et "PM")

In [249]:
# Définition de la liste de stop words
from nltk.corpus import stopwords

# First definition of stop words
stop_words = stopwords.words('english')

stop_words += ["quote"]

stop_words += ['january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Adding the ID's (lowered)
id_list = list(data.id.unique())
id_list = [str(name).lower() for name in id_list] # str(name) instead of (name) in case an ID is only numbers

#stop_words += id_list

stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [250]:
# Fonction pour enlever les stop words, que l'on exploite ensuite dans un .apply sur la colonne des posts
def stopword_remover(comment, stopwords):
    
    comment = comment.replace('\r', ' ').replace('\n', ' ')
    comment = ' '.join(word for word in comment.split(' ') if word not in stopwords)
    
    return comment

In [251]:
import re
import string

def stopword_remover_regex(comment, stopwords_list):
    
    comment = str(comment).replace('\r', ' ').replace('\n', ' ')
    reg_pattern = re.compile(r'\b(' + r'|'.join(stopwords_list) + r')\b\s*')
    comment = reg_pattern.sub('', comment)
    
    # Enlever les signes de ponctuation (on remplace par des points d'interrogation)
    table = str.maketrans('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-', '?' * 31 + ' ')
    comment = comment.translate(comment.maketrans(table))
    
    return reg_pattern.sub('', comment)

In [255]:
import time

start_time = time.time()

data.post = data.post.apply(stopword_remover_regex, stopwords_list = stop_words)

print(time.time() - start_time)

307.7529950141907


In [257]:
data.post = data.post.str.replace('?', '')

Possiblement lemmatiser / stemmiser après, mais on peut commencer par partir sur cette base.

## Remarque sur les quotes

- La plupart des commentaires sont une réponse à un autre commentaire ("quote" : le premier commentaire est donc affiché dans la réponse). On a pensé essayer de virer les quotes pour avoir juste le commentaire, sans celui auquel il répond.

- Malheureusement, ça n'a pas l'air d'être possible : pas de séparateur clair et indiscutable entre la quote et le corps du message.

- De toute manière, on peut partir du principe qu'un message qui a été quoté, à plus fotte raison plusieurs fois, aura donc une influence plus importante dans la discussion, et donc qu'il n'est pas scandaleux que cette influence soit reflétée directement dans la matrice TF-IDF.

In [281]:
data = data[data['datetime'].isna()==False]

In [287]:
data['datetime'][0][5:7]

'08'

In [291]:
data

Unnamed: 0,datetime,id,status,activity,merit,post
0,2018-08-24 07:33:34,Kurisuu9,Jr. Member,38.0,0.0,httpscointelegraphcomnewskaspersky lab north k...
1,2018-08-22 21:33:03,sejem,Jr. Member,238.0,2.0,every weeks summer bitcoin bulls gotten ray ho...
2,2018-08-22 22:14:54,DooMAD,Legendary,1736.0,1136.0,people really waiting etf everyone think peop...
3,2018-08-22 22:32:31,Anarchist,Sr. Member,399.0,250.0,people care etf expect anything newbies wanna...
4,2018-08-22 22:43:13,erickkyut,Hero Member,854.0,506.0,everyone waiting etf approval think delay anot...
5,2018-08-22 23:31:27,xitrum,Member,308.0,10.0,sejem 22 2018 093303 pm every weeks summer b...
6,2018-08-22 23:42:35,vgk88,Full Member,420.0,124.0,surprised etf rejected first need eradicate ma...
7,2018-08-23 01:05:58,flowers5,Member,172.0,10.0,regulators regulations yet continue delay eve...
8,2018-08-23 02:01:32,diceoption.com,Newbie,28.0,0.0,waiting etf approval btc pum
9,2018-08-23 02:49:53,reactorjuno,Member,210.0,29.0,sorry could exchange trade fund market like cr...


In [293]:
len(data['datetime'][175454])

16

In [299]:
data['datetime'][175454][6:10]

'2015'

In [294]:
len(data['datetime'][0])

19

In [304]:
def format_date(string):
    if len(string)==16:
        res= datetime(year = int(string[6:10]), month = int(string[3:5]), day = int(string[:2]))
    elif len(string)==19:
        res = datetime(year = int(string[:4]), month = int(string[5:7]), day = int(string[8:10]))
    else:
        res = None
    return res

In [290]:
data[data['datetime'].str.startswith('31/1')]

Unnamed: 0,datetime,id,status,activity,merit,post
175454,31/12/2015 19:48,hamiltino,Hero Member,644.0,500.0,latest size 45gb format compressed zip contain...
175455,31/12/2015 19:50,Meuh6879,Legendary,1512.0,1000.0,official location httpsbitcoinorgbinblock cha...
175456,31/12/2015 19:52,unamis76,Legendary,1414.0,1000.0,thanks work useless current bitcoin core versi...
175457,31/12/2015 19:56,hamiltino,Hero Member,644.0,500.0,official torrent 1 year old 20gb size know peo...
175458,31/12/2015 20:42,OmegaStarScream,Staff,1414.0,1101.0,synchronization network much faster bitcoin la...
182183,31/10/2017 16:39,aisyah88,Hero Member,588.0,500.0,known existence bitcoin films yet present bit...
182184,31/10/2017 16:45,xIIImaL,Legendary,1330.0,1003.0,aisyah88 31 2017 043917 pm known existence bi...
186528,31/10/2016 01:56,genos,Hero Member,574.0,500.0,to1ga 30 2016 064714 pm price rising moment c...
186529,31/10/2016 02:25,drwtsn32,Sr. Member,476.0,250.0,karartma1 30 2016 055731 pm bitcoin dead quee...
186530,31/10/2016 08:59,Darmo Gandoel,Jr. Member,49.0,0.0,start thinking bitcoin dead need exchange bitc...


In [305]:
from datetime import datetime
data['date'] = data['datetime'].map(lambda x : format_date(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [309]:
TimeLowBound= datetime(year = 2016, month = 1, day = 1)

In [311]:
data = data[data['date'] > TimeLowBound]

In [None]:
data['polarity'] = data.post.apply(lambda x: TextBlob(x).sentiment[0])

In [None]:
data['log_merit'] = np.log(data.merit + 1)
data['activity'] = np.log(data.activity + 1)

In [None]:
data['polarity_and_log_merit'] = data.polarity * data.log_merit
data.polarity_and_log_merit = data.polarity_and_log_merit - min(data.polarity_and_log_merit) /
 (max(data.polarity_and_log_merit) - min(data.polarity_and_log_merit)) * 2 - 1

In [313]:
data

Unnamed: 0,datetime,id,status,activity,merit,post,date
0,2018-08-24 07:33:34,Kurisuu9,Jr. Member,38.0,0.0,httpscointelegraphcomnewskaspersky lab north k...,2018-08-24
1,2018-08-22 21:33:03,sejem,Jr. Member,238.0,2.0,every weeks summer bitcoin bulls gotten ray ho...,2018-08-22
2,2018-08-22 22:14:54,DooMAD,Legendary,1736.0,1136.0,people really waiting etf everyone think peop...,2018-08-22
3,2018-08-22 22:32:31,Anarchist,Sr. Member,399.0,250.0,people care etf expect anything newbies wanna...,2018-08-22
4,2018-08-22 22:43:13,erickkyut,Hero Member,854.0,506.0,everyone waiting etf approval think delay anot...,2018-08-22
5,2018-08-22 23:31:27,xitrum,Member,308.0,10.0,sejem 22 2018 093303 pm every weeks summer b...,2018-08-22
6,2018-08-22 23:42:35,vgk88,Full Member,420.0,124.0,surprised etf rejected first need eradicate ma...,2018-08-22
7,2018-08-23 01:05:58,flowers5,Member,172.0,10.0,regulators regulations yet continue delay eve...,2018-08-23
8,2018-08-23 02:01:32,diceoption.com,Newbie,28.0,0.0,waiting etf approval btc pum,2018-08-23
9,2018-08-23 02:49:53,reactorjuno,Member,210.0,29.0,sorry could exchange trade fund market like cr...,2018-08-23


In [303]:
data[data['date'].isnull()==False]

Unnamed: 0,datetime,id,status,activity,merit,post,date


In [120]:
data.post

0          https://cointelegraph.com/news/kaspersky-lab-n...
1          every weeks summer, bitcoin bulls gotten ray h...
2          people really waiting etf?  everyone think peo...
3          people care etf expect anything . newbies wann...
4          everyone waiting etf approval think delay anot...
5          : 22, 2018, 09:33:03 pm  every weeks summer, b...
6          'surprised etf rejected. first need eradicate ...
7          regulators regulations yet.  continue delay ev...
8                             waiting etf approval, btc pum 
9          'sorry could exchange trade fund market like c...
10         : 22, 2018, 10:14:54 pm  people really waiting...
11         matter many etfs scrutinized reviewed sec regu...
12         agree market becoming resilient etf bullshit n...
13         seems stupid wait etf. delivery contract, fool...
14         : 23, 2018, 06:55:31  seems stupid wait etf. d...
15         enough bullshit. fine without etf approval. mu...
16         almost certai

In [37]:
np.log(data.merit + 1)

0          0.000000
1          1.098612
2          7.036148
3          5.525453
4          6.228511
5          2.397895
6          4.828314
7          2.397895
8          0.000000
9          3.401197
10         7.055313
11         6.970730
12         6.980076
13         4.615121
14         6.908755
15         0.000000
16         4.615121
17         6.917706
18         0.000000
19         2.397895
20         4.615121
21         0.000000
22         5.693732
23         0.000000
24         5.594711
25         0.000000
26         4.615121
27         2.484907
28         6.980076
29         6.216606
             ...   
1030855    6.927558
1030856    6.216606
1030857    6.216606
1030858    6.216606
1030859    6.216606
1030860    6.911747
1030861    5.525453
1030862    6.908755
1030863    0.000000
1030864    6.927558
1030865    6.216606
1030866    6.909753
1030867    6.280396
1030868    6.941190
1030869    6.216606
1030870    6.216606
1030871    6.216606
1030872    6.216606
1030873    5.525453


In [237]:
stopword_remover_regex(data.post[0], stop_words)

'https???cointelegraph?com?news?kaspersky lab north korea hacks cryptocurrency exchange  first macos malware        Do guys think unnamed cryptocurrency exchange announced public?'

In [229]:
len(string.punctuation)

32

In [230]:
table = str.maketrans('!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~-', '?' * 31 + ' ')
data.post[0].translate(data.post[0].maketrans(table))


'https???cointelegraph?com?news?kaspersky lab north korea hacks cryptocurrency exchange with first macos malware\r\n\r\n\r\n  Do you guys think this unnamed cryptocurrency exchange should be announced to the public?'

In [194]:


data.post[4].replace(r'.', '')

'everyone waiting etf approval think delay another time sec wants price dump  know consider price manipulation manage expectations delayed  bitcoin need etf continue exist without '

In [165]:
import string

data.post[1000].translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))

data

'increasing bank account      work 40 hour week increase portfolio    watching news crypto    dont smoke anymore money transfered portfolio   reading 5 10year predictions gives confidence hodl    always sandwiches   dont spend money expensive streetfood stand something   drink water   sold gold necklace btc   dont spend much irl deposit europian rupies btc portfolio '

In [205]:
test = data.head()

In [220]:
test['post'] = test['post'].str.translate(table)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [222]:
test['post'] = test['post'].str.replace("?",'')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [226]:
test.post[0]

'https://cointelegraphcom/news/kasperskylabnorthkoreahackscryptocurrencyexchangefirstmacosmalware        think unnamed cryptocurrency exchange announced public'

In [207]:
test

Unnamed: 0,datetime,id,status,activity,merit,post
0,2018-08-24 07:33:34,Kurisuu9,Jr. Member,38.0,0.0,https://cointelegraph.com/news/kaspersky-lab-n...
1,2018-08-22 21:33:03,sejem,Jr. Member,238.0,2.0,every weeks summer bitcoin bulls gotten ray ho...
2,2018-08-22 22:14:54,DooMAD,Legendary,1736.0,1136.0,people really waiting etf everyone think peop...
3,2018-08-22 22:32:31,Anarchist,Sr. Member,399.0,250.0,people care etf expect anything . newbies wann...
4,2018-08-22 22:43:13,erickkyut,Hero Member,854.0,506.0,everyone waiting etf approval think delay anot...
