In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import json
import re
import nltk

In [3]:
from nltk.corpus import stopwords


In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
class NERModel:
    def __init__(self,vocabulary_file):
        self.vocabulary=self.__load_vocabulary(vocabulary_file)
        self.emoji_patterns = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                "]+", re.UNICODE)
        self.url_pattern=re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
        self.stops=set(stopwords.words("english"))
    def __load_vocabulary(self,vocabulary_file):
        file=open(vocabulary_file)
        return json.load(file)
    def remove_regex_pattern(self,regex_pattern,text,replacement=""):
        text=re.sub(regex_pattern,replacement,text)
        return text
    def preprocess_text(self,text):
        text=text.strip()
        text=text.lower()
        text=text.replace("\n"," ")
        text=self.remove_regex_pattern(self.emoji_patterns,text) # remove emojis
        text=self.remove_regex_pattern(self.url_pattern,text) # remove urls
        text=self.remove_regex_pattern(re.compile("[^a-z0-9%\s']"),text) # remove anything not an alphabet, a number, a space, a % or an apostrophe
        text=self.remove_regex_pattern(re.compile(r"\s\s+"),text,replacement=" ")
        text=[word for word in text.split(" ") if word not in self.stops] # stopword removal
        text=" ".join(text)
        return text
    
    def predict(self,text):
        text=self.preprocess_text(text)
        doc=nlp(text)
        answers=""
        for token in doc:
            if token.pos_=="NOUN" or token.pos_=="PROPN":
                if token.text in self.vocabulary:
                    answers+=token.text+" "
        if answers=="":
            return np.nan
        return answers

In [6]:
ner=NERModel("../input/crypto_vocabulary.json")

In [7]:
df=pd.read_csv("../input/cleaned_text.csv")
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

id           0
cleantext    0
dtype: int64

In [9]:
df["keywords"]=df["cleantext"].apply(lambda x: ner.predict(x))

In [10]:
df[df["keywords"]!=None]

Unnamed: 0,id,cleantext,keywords
0,321712,hey using bot also filtering recommendations s...,
1,321713,good stuff surprised took long find community lol,
2,321717,using non official one,
3,321718,use one uniswap uses,
4,321719,keep mind hot subgraph change anytime without ...,
...,...,...,...
43481,374466,find many places also santiment historical bal...,time
43482,374467,guys anyone know application tools able check ...,application transactions
43483,374468,lobsters going kyiv web3 hackathon september 6...,
43484,374469,whats funny one complains txs rejected censore...,one


In [15]:
df[pd.isna(df["keywords"])==False]["keywords"].unique()

array(['money money ', 'identity ', 'governance ', ...,
       'year multi data ', 'application transactions ', 'chain self '],
      dtype=object)

In [16]:
df.to_csv("../output/extracted_words.csv",index=False)