In [111]:
import nltk
import pandas as pd
import numpy as np
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import csv
import textblob
nltk.download('vader_lexicon')
nltk.download('wordnet')
warnings.filterwarnings('ignore') 
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/apple/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 0. Lexicon Preparation

In [112]:
# import dictionaries
pos_dic=pd.read_csv('positive.csv')
neg_dic=pd.read_csv('negative.csv')
neu_dic=pd.read_csv('neutral.csv')

In [114]:
# merge into one dataframe
pos_dic['polarity']= 1
neg_dic['polarity']= -1
neu_dic['polarity']= 0
pos_dic.rename(columns={'Positive':'Words'},inplace=True)
neg_dic.rename(columns={'Negative':'Words'},inplace=True)
neu_dic.rename(columns={'Neutral':'Words'},inplace=True)

In [154]:
diclist=[pos_dic,neg_dic,neu_dic]
lexicon=pd.concat(diclist,keys=['Words','polarity'],ignore_index=True)

In [117]:
reg_map = {
         re.compile("rt [@0-9a-z_]{0,10}:"),
         re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
         re.compile("#[0-9a-z]+"),
         re.compile("@[0-9a-z]+"),
    }
    
stop = stopwords.words('english')
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
    
def lower_and_remove_with_reg(text: str) -> str:
        text = text.lower()
        for v in reg_map:
            text = v.sub("", text)
        return text
    
lexicon['cleaned']=lexicon['Words'].apply(lower_and_remove_with_reg).replace('[^a-zA-Z0-9 ]', '', regex=True)

In [134]:
order=['cleaned','polarity']
lexicon=lexicon[order]
lexicon

Unnamed: 0,cleaned,polarity
0,buyfxe,1
1,longfxe,1
2,buysignal,1
3,upsidebreakout,1
4,eurusdbullintact,1
...,...,...
146,currentlyshorteurusd,-1
147,decreaseeurusdshort,-1
148,currentlyshorteurusd,-1
149,holdeurusdshort,-1


### 1. Text Cleaning

In [121]:
def preprocess(df) :
    # 1. Convert text into lowercase
    # 2. Remove non-alphabetic characters
    reg_map = {
         re.compile("rt [@0-9a-z_]{0,10}:"),
         re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
         re.compile("#[0-9a-z]+"),
         re.compile("@[0-9a-z]+"),
    }
    
    stop = stopwords.words('english')
    stemmer = SnowballStemmer("english")
    lmtzr = WordNetLemmatizer()
    
    def lower_and_remove_with_reg(text: str) -> str:
        text = text.lower()
        for v in reg_map:
            text = v.sub("", text)
        return text
    
    df['cleaned']=df['tweet'].apply(lower_and_remove_with_reg).replace('[^a-zA-Z0-9 ]', '', regex=True)
                        
    return df

### 2. Sentiment Analysis

In [122]:
def scanner(text,lexicon):
    text = re.sub('[^a-zA-Z]', '', str(text))
    score=0
    for i in lexicon:
        count = len(re.findall(re.escape(i[0]), text))
        score += count * i[1]

    return score