In [8]:
import nltk
import pandas as pd
import numpy as np
import warnings
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import re
import csv
import textblob
nltk.download('vader_lexicon')
nltk.download('wordnet')
warnings.filterwarnings('ignore') 
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Lon\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lon\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### 0. Lexicon Preparation

In [9]:
# import dictionaries
pos_dic=pd.read_csv('positive.csv')
neg_dic=pd.read_csv('negative.csv')
neu_dic=pd.read_csv('neutral.csv')

In [10]:
# merge into one dataframe
pos_dic['polarity']= 1
neg_dic['polarity']= -1
neu_dic['polarity']= 0
pos_dic.rename(columns={'Positive':'Words'},inplace=True)
neg_dic.rename(columns={'Negative':'Words'},inplace=True)
neu_dic.rename(columns={'Neutral':'Words'},inplace=True)

In [11]:
diclist=[pos_dic,neg_dic,neu_dic]
lexicon=pd.concat(diclist,keys=['Words','polarity'],ignore_index=True)

In [12]:
reg_map = {
         re.compile("rt [@0-9a-z_]{0,10}:"),
         re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
         re.compile("#[0-9a-z]+"),
         re.compile("@[0-9a-z]+"),
    }
    
stop = stopwords.words('english')
stemmer = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()
    
def lower_and_remove_with_reg(text: str) -> str:
        text = text.lower()
        for v in reg_map:
            text = v.sub("", text)
        return text
    
lexicon['cleaned']=lexicon['Words'].apply(lower_and_remove_with_reg).replace('[^a-zA-Z0-9 ]', '', regex=True)

In [23]:
order=['cleaned','polarity']
lexicon=lexicon[order]
lexicon

Unnamed: 0,cleaned,polarity
0,buyfxe,1
1,longfxe,1
2,buysignal,1
3,upsidebreakout,1
4,eurusdbullintact,1
...,...,...
146,currentlyshorteurusd,-1
147,decreaseeurusdshort,-1
148,currentlyshorteurusd,-1
149,holdeurusdshort,-1


### 1. Text Cleaning

In [44]:
def preprocess(df) :
    # 1. Remove tweets from user zhanusic and Violahkzvo
    # 2. Convert text into lowercase
    # 3. Remove non-alphabetic characters

    df = df.drop((df[df['author username'] == 'zhanusic'].index) | (df[df['author username'] == 'Violahkzvo']).index).reset_index(drop=True)

    reg_map = {
         re.compile("rt [@0-9a-z_]{0,10}:"),
         re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"),
         re.compile("#[0-9a-z]+"),
         re.compile("@[0-9a-z]+"),
    }
    
    stop = stopwords.words('english')
    stemmer = SnowballStemmer("english")
    lmtzr = WordNetLemmatizer()
    
    def lower_and_remove_with_reg(text: str) -> str:
        text = text.lower()
        for v in reg_map:
            text = v.sub("", text)
        return text
    
    df['cleaned']=df['tweet'].apply(lower_and_remove_with_reg).replace('[^a-zA-Z0-9 ]', '', regex=True)
                        
    return df

### 2. Sentiment Analysis

In [15]:
def scanner(text,lexicon):
    text = re.sub('[^a-zA-Z]', '', str(text))
    score=0

    for index in range(lexicon.shape[1]):
        dict_word = lexicon.iloc[index]['cleaned']
        count = len(re.findall(re.escape(dict_word), text))
        polarity = lexicon.iloc[index]['polarity']
        score += count * polarity

    return score

In [47]:
# Test
tweets = pd.read_csv("data/data_2013-07-26_eur_usd.csv")
tweets_text = tweets['tweet']

In [48]:
# Preprocess the tweets
cleaned_tweets = preprocess(tweets)
cleaned_text = cleaned_tweets['tweet']

In [51]:
scores = cleaned_text.apply(lambda x: scanner(x, lexicon))

Getting the sentiment analysis score for each tweet of all collected data. Then save the score with the corresponding tweet, group by the month.

In [20]:
import os

In [56]:
# Get the filenames of the csv files
csv_files = [x for x in os.listdir('data') if x.endswith(".csv")]

In [62]:
# Run scanner on all cleaned data, file by file. Then save the score for each tweet in each file in the senti_analysis_scores folder
# The format for the scores file is e.g. scores_2011-09-26.csv
for name in csv_files:
  df = pd.read_csv('data/' + name)
  cleaned_tweets = preprocess(df)
  cleaned_text = cleaned_tweets['cleaned']

  # Get the score for each tweet
  scores = cleaned_text.apply(lambda x: scanner(x, lexicon))
  score_df = pd.DataFrame({'tweet': cleaned_text, 'polarity_score': scores})

  # Save file to senti_anaylsis _scores folder
  score_filename = 'scores_'+ name[5:15] + '.csv'
  score_df.to_csv('senti_analysis_scores/' + score_filename)