In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import unicodedata
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ansam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ansam\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

Documentation:

* nltk: https://www.nltk.org/
* TextBlob: https://textblob.readthedocs.io/en/dev/
* wordcloud: https://amueller.github.io/word_cloud/

In [27]:
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS

In [28]:
# Load the daily table; per the preview below it carries a date column
# ("Index"), the adjusted MSFT close ("MSFT.Adjusted") and the scraped
# "worldnews" text for that day.
data = pd.read_csv(filepath_or_buffer='msft_dataworldnews.csv')
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Index,MSFT.Adjusted,worldnews
0,0,2022-01-03,328.727661,1 Feb 2016 — Find the best posts and communiti...
1,1,2022-01-04,323.090942,"In Portugal, with 89% of the total population ..."
2,2,2022-01-05,310.688141,18 Mar 2021 — Find the best posts and communit...
3,3,2022-01-06,308.233124,And yet pro israelis and islamophobes are allo...
4,4,2022-01-07,308.390289,I've taken several rapid tests (4 total at var...


## Text Cleaning

In [29]:
def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        The text content with ``<iframe>`` and ``<script>`` elements removed
        and every run of carriage returns / line feeds collapsed into a
        single ``"\\n"``.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach embedded frames and scripts so their contents do not end up in
    # the extracted text.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Only CR and LF belong in this character class: inside [...] a "|" is a
    # literal pipe, not alternation, so listing it would turn every "|" in
    # the text into a newline.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

def remove_accented_chars(text):
    """Reduce ``text`` to plain ASCII.

    The string is NFKD-normalised so accented letters split into a base
    letter plus combining marks, then encoded to ASCII with errors ignored.
    Any character that still has no ASCII form after normalisation (not only
    accents) is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def clean_text(text):
    """Strip social-media artefacts from ``text``.

    Removes, in order: ``@mentions``, ``#`` symbols (the tag word is kept),
    the retweet marker ``RT`` with its trailing whitespace, http/https
    links, and any remaining colons.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text. Surrounding whitespace is left untouched, so the
        removed pieces may leave double or leading spaces behind.
    """
    # The digit range must use an ASCII hyphen; an en dash makes the class
    # match only "0", the dash and "9". The underscore is included so
    # handles such as "@some_user" are removed whole.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#", "", text)  # Drop the '#' symbol, keep the tag word
    # \b stops the pattern from eating the tail of ordinary words that
    # happen to end in "RT" (e.g. "ALERT now").
    text = re.sub(r"\bRT\s+", "", text)
    # Links are removed before colons so the "https:" prefix is still intact
    # when this pattern runs.
    text = re.sub(r"https?:\/\/\S+", "", text)
    text = re.sub(r":", "", text)  # Remove any remaining ':'
    return text

def remove_emojis(text):
    """Delete emoji and a broad band of symbol code points from ``text``.

    Every maximal run of characters matched by the class below is replaced
    with the empty string. The class is deliberately wide: the range
    U+24C2-U+1F251 also covers non-emoji scripts (CJK ideographs, for
    example), so such characters are removed as well.
    """
    symbol_class = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # same range listed twice; redundant but harmless
        u"\U000024C2-\U0001F251"  # very wide span, includes CJK and more
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every supplementary-plane code point
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"  # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector-16
        u"\u3030"
        "]+"
    )
    matcher = re.compile(symbol_class, flags=re.UNICODE)
    return matcher.sub(r'', text)

In [30]:
# Stopword removal
# TODO: confirm this step is needed; removing stopwords may have a negative impact on the sentiment analysis.

def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Drop stopword tokens from ``text``.

    Parameters
    ----------
    text : str
        Text to filter. It is split with ``nltk.word_tokenize``.
    is_lower_case : bool, default False
        When True, tokens are compared to the stopword list as they are.
        When False, each token is lower-cased for the comparison only; the
        kept tokens retain their original casing.
    stopwords : iterable of str, optional
        Words to remove. When omitted (or empty), NLTK's English stopword
        list is used.

    Returns
    -------
    str
        The surviving tokens joined with single spaces, so the original
        spacing is not preserved.
    """
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    # Build the lookup once: testing membership in a set is constant-time,
    # whereas scanning the list for every token is linear in its length.
    stopword_set = set(stopwords)

    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)

In [31]:
# Apply each cleaning step to the "worldnews" column, in this order; every
# step receives the output of the one before it.
cleaning_steps = (
    strip_html_tags,
    remove_accented_chars,
    clean_text,
    remove_emojis,
    remove_stopwords,
)
for step in cleaning_steps:
    data['worldnews'] = data['worldnews'].apply(step)

In [32]:
# Preview the first rows after cleaning.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Index,MSFT.Adjusted,worldnews
0,0,2022-01-03,328.727661,1 Feb 2016 Find the best posts and communitie...
1,1,2022-01-04,323.090942,"In Portugal, with 89% of the total population ..."
2,2,2022-01-05,310.688141,18 Mar 2021 Find the best posts and communiti...
3,3,2022-01-06,308.233124,And yet pro israelis and islamophobes are allo...
4,4,2022-01-07,308.390289,I've taken several rapid tests (4 total at var...


In [33]:
def Subjectivity(text):
    """Return the TextBlob subjectivity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def Polarity(text):
    """Return the TextBlob polarity score of ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

def Sentiment(score):
    """Map a numeric score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for exactly zero,
    and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'


In [34]:
# Score the cleaned text and store the results as new columns on `data`:
# subjectivity and polarity come from TextBlob, and the polarity is then
# bucketed into a Negative / Neutral / Positive label.
worldnews_text = data['worldnews']
data['Subjectivity'] = worldnews_text.apply(Subjectivity)
data['Polarity'] = worldnews_text.apply(Polarity)
data['Sentiment'] = data['Polarity'].apply(Sentiment)

data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Index,MSFT.Adjusted,worldnews,Subjectivity,Polarity,Sentiment
0,0,2022-01-03,328.727661,1 Feb 2016 Find the best posts and communitie...,0.416667,0.206944,Positive
1,1,2022-01-04,323.090942,"In Portugal, with 89% of the total population ...",0.510331,0.28719,Positive
2,2,2022-01-05,310.688141,18 Mar 2021 Find the best posts and communiti...,0.316667,0.058333,Positive
3,3,2022-01-06,308.233124,And yet pro israelis and islamophobes are allo...,0.448788,0.096136,Positive
4,4,2022-01-07,308.390289,I've taken several rapid tests (4 total at var...,0.321402,0.129545,Positive


In [35]:
# Count how many rows fall into each sentiment label.
data['Sentiment'].value_counts()

Sentiment
Positive    429
Negative     50
Neutral       2
Name: count, dtype: int64