# Import packages
## Installation:
```bash
pip install textblob
python -m textblob.download_corpora
```

In [1]:
import re

import nltk
import pandas as pd
from nltk.corpus import opinion_lexicon, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from textblob import TextBlob

# Fetch every NLTK corpus the notebook reads, so that a fresh environment can
# run all cells top to bottom:
#   - opinion_lexicon: positive / negative word lists used for scoring
#   - stopwords:       read via stopwords.words('english') during cleaning
#   - wordnet:         backing data for WordNetLemmatizer
# nltk.download reports packages that are already up to date instead of
# fetching them again, so re-running this cell is cheap.
nltk.download('opinion_lexicon')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/apple/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


# Import Nasdaq news data

In [2]:
# Load the news data from a CSV path relative to the working directory.
# The cells below read and overwrite its 'body' column.
dataset=pd.read_csv('IT.csv')

# Text preprocessing
## Removal of punctuation

In [3]:
def form_sentence(text):
    """Rebuild ``text`` as one space-separated string of its words.

    The text is tokenized with TextBlob and the resulting word tokens are
    joined with single spaces; the notebook uses this step to strip
    punctuation from the raw article body.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    str
        The word tokens of ``text`` joined by single spaces.
    """
    words = TextBlob(text).words
    return ' '.join(words)

# Overwrite 'body' in place with the re-joined word tokens of each row.
# NOTE: later cells replace 'body' with token lists, so this cell is only
# meant to run once, right after the data is loaded.
dataset['body']=dataset['body'].apply(form_sentence)

# Text preprocessing
## Removal of commonly used words (stopwords), the placeholder token `user`, and non-alphabetic tokens

In [4]:
def no_user_alpha(text, stop_words=None):
    """Tokenize ``text`` and keep only meaningful alphabetic tokens.

    Filtering steps applied to every whitespace-separated token:

    1. Drop tokens that are exactly ``'user'`` (case-sensitive).
    2. Keep only tokens made up entirely of non-digit word characters
       (letters and underscores); tokens containing digits or punctuation
       are discarded.
    3. Drop tokens whose lowercase form is a stopword.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, NLTK's English stopword
        list is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order and casing.
    """
    if stop_words is None:
        # Read the corpus once per call rather than once per token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership checks; reuse one if it was given.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    return [
        token
        for token in text.split()
        if token != 'user'
        and re.match(r'[^\W\d]*$', token)
        and token.lower() not in stop_set
    ]

# NOTE: from here on each 'body' entry is a list of tokens rather than a
# string, because no_user_alpha returns a list.
dataset['body']=dataset['body'].apply(no_user_alpha)

# Text preprocessing
## Normalization of words

In [5]:
def normalization(text_list):
    """Lemmatize every token in ``text_list`` as a verb.

    Parameters
    ----------
    text_list : iterable of str
        Tokens to normalize.

    Returns
    -------
    list of str
        One lemmatized token per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # 'v' is the part-of-speech argument: each token is looked up as a verb.
    return [lemmatizer.lemmatize(token, 'v') for token in text_list]
    
# Lemmatize each row's token list; 'body' remains a list of strings per row.
dataset['body']=dataset['body'].apply(normalization)

# Generate a list of positive words and a list of negative words

In [6]:
# Opinion-lexicon vocabularies used for scoring. Despite the *_list names,
# both are sets, so the per-token membership checks are fast.
pos_list = {*opinion_lexicon.positive()}
neg_list = {*opinion_lexicon.negative()}

# Rule-based Approach 
## Define a function to calculate the sentiment and apply it to every body text

In [7]:
def sentiment(sentence):
    """Score a tokenized document by counting opinion words.

    Each token is lowercased; a token found in ``pos_list`` adds 1, a token
    found only in ``neg_list`` subtracts 1, and any other token contributes 0.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of one document.

    Returns
    -------
    int
        Positive-word count minus negative-word count (0 for empty input).
    """
    lowered = (token.lower() for token in sentence)
    # The positive list is checked first, so a word present in both lists
    # counts as positive.
    return sum(
        1 if token in pos_list else -1 if token in neg_list else 0
        for token in lowered
    )

# Apply `sentiment` to every document's token list and store the resulting
# integer score in the 'sentiment' column.
dataset['sentiment']=dataset['body'].apply(sentiment)

# Sentiment scores

In [8]:
# Print the per-document sentiment scores (pandas abbreviates the middle
# rows of a long Series with '..').
print(dataset['sentiment'])

0        4
1       -5
2       21
3       -6
4        4
5       37
6        0
7       15
8       98
9       -4
10      -6
11       4
12      -5
13       2
14       8
15       3
16      14
17      16
18       8
19       2
20       6
21       4
22     -10
23       0
24       1
25       2
26       1
27       0
28      -6
29       3
        ..
1370     4
1371   -11
1372     0
1373    18
1374    13
1375     3
1376     4
1377     4
1378     6
1379     6
1380    -1
1381    24
1382    25
1383     1
1384     3
1385     1
1386    -4
1387   -10
1388     7
1389     6
1390     2
1391     5
1392    19
1393     9
1394    -1
1395     8
1396    68
1397    78
1398    19
1399    13
Name: sentiment, Length: 1400, dtype: int64
