In [1]:
import nltk
import pandas as pd
pd.set_option('display.max_colwidth',100)
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
#pip install afinn
from afinn import Afinn
# from textblob import TextBlob
# nltk.download('vader_lexicon')
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# nltk.download('sentiwordnet')

In [2]:
#LOAD DATA
import os

# The dataset folder can be overridden with the SENTIMENT_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original location is used.
DATA_DIR = os.environ.get("SENTIMENT_DATA_DIR", r"C:\Users\vish\Documents\Data\Dataset")
# The working directory is still switched to the data folder, as before, so
# relative paths used after this cell keep resolving the same way.
os.chdir(DATA_DIR)
text=pd.read_csv('train_sentiment.csv')

In [3]:
# Work on the first 1000 rows only. head() returns a slice of the loaded frame,
# so take an explicit copy: later cells add columns to `text`, and assigning
# into a slice can raise pandas' SettingWithCopyWarning.
text=text.head(1000).copy()

In [4]:
# Preview the first rows whose label is 0.
text.loc[text['label'] == 0].head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. ...
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. ...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦
4,5,0,factsguide: society now #motivation


In [5]:
# Preview the first rows whose label is 1.
text.loc[text['label'] == 1].head()

Unnamed: 0,id,label,tweet
13,14,1,@user #cnn calls #michigan middle school 'build the wall' chant '' #tcot
14,15,1,no comment! in #australia #opkillingbay #seashepherd #helpcovedolphins #thecove #helpcovedol...
17,18,1,retweet if you agree!
23,24,1,@user @user lumpy says i am a . prove it lumpy.
34,35,1,it's unbelievable that in the 21st century we'd need something like this. again. #neverump #xen...


In [6]:
#Text Preprocessing

In [7]:
# Tokens dropped outright after splitting: single punctuation characters, the
# empty string, and the stray 'ð' / 'â' characters. Kept in a private constant
# so the standard library's string.punctuation is never modified.
_NOISE_TOKENS = frozenset(string.punctuation) | {'ð', '', 'â'}

# Lazily-filled cache so the stop-word list and the lemmatizer are built once
# instead of once per tweet.
_CLEAN_TEXT_RESOURCES = {}


def _clean_text_resources():
    """Return the (stop-word set, lemmatizer) pair used by clean_text, building it on first use."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """
    Normalise one tweet into a space-separated string of lemmatized words.

    Steps, in order:
      1. drop space-separated words that start with '#' or '@', or end with 'â';
      2. split the remainder on runs of non-word characters and lower-case it;
      3. keep only purely alphabetic tokens that are neither noise tokens
         (punctuation, '', 'ð', 'â') nor English stop words;
      4. lemmatize each remaining token with WordNetLemmatizer.

    Parameters
    ----------
    text : str
        Raw tweet. Any non-string value (for example a NaN cell) yields ''.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing remains.
    """
    # A missing value in the column arrives as a float NaN, which has no
    # .split(); treat every non-string input as an empty tweet.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _clean_text_resources()

    kept_words = [
        word for word in text.split(' ')
        if not word.startswith(('#', '@')) and not word.endswith('â')
    ]
    # Raw string: '\W' inside a normal string literal is an invalid escape.
    lowered = (token.lower() for token in re.split(r'\W+', ' '.join(kept_words)))
    tokens = [
        token for token in lowered
        if token not in _NOISE_TOKENS
        and token.isalpha()
        and token not in stop_words
    ]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Clean every tweet; the function is passed directly, no wrapper lambda needed.
text['cleaned_tweet'] = text['tweet'].apply(clean_text)

In [8]:
# Show the first 10 rows to compare each raw tweet with its cleaned_tweet value.
text.head(10)

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction. ...,father dysfunctional selfish drag kid dysfunction
1,2,0,@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx. ...,thanks credit use cause offer wheelchair van pdx
2,3,0,bihday your majesty,bihday majesty
3,4,0,#model i love u take with u all the time in urð±!!! ððððð¦ð¦ð¦,love u take u time urð
4,5,0,factsguide: society now #motivation,factsguide society
5,6,0,[2/2] huge fan fare and big talking before they leave. chaos and pay disputes when they get ther...,huge fan fare big talking leave chaos pay dispute get
6,7,0,@user camping tomorrow @user @user @user @user @user @user @user dannyâ¦,camping tomorrow dannyâ
7,8,0,the next school year is the year for exams.ð¯ can't think about that ð­ #school #exams #ha...,next school year year exam think
8,9,0,we won!!! love the land!!! #allin #cavs #champions #cleveland #clevelandcavaliers â¦,love land
9,10,0,@user @user welcome here ! i'm it's so #gr8 !,welcome


In [9]:
#AFINN Lexicon

In [10]:
#Model building
# Build the AFINN scorer once; constructing Afinn() inside the loop re-created
# it for every single tweet.
afinn = Afinn()
afinn_raw_scores = text['cleaned_tweet'].apply(afinn.score)
# A strictly negative AFINN score is mapped to 1, zero or positive to 0.
text['afinn_score'] = (afinn_raw_scores < 0).astype('int64')

#Evaluating model
metrics.accuracy_score(text['label'],text['afinn_score'])

0.806

In [17]:
#TextBlob Lexicon

In [18]:
#Model building
# text['textblob_score'] = [round(TextBlob(article).sentiment.polarity, 3) for article in text['cleaned_tweet']]
# text['textblob_score'] = [1 if x <0 else 0 for x in text['textblob_score']]

#Evaluating model
# metrics.accuracy_score(text['label'],text['textblob_score'])

In [19]:
#VADER Lexicon
#SentimentIntensityAnalyzer().polarity_scores() takes in a string and returns a dictionary of four scores:
#negative, neutral, positive and compound (the summed word valence scores, normalized to lie between -1 and +1)

In [20]:
#Model building
# text['scores'] = text['cleaned_tweet'].apply(lambda article: SentimentIntensityAnalyzer().polarity_scores(article))
# text['compound_score']  = text['scores'].apply(lambda score_dict: score_dict['compound'])
# text['vader_score'] = text['compound_score'].apply(lambda score: 0 if score >=0 else 1) 

#Evaluating model
# metrics.accuracy_score(text['label'],text['vader_score'])

In [21]:
#SentiWordNet Lexicon

In [23]:
#Model building
def penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    Only the first character of the tag is inspected: 'J' -> wn.ADJ,
    'N' -> wn.NOUN, 'R' -> wn.ADV, 'V' -> wn.VERB. Any other tag
    (including an empty one) gives None.
    """
    first_letter_to_pos = {
        'J': wn.ADJ,
        'N': wn.NOUN,
        'R': wn.ADV,
        'V': wn.VERB,
    }
    return first_letter_to_pos.get(tag[:1])


def swn_polarity(text):
    """
    Classify a text as 0 or 1 from its summed SentiWordNet polarity.

    The text is tokenized and POS-tagged. For each token whose tag maps to a
    WordNet noun, verb, adjective or adverb, the first synset returned by
    WordNet is looked up in SentiWordNet and its (positive - negative) score
    is added to a running total. Tokens with no usable tag or no synset are
    ignored.

    Returns 0 when the total is zero or positive, 1 when it is negative.
    """
    scored_pos = (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)
    total = 0.0
    for token, treebank_tag in pos_tag(word_tokenize(text)):
        pos = penn_to_wn(treebank_tag)
        if pos in scored_pos:
            candidates = wn.synsets(token, pos=pos)
            if candidates:
                # Only the first synset in WordNet's returned list is scored.
                senti = swn.senti_synset(candidates[0].name())
                total += senti.pos_score() - senti.neg_score()
    return 0 if total >= 0 else 1
    
    
# Score every cleaned tweet with the SentiWordNet classifier defined above.
text['sentiwordnet_score'] = text['cleaned_tweet'].apply(swn_polarity)

#Evaluating model
metrics.accuracy_score(y_true=text['label'], y_pred=text['sentiwordnet_score'])

0.768