In [76]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import sys
import unicodedata
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import wordnet


In [88]:
#1. Importing the DailyComments file into a data frame
df = pd.read_csv('./datafiles/DailyComments.csv')
df

Unnamed: 0,Day of Week,comments
0,Monday,"Hello, how are you?"
1,Tuesday,Today is a good day!
2,Wednesday,It's my birthday so it's a really special day!
3,Thursday,Today is neither a good day or a bad day!
4,Friday,I'm having a bad day.
5,Saturday,There' s nothing special happening today.
6,Sunday,Today is a SUPER good day!


In [89]:
# Identify a scheme to categorize each comment as positive or negative

#uses count vectorizer transformer (page 58)
corpus = df['comments']
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Look at vectorized words, I'm using this to determine my positive and negative words (pg 113)
print(vectorizer.get_feature_names())

['are', 'bad', 'birthday', 'day', 'good', 'happening', 'having', 'hello', 'how', 'is', 'it', 'my', 'neither', 'nothing', 'or', 'really', 'so', 'special', 'super', 'there', 'today', 'you']


In [90]:
#Removing all punctuation from the text

punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

df['text'] = [string.translate(punctuation) for string in df['comments']]
df['text']

0                              Hello how are you
1                            Today is a good day
2    Its my birthday so its a really special day
3       Today is neither a good day or a bad day
4                            Im having a bad day
5        There s nothing special happening today
6                      Today is a SUPER good day
Name: text, dtype: object

In [91]:
#Tokenizing the words and appending each list of tokenized words into a list to be added to the dataframe
tokenized_list = []
for string in df['text']:
    tokenized = word_tokenize(string)
    tokenized_list.append(tokenized)

df['text'] = tokenized_list
df['text']

0                               [Hello, how, are, you]
1                            [Today, is, a, good, day]
2    [Its, my, birthday, so, its, a, really, specia...
3    [Today, is, neither, a, good, day, or, a, bad,...
4                            [Im, having, a, bad, day]
5       [There, s, nothing, special, happening, today]
6                     [Today, is, a, SUPER, good, day]
Name: text, dtype: object

In [92]:
#Applying POS tagging to each comment
comments_tagged = [pos_tag(comment) for comment in df['text']]
comments_tagged

[[('Hello', 'NNP'), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP')],
 [('Today', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('good', 'JJ'), ('day', 'NN')],
 [('Its', 'PRP$'),
  ('my', 'PRP$'),
  ('birthday', 'NN'),
  ('so', 'IN'),
  ('its', 'PRP$'),
  ('a', 'DT'),
  ('really', 'RB'),
  ('special', 'JJ'),
  ('day', 'NN')],
 [('Today', 'NN'),
  ('is', 'VBZ'),
  ('neither', 'CC'),
  ('a', 'DT'),
  ('good', 'JJ'),
  ('day', 'NN'),
  ('or', 'CC'),
  ('a', 'DT'),
  ('bad', 'JJ'),
  ('day', 'NN')],
 [('Im', 'NNP'), ('having', 'VBG'), ('a', 'DT'), ('bad', 'JJ'), ('day', 'NN')],
 [('There', 'EX'),
  ('s', 'VBZ'),
  ('nothing', 'NN'),
  ('special', 'JJ'),
  ('happening', 'VBG'),
  ('today', 'NN')],
 [('Today', 'NN'),
  ('is', 'VBZ'),
  ('a', 'DT'),
  ('SUPER', 'JJ'),
  ('good', 'JJ'),
  ('day', 'NN')]]

In [129]:
#Applying text lemmatization using a series of functions

#Converting tags from a POS tag vector into letters to run through the lemmatizer function
def pos_tagger(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

#Function that takes a list of POS tag vectors and switches the POS tags for 
#letters that can be input into the lemmatizer function
def POS_simplifier(pos_tag_list):
    lemmatizer = WordNetLemmatizer()
    wordnet_tagged = [list(map(lambda x: (x[0], pos_tagger(x[1])), comments_tagged[i])) for i in range(len(pos_tag_list))]            
    return wordnet_tagged

tagged_comments = POS_simplifier(comments_tagged)
tagged_comments

[[('Hello', 'n'), ('how', None), ('are', 'v'), ('you', None)],
 [('Today', 'n'), ('is', 'v'), ('a', None), ('good', 'a'), ('day', 'n')],
 [('Its', None),
  ('my', None),
  ('birthday', 'n'),
  ('so', None),
  ('its', None),
  ('a', None),
  ('really', 'r'),
  ('special', 'a'),
  ('day', 'n')],
 [('Today', 'n'),
  ('is', 'v'),
  ('neither', None),
  ('a', None),
  ('good', 'a'),
  ('day', 'n'),
  ('or', None),
  ('a', None),
  ('bad', 'a'),
  ('day', 'n')],
 [('Im', 'n'), ('having', 'v'), ('a', None), ('bad', 'a'), ('day', 'n')],
 [('There', None),
  ('s', 'v'),
  ('nothing', 'n'),
  ('special', 'a'),
  ('happening', 'v'),
  ('today', 'n')],
 [('Today', 'n'),
  ('is', 'v'),
  ('a', None),
  ('SUPER', 'a'),
  ('good', 'a'),
  ('day', 'n')]]

In [130]:
# Reference: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/
lemmatizer = WordNetLemmatizer()
lemmatized_sentences = []
for sublist in tagged_comments:
    lemmatized_sentences_sub = []
    for word, tag in sublist:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentences_sub.append(word)
        else:        
            # else use the tag to lemmatize the token
            lemmatized_sentences_sub.append(lemmatizer.lemmatize(word, tag))
    lemmatized_sentences.append(lemmatized_sentences_sub)
    
lemmatized_sentences

[['Hello', 'how', 'be', 'you'],
 ['Today', 'be', 'a', 'good', 'day'],
 ['Its', 'my', 'birthday', 'so', 'its', 'a', 'really', 'special', 'day'],
 ['Today', 'be', 'neither', 'a', 'good', 'day', 'or', 'a', 'bad', 'day'],
 ['Im', 'have', 'a', 'bad', 'day'],
 ['There', 's', 'nothing', 'special', 'happen', 'today'],
 ['Today', 'be', 'a', 'SUPER', 'good', 'day']]

In [95]:
def listToString(s): 
    str1 = " "   
    return str1.join(s)

new_list = []
for sublist in lemmatized_sentences:
    new_list.append(listToString(sublist))


new_list

['Hello how be you',
 'Today be a good day',
 'Its my birthday so its a really special day',
 'Today be neither a good day or a bad day',
 'Im have a bad day',
 'There s nothing special happen today',
 'Today be a SUPER good day']

In [128]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
for comment in new_list:
    print(analyzer.polarity_scores(comment),comment)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} Hello how be you
{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404} Today be a good day
{'neg': 0.0, 'neu': 0.728, 'pos': 0.272, 'compound': 0.4576} Its my birthday so its a really special day
{'neg': 0.425, 'neu': 0.575, 'pos': 0.0, 'compound': -0.7101} Today be neither a good day or a bad day
{'neg': 0.467, 'neu': 0.533, 'pos': 0.0, 'compound': -0.5423} Im have a bad day
{'neg': 0.311, 'neu': 0.689, 'pos': 0.0, 'compound': -0.3089} There s nothing special happen today
{'neg': 0.0, 'neu': 0.347, 'pos': 0.653, 'compound': 0.8192} Today be a SUPER good day


#If the compound value is > 0.05 then the sentense is rated as positive and compound value is less than 0.05 then it is rated as negitive sentence.
From the above output 

#sentence 1: compound is neutral (compound value is 0.0)

#sentence 2: compound value is positive  (compound: 0.4404)

#sentence 3: compound value is positive  (compound: 0.4576)

#sentence 4: compound value is negitive  (compound: -0.7101)

#sentence 5: compound value is negitive  (compound: -0.5423)

#sentence 6: compound value is negitive  (compound: -0.3089)

#sentence 7: compound value is positive  (compound: -0.8192)


In [104]:
df1 = pd.DataFrame({'lemmatized_sentences' : corpus})

# check for positive words and negative words
# Positive Words
df1['super'] = df1.lemmatized_sentences.str.count('SUPER')
df1['good'] = df1.lemmatized_sentences.str.count('good')
df1['special']= df1.lemmatized_sentences.str.count('special')
df1['fantastic']= df1.lemmatized_sentences.str.count('fantastic')


# Negative Words
df1['bad'] = df1.lemmatized_sentences.str.count('bad')

# Totalscores
df1['totalpositive'] = (df1.super + df1.good + df1.special + df1.fantastic)
df1['totalnegative'] = (df1.bad)

#Calculate total score by summing positive and negitive words
df1['ScoreValue'] = (df1.totalpositive) - (df1.totalnegative)
df1 = pd.DataFrame(df1)


Z = sum(df1['ScoreValue'])

print("Total Score is:",Z)
df1

Total Score is: 4


Unnamed: 0,lemmatized_sentences,super,good,special,fantastic,bad,totalpositive,totalnegative,ScoreValue
0,"Hello, how are you?",0,0,0,0,0,0,0,0
1,Today is a good day!,0,1,0,0,0,1,0,1
2,It's my birthday so it's a really special day!,0,0,1,0,0,1,0,1
3,Today is neither a good day or a bad day!,0,1,0,0,1,1,1,0
4,I'm having a bad day.,0,0,0,0,1,0,1,-1
5,There' s nothing special happening today.,0,0,1,0,0,1,0,1
6,Today is a SUPER good day!,1,1,0,0,0,2,0,2


## For up to 5% extra credit

In [118]:
#2. Importing the Dailytweets file into a data frame
df_tweet = pd.read_csv('tweets.csv', encoding = "cp1252")

tweets_c = df_tweet.Tweet.copy()
tweets_c.head

<bound method NDFrame.head of 0      @switchfoot http://twitpic.com/2y1zl - Awww, t...
1      is upset that he can't update his Facebook by ...
2      @Kenichan I dived many times for the ball. Man...
3        my whole body feels itchy and like its on fire 
4      @nationwideclass no, it's not behaving at all....
                             ...                        
264                    i have to take my sidekick back. 
265    @chriscantore congrats! I'm totally jealous! o...
266                          gr8t my face is very itchy 
267    poor socks  luvvvvv the golden retriever!! I w...
268    I just saw that they found that Tracy girl in ...
Name: Tweet, Length: 269, dtype: object>

In [131]:
# function to remove special charecters for list.
def removeWordWithSC(text, char_list):
    #Remove words in a text that contains a special charecters from the list.
    text = text.split()
    res = [ele for ele in text if all(ch not in ele for ch in char_list)]
    res = ' '.join(res)
    return res

char_list = ['@', '#', 'http', 'www', '/', '!']

removeWordWithSC(tweets_c[1], char_list)

"is upset that he can't update his Facebook by texting it... and might cry as a result School today also."

In [132]:
tweets_cleaned = []
for t in tweets_c:
    tweets_cleaned.append(removeWordWithSC(t, char_list))
    
tweets_cleaned[0]

"- that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"

In [122]:
def tokenize(texts):
    tokenizer = nltk.RegexpTokenizer(r'\w+')

    texts_tokens = []
    for i, val in enumerate(texts):
        text_tokens = tokenizer.tokenize(val.lower())

        for i in range(len(text_tokens) - 1, -1, -1):
            if len(text_tokens[i]) < 4:
                del(text_tokens[i])

        texts_tokens.append(text_tokens)
        
    return texts_tokens

In [123]:
tweets_tokens = tokenize(tweets_cleaned)

tweets_tokens[:1]

[['that', 'bummer', 'shoulda', 'david', 'carr', 'third']]

In [124]:
def removeSW(texts_tokens):
    stopWords = set(stopwords.words('english'))
    texts_filtered = []

    for i, val in enumerate(texts_tokens):
        text_filtered = []
        for w in val:
            if w not in stopWords:
                text_filtered.append(w)
        texts_filtered.append(text_filtered)
        
    return texts_filtered

In [125]:
tweets_filtered = removeSW(tweets_tokens)

tweets_filtered[:1]

[['bummer', 'shoulda', 'david', 'carr', 'third']]

In [127]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
for comment in tweets_filtered:
    print(analyzer.polarity_scores(comment),comment)

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['bummer', 'shoulda', 'david', 'carr', 'third']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['upset', 'update', 'facebook', 'texting', 'might', 'result', 'school', 'today', 'also']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['dived', 'many', 'times', 'ball', 'managed', 'save', 'rest', 'bounds']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['whole', 'body', 'feels', 'itchy', 'like', 'fire']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['behaving']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['whole', 'crew']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['need']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['long', 'time', 'rains', 'fine', 'thanks']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['nope']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['muera']
{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0} ['spring', 'break', 'plain', 'city', 'snow