# NLTK

### In this part we train our machine learning algorithm to recognize positive and negative sentiments:

https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk

In [1]:
# import libraries

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
import random as random

In [2]:
# training data
data = pd.read_csv('Sentiment.csv')
data = data[['text', 'sentiment']]
data = data[data.sentiment != "Neutral"]
data.head()

Unnamed: 0,text,sentiment
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive
5,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",Positive
6,RT @warriorwoman91: I liked her and was happy ...,Negative


In [3]:
# imbalanced dataset
print('Negative Sentiment count:', len(data[data['sentiment']=='Negative']))
print('Positive Sentiment count:',len(data[data['sentiment']=='Positive']))

Negative Sentiment count: 8493
Positive Sentiment count: 2236


In [4]:
# undersample
positive_df = data[data.sentiment != "Negative"]
negative_df = data[data.sentiment == "Negative"]

negative_df = negative_df[:len(positive_df)]

print('Negative Sentiment count:', len(negative_df))
print('Positive Sentiment count:', len(positive_df))

# concat the balanced data
balanced_data = pd.concat([negative_df, positive_df])
balanced_data = balanced_data.sample(frac=1).reset_index(drop=True)

balanced_data.head()

Negative Sentiment count: 2236
Positive Sentiment count: 2236


Unnamed: 0,text,sentiment
0,"""@FoxNews web stream and mobile app fail durin...",Negative
1,Hahahahah this commentary is the best. #GOPDeb...,Positive
2,RT @Reince: Simply incredible. http://t.co/apX...,Positive
3,RT @TheRighToExist: Wow!! She is good!! Watch ...,Positive
4,RT @RWSurferGirl: Ask Trump a legitimate quest...,Positive


In [5]:
# tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# create a sparse matrix (BoW)
tf=TfidfVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_tf= tf.fit_transform(balanced_data['text'])

In [6]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(text_tf, balanced_data['sentiment'], test_size=0.1, random_state=123)

In [7]:
# classify

clf = MultinomialNB(alpha=0.1).fit(X_train, y_train)
predicted= clf.predict(X_test)

print("MultinomialNB Classifier Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Classifier Accuracy: 0.8080357142857143


In [8]:
# test for the first element
text_tf_2 = tf.fit_transform(balanced_data['text'])
clf.predict(text_tf_2)[0]

'Negative'

### Here, we input our own data, we clean it and we classify each tweet as positive or negative:

In [9]:
data = pd.read_csv('ocd.csv')

In [10]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

### data cleaning ###
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [11]:
import re
data['clean_tweet'] = data.text.apply(clean_tweet, True)

In [12]:
tweets = data['clean_tweet']
tweets.dropna(how='all')
tweets.head()

0    new research find #acupuncture reduc #depressi...
1    new research find #acupuncture reduc #depressi...
2    new research find #acupuncture reduc #depressi...
3    new research find #acupuncture reduc #depressi...
4    new research find #acupuncture reduc #depressi...
Name: clean_tweet, dtype: object

In [13]:
# transform the actual data
tweet_tf= tf.transform(tweets.values.astype('U'))

In [14]:
# classify
sentiment = []
for i in range(len(list(tweets))):
    s = clf.predict(tweet_tf)[i]
    sentiment.append(s)

In [15]:
# add sentiment to the dataset
data['sentiment'] = sentiment


### Sentiment Analysis done!

### Let's test a random tweet, namely 650:

In [16]:
data.text[650]

'Does anyone else write a to do list for the day ahead? 😂 helps me soooooo much , #adulting #ocd #beyou 🤔😂'

### The algorithm outputs 'negative', which seems right:

In [17]:
data.sentiment[650]

'Positive'

### The algorithm has classified 819 tweets as 'Positive' and 2,044 'Negative'.

That means roughly 28% of the tweets about depression on Twitter are positive, and 72% are of negative sentiment

In [18]:
len(data[data['sentiment']=='Positive'])

819

In [19]:
len(data[data['sentiment']=='Negative'])

2044