### Sentiment Analysis Using Natural Language Processing

In [75]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk 

In [76]:
tweets = pd.read_csv("demonetization-tweets_data.csv",encoding = "ISO-8859-1")

In [77]:
tweets.shape

(7470, 12)

In [78]:
tweets.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
2,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,11/23/2016 18:39,False,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False
3,RT @gauravcsawant: Rs 40 lakh looted from a ba...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",bhodia1,637,True,False
4,RT @sumitbhati2002: Many opposition leaders ar...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",sumitbhati2002,1,True,False


In [79]:
tweets.iloc[tweets['retweetCount'].idxmax()]['text']

'RT @RNTata2000: The government\x92s bold implementation of the demonetization programme needs the nation\x92s support. https://t.co/tx1ZILSor8'

### Cleaning the data

In [80]:
import string,re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [81]:
def clean_text(tweets):
    tweets = word_tokenize(tweets)
    tweets = tweets[4:]
    tweets= " ".join(tweets) 
    tweets= re.sub('https','',tweets) 
    tweets = [char for char in tweets if char not in string.punctuation] 
    tweets = ''.join(tweets) 
    tweets = [word for word in tweets.split() if word.lower() not in stopwords.words('english')] 
    return " ".join(tweets)

In [82]:
tweets['cleaned_text']=tweets['text'].apply(clean_text)

In [83]:
tweets['cleaned_text'].head()

0    Critical question PayTM informed Demonetizatio...
1    Former FinSec RBI Dy Governor CBDT Chair Harva...
2    Reddy Wedding mailtoday cartoon demonetization...
3    Rs 40 lakh looted bank Kishtwar J amp K Third ...
4    Many opposition leaders narendramodi Demonetiz...
Name: cleaned_text, dtype: object

### Processing the data

In [89]:
features = tweets['cleaned_text']
processed_features = []
for sentence in range(0, len(features)):
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))# Removing all the special characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)# Removing single characters appearing in the text except the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature)# Removing single characters appearing at the start 
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)# Substituting multiple spaces with a single space
    processed_feature = re.sub(r'^b\s+', '', processed_feature)# Removing prefix 'b'
    processed_feature = processed_feature.lower()# Converting to lowercase
    
    processed_features.append(processed_feature)

In [91]:
processed_features[:5]

['critical question paytm informed demonetization edict pm clearly fishy requires full disclosure amp',
 'former finsec rbi dy governor cbdt chair harvard professor lambaste demonetization aam aadmi listen th',
 'reddy wedding mailtoday cartoon demonetization reddywedding tcou7glnrq31f',
 'rs 40 lakh looted bank kishtwar amp third incident since demonetization terrorists',
 'many opposition leaders narendramodi demonetization respect decision support oppositio']

In [92]:
tweets['processed_text'] = processed_features

In [93]:
tweets.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,statusSource,screenName,retweetCount,isRetweet,retweeted,cleaned_text,processed_text
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False,Critical question PayTM informed Demonetizatio...,critical question paytm informed demonetizatio...
1,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False,Former FinSec RBI Dy Governor CBDT Chair Harva...,former finsec rbi dy governor cbdt chair harva...
2,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,11/23/2016 18:39,False,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False,Reddy Wedding mailtoday cartoon demonetization...,reddy wedding mailtoday cartoon demonetization...
3,RT @gauravcsawant: Rs 40 lakh looted from a ba...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",bhodia1,637,True,False,Rs 40 lakh looted bank Kishtwar J amp K Third ...,rs 40 lakh looted bank kishtwar amp third inci...
4,RT @sumitbhati2002: Many opposition leaders ar...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",sumitbhati2002,1,True,False,Many opposition leaders narendramodi Demonetiz...,many opposition leaders narendramodi demonetiz...


### Running Sentiment analysis

In [96]:
from textblob import TextBlob

In [97]:
def generate_polarity(text):
    sentiment = TextBlob(text).sentiment
    return sentiment

In [98]:
sentiment = tweets['processed_text'].apply(generate_polarity)
sentiment = sentiment.to_frame()
sentiment.head()

Unnamed: 0,processed_text
0,"(0.15, 0.5777777777777778)"
1,"(0.0, 0.0)"
2,"(0.0, 0.0)"
3,"(0.0, 0.0)"
4,"(0.5, 0.5)"


In [99]:
sentiment['polarity'] = sentiment['processed_text'].apply(lambda x:x[0])
sentiment ['subjectivity'] = sentiment['processed_text'].apply(lambda x:x[1])

In [100]:
tweets['polarity'] = sentiment['polarity']
tweets['subjectivity'] = sentiment['subjectivity']

In [102]:
tweets['polarity_encoded'] = ['positive' if x > 0 else 'negative' if x < 0 else 'neutral' for x in tweets['polarity']]

In [104]:
tweets['polarity_encoded'].value_counts()

polarity_encoded
neutral     3720
positive    2648
negative    1102
Name: count, dtype: int64

In [105]:
print("The most positive tweet:",tweets.iloc[tweets['polarity'].idxmax()]['processed_text'])
print("The most negative tweet:",tweets.iloc[tweets['polarity'].idxmin()]['processed_text']) 

The most positive tweet: one greatest computer scientists dr vijay bhatkar views demonetization decision hon pm narendramodi h
The most negative tweet: pathetic journalism media thought get stds atms another attempt malign demonetization tco


In [106]:
print("The most subjective tweet:",tweets.iloc[tweets['subjectivity'].idxmax()]['processed_text'])
print("The most objective tweet:",tweets.iloc[tweets['subjectivity'].idxmin()]['processed_text']) 

The most subjective tweet: demonetization harbhajansingh gives hilarious shagun suggestion struggling wedding season
The most objective tweet: former finsec rbi dy governor cbdt chair harvard professor lambaste demonetization aam aadmi listen th


### Applying Vectorization

In [107]:
tweets.columns

Index(['text', 'favorited', 'favoriteCount', 'replyToSN', 'created',
       'truncated', 'replyToSID', 'statusSource', 'screenName', 'retweetCount',
       'isRetweet', 'retweeted', 'cleaned_text', 'processed_text', 'polarity',
       'subjectivity', 'polarity_encoded'],
      dtype='object')

In [108]:
df = tweets[['processed_text', 'polarity_encoded']]

In [109]:
df.head()

Unnamed: 0,processed_text,polarity_encoded
0,critical question paytm informed demonetizatio...,positive
1,former finsec rbi dy governor cbdt chair harva...,neutral
2,reddy wedding mailtoday cartoon demonetization...,neutral
3,rs 40 lakh looted bank kishtwar amp third inci...,neutral
4,many opposition leaders narendramodi demonetiz...,positive


In [110]:
df.shape

(7470, 2)

In [111]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [112]:
def tokenize(text): 
    tk = TweetTokenizer()
    return tk.tokenize(text)

vectorizer = CountVectorizer(analyzer = 'word',tokenizer = tokenize,lowercase = True,ngram_range=(1, 1))

In [113]:
count= vectorizer.fit_transform(df['processed_text'])



In [114]:
count.shape

(7470, 8912)

### Create a classification model on our data

In [115]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [116]:
X = df['processed_text'].values
y = df['polarity_encoded'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=100, test_size=0.3)

In [117]:
vectorizer = TfidfVectorizer(max_features=1000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

In [119]:
df_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names_out(),columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'],ascending = False).head()

Unnamed: 0,idf_weights
ysrcp,7.770407
u092c,7.482725
oppn,7.364942
lakhs,7.364942
lets,7.364942


In [120]:
mnb = MultinomialNB()
mnb.fit(X_train_idf, y_train)

In [122]:
pred_mnb = mnb.predict(X_test_idf)
acc = accuracy_score(y_test, pred_mnb)


results = pd.DataFrame([['Multinomial Naive Bayes', acc]],
               columns = ['Model', 'Accuracy'])

print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes    0.8639


In [130]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train_idf, y_train)
y_pred_rf = clf_rf.predict(X_test_idf)
acc = accuracy_score(y_test, y_pred_rf)
model_results = pd.DataFrame([['Random Forest(Gini)', acc]],
               columns = ['Model', 'Accuracy'])
dataframes_to_concat = [results, model_results]
results = pd.concat(dataframes_to_concat, ignore_index=True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.863900
1      Random Forest(Gini)  0.926818
2      Random Forest(Gini)  0.923695


In [131]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(criterion='entropy')
clf_rf.fit(X_train_idf, y_train)
y_pred_rf = clf_rf.predict(X_test_idf)
acc = accuracy_score(y_test, y_pred_rf)
model_results = pd.DataFrame([['Random Forest(Entropy)', acc]],
               columns = ['Model', 'Accuracy'])
dataframes_to_concat = [results, model_results]
results = pd.concat(dataframes_to_concat, ignore_index=True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.863900
1      Random Forest(Gini)  0.926818
2      Random Forest(Gini)  0.923695
3   Random Forest(Entropy)  0.924587


In [132]:
confusion_matrix(y_test,y_pred_rf) 

array([[ 270,   74,   10],
       [   5, 1053,    1],
       [   7,   72,  749]], dtype=int64)