<a href="https://colab.research.google.com/github/akib26/TwitterSentimentAnalysis/blob/v4/sentiment_analysis_twitter_and_facebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # function for splitting data to train and test sets
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt

from nltk.metrics import accuracy
from nltk.metrics import ConfusionMatrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
data = pd.read_csv('Sentiment1.csv')
# Keeping only the neccessary columns
data = data[['text','sentiment']]
# Splitting the dataset into train and test set
X=data['text']
y=data['sentiment']

In [3]:
# Splitting the dataset into train and test set
X_train,X_test,y_train, y_test = train_test_split(data,y,test_size = 0.1,random_state=1,stratify=y)
#train,test=train_test_split(data,test_size=0.1)


In [4]:
X_train.shape,X_test.shape,y_train.shape, y_test.shape

((12483, 2), (1388, 2), (12483,), (1388,))

In [5]:
def wordcloud_draw(file,data,key, color = 'black'):
    words = ' '.join(data)
    cleaned_word = " ".join([word for word in words.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and not word.startswith('#')
                                and word != 'RT'
                            ])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color=color,
                      width=2500,
                      height=2000
                     ).generate(cleaned_word)
    plt.figure(1,figsize=(13, 13))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.title('Wordcloud of key "{}"'.format(key))
    plt.savefig('{}{}.png'.format(file,key))
    plt.close()

In [6]:
dict_of_Categorization = {k: v for k, v in data.groupby('sentiment')}

for key, value in dict_of_Categorization.items():
    num_Descriptions = len(value)
    if (num_Descriptions >= 50):
        num_Descriptions
        wordcloud_draw("Twitter_",value["text"], key, 'white')

In [7]:
tweets = []
stopwords_set = set(stopwords.words("english"))
for index,row in X_train.iterrows():
    words_filtered = [e.lower() for e in row.text.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
    tweets.append((words_without_stopwords, row.sentiment))

In [8]:
# Extracting word features
def get_words_in_tweets(tweets):
    all = []
    for (words, sentiment) in tweets:
        all.extend(words)
    return all
 
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features
w_features = get_word_features(get_words_in_tweets(tweets))
 
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [9]:
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [10]:
predicted=[]
for obj in X_test['text']:
  res=classifier.classify(extract_features(obj.split()))
  predicted.append(res)

In [11]:
y_test_list=y_test.tolist()

In [12]:

print(ConfusionMatrix(y_test_list, predicted).pretty_format(sort_by_count=True))

         |   P       N |
         |   o   N   e |
         |   s   e   g |
         |   i   u   a |
         |   t   t   t |
         |   i   r   i |
         |   v   a   v |
         |   e   l   e |
---------+-------------+
Positive |<442> 87  22 |
 Neutral |  80<357> 11 |
Negative | 111  65<213>|
---------+-------------+
(row = reference; col = test)



The numbers embedded in <> are the true positives (tp)meaning correctly classified.


*   rows refer to actual test set sentiment
*   columns refer to predicted sentiment by our naive bayes classifier





In [13]:
print("Accuracy:", accuracy(y_test_list, predicted))

Accuracy: 0.729106628242075


In [14]:

user_input=input("Enter your tweet")
res=classifier.classify(extract_features(user_input.split()))
print(res)


Enter your tweetthis is bad 
Negative


In [15]:
import pickle
mymodel = 'naive_finalized_model.sav'
pickle.dump(classifier, open(mymodel, 'wb'))

In [16]:
with open('tweets.pkl', 'wb') as f:
    pickle.dump(tweets, f)

# Facebook

In [17]:
fb = pd.read_csv('fb_sentiment.csv')
fb = fb[['text','sentiment']]
print(fb.head())

                                                text sentiment
0  Drug Runners and  a U.S. Senator have somethin...   Neutral
1  Heres a single, to add, to Kindle. Just read t...   Neutral
2  If you tire of Non-Fiction.. Check out http://...   Neutral
3    Ghost of Round Island is supposedly nonfiction.   Neutral
4  Why is Barnes and Nobles version of the Kindle...  Negative


In [18]:
fbpredicted=[]
for obj in fb['text']:
  res=classifier.classify(extract_features(obj.split()))
  fbpredicted.append(res)

In [19]:
fbactual=fb['sentiment'].tolist()

In [20]:

print(ConfusionMatrix(fbactual, fbpredicted).pretty_format(sort_by_count=True))

         |   P       N |
         |   o   N   e |
         |   s   e   g |
         |   i   u   a |
         |   t   t   t |
         |   i   r   i |
         |   v   a   v |
         |   e   l   e |
---------+-------------+
Positive |<465>124  52 |
 Neutral |  93<156> 31 |
Negative |  27  11 <41>|
---------+-------------+
(row = reference; col = test)



In [21]:
print("Accuracy:", accuracy(fbactual, fbpredicted))

Accuracy: 0.662


In [22]:
dict_of_Categorization = {k: v for k, v in fb.groupby('sentiment')}

for key, value in dict_of_Categorization.items():
    num_Descriptions = len(value)
    if (num_Descriptions >= 50):
        num_Descriptions
        wordcloud_draw("FB",value["text"], key, 'white')