# IRM Project (Twitter Sentiment Analysis)

#### Project members
   ###### 1. Anuj Agrawal                    1509113020
   ###### 2. Ayush Singh                    1509113031
   ###### 3. Aakash patel                    1509113002
   ###### 4. Abhilash Pandey             1509113003
   ###### 5. Gautam Genda                 1509113041

## Import all the Dependencies

In [37]:
import pandas as pd
import numpy as np
import tweepy
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import pickle

import warnings 
warnings.filterwarnings('always')

### Connecting to the Twitter API

In [40]:

# Load the Twitter API credentials from a local pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only point this at a credentials file you created yourself, and keep that
# file out of version control.
with open('secret_twitter_credentials.pkl', 'rb') as credentials_file:
    # The context manager closes the file handle as soon as loading is done;
    # a bare open() inside the call would leave it open until garbage collection.
    Twitter = pickle.load(credentials_file)

# Authenticate with the consumer key pair, then attach the access token pair.
auth = tweepy.OAuthHandler(Twitter['consumer_key'], Twitter['consumer_secret'])
auth.set_access_token(Twitter['access_token'], Twitter['access_token_secret'])

# API client used by the following cells to query Twitter.
api = tweepy.API(auth)

warnings.filterwarnings('always')

  


### Fetch tweets by entering query 

In [7]:

# Search Twitter for the query string, requesting 100 results (count=100).
# NOTE(review): newer tweepy releases may expose this call under a different
# method name - confirm against the installed tweepy version.
public_tweets = api.search(q='indian election', count=100)
# Keep every warning visible, including repeated ones.
warnings.filterwarnings('always')

In [9]:
# Collect the text of every fetched tweet, in the order the search returned them.
tweet_list = [status.text for status in public_tweets]

In [10]:
# Number of tweet texts collected from the search results.
len(tweet_list)

100

### Label the sentiment of each tweet using [TextBlob](http://textblob.readthedocs.io/en/dev/)

In [5]:

# Label each tweet: 1 when TextBlob's polarity score is strictly positive,
# 0 otherwise (zero and negative polarity share the 0 label).
sentiments = []
for status in public_tweets:
    polarity = TextBlob(status.text).sentiment.polarity
    sentiments.append(1 if polarity > 0 else 0)

warnings.filterwarnings('always')

### creating dataset from the fetched tweets

In [11]:

# tweet_list and sentiments are built by two separate cells from
# `public_tweets`. zip() silently drops the tail of the longer list, so if the
# two cells were run against different fetches the rows would be truncated
# without any error. Fail loudly instead.
if len(tweet_list) != len(sentiments):
    raise ValueError(
        "tweet_list and sentiments have different lengths (%d vs %d); "
        "re-run the fetch and labelling cells in order."
        % (len(tweet_list), len(sentiments))
    )

# Pair every tweet text with its 0/1 sentiment label.
combined_list = list(zip(tweet_list, sentiments))

cols = ['tweet', 'sentiment']

# One row per tweet: the raw text plus its label.
data = pd.DataFrame.from_records(combined_list, columns=cols)

print(data)

                                                tweet  sentiment
0   RT @MarioPuzo901: Wake up Siddhramiah ..Ur Pap...          0
1   Wake up Siddhramiah ..Ur Pappu Vish..Vishrayya...          0
2   Siddaramaiah-BJP face off over north Indian 'i...          0
3   @TimesNow @siddaramaiah if PM and CM Yogi come...          0
4   @TimesNow Sale the chutiye south Indian dirty ...          0
5   @TimesNow Seeing certain defeat in karnataka e...          1
6   @brainy_indian @dhume @divyaspandana This was ...          0
7   @siddaramaiah @BJP4Karnataka @BSYBJP Mr. Pappu...          0
8   @BDUTT @KapilSibal @arunjaitley @MVenkaiahNaid...          0
9   RT @DVSBJP: Are you putting your words in our ...          0
10  @FlanJerry @dhume @divyaspandana If this was s...          0
11  RT @intercepted: .@RalphNader on Democrats, th...          0
12  Kapil Sibal in 2010 when they were in power.\n...          0
13  RT @AP: BREAKING: Republican Debbie Lesko wins...          1
14  RT @DVSBJP: Are you p

In [12]:
# Summary statistics of the 0/1 labels; the mean is the share of tweets labelled 1.
data['sentiment'].describe()

count    100.00000
mean       0.15000
std        0.35887
min        0.00000
25%        0.00000
50%        0.00000
75%        0.00000
max        1.00000
Name: sentiment, dtype: float64

In [13]:
# Features are the raw tweet texts; the target is the 0/1 sentiment label.
X, y = data['tweet'], data['sentiment']

# Both shapes, one per line.
print(X.shape, y.shape, sep='\n')

(100,)
(100,)


### Split the dataset into Train-Test datasets

In [14]:
# Hold out a quarter of the tweets for testing. 0.25 is the share
# train_test_split uses when no size is given; it is only spelled out here.
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

print(X_train.shape, X_test.shape, sep='\n')

(75,)
(25,)


### Create Document Term Matrix using word Frequency

In [15]:
# Bag-of-words vectorizer configured with scikit-learn's built-in English stop-word list.
vect = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training tweets only and count terms per tweet.
X_train_counts = vect.fit_transform(X_train)

# Matrix shape, its type, and then the stored (row, column) count entries.
print(X_train_counts.shape)
print(type(X_train_counts))
print(X_train_counts)

(75, 309)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 109)	1
  (0, 120)	1
  (0, 274)	1
  (0, 86)	1
  (0, 211)	1
  (0, 128)	1
  (0, 239)	1
  (0, 234)	1
  (1, 80)	1
  (1, 124)	1
  (1, 137)	1
  (1, 219)	1
  (1, 186)	1
  (1, 123)	1
  (1, 197)	1
  (1, 182)	1
  (1, 210)	1
  (1, 185)	1
  (1, 303)	1
  (1, 218)	1
  (1, 82)	1
  (1, 128)	1
  (1, 234)	1
  (2, 80)	1
  (2, 124)	1
  :	:
  (72, 293)	1
  (72, 127)	1
  (72, 234)	1
  (73, 0)	1
  (73, 230)	1
  (73, 281)	1
  (73, 227)	1
  (73, 208)	1
  (73, 73)	1
  (73, 194)	1
  (73, 212)	1
  (73, 2)	1
  (73, 252)	1
  (73, 147)	1
  (73, 120)	1
  (74, 222)	1
  (74, 176)	1
  (74, 37)	1
  (74, 134)	1
  (74, 36)	1
  (74, 29)	1
  (74, 253)	1
  (74, 197)	1
  (74, 86)	1
  (74, 128)	1


In [16]:
# Fetch the learned vocabulary once and reuse it for both previews.
# Newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides so the
# cell runs on both.
if hasattr(vect, 'get_feature_names_out'):
    # Returns a NumPy array; tolist() yields plain Python strings so the
    # printed preview looks the same as with the older method.
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

# First and last 20 vocabulary terms.
print(feature_names[:20])
print(feature_names[-20:])

['0g3jry0ygj', '0zkbek1rbp', '2010', '2016', '_janhwi', 'abt', 'abvpjnu', 'africa', 'akshaysinghel', 'aljazeera_world', 'amp', 'anxious', 'ap', 'arizona', 'army', 'arunjaitley', 'asarambapu', 'awami', 'away', 'ayodhya']
['vish', 'vishrayya', 'vishwarayya', 'voices', 'vote', 'vs', 'vsjxhev4na', 'waha', 'wake', 'wants', 'wat', 'win', 'wins', 'wit', 'words', 'worst', 'year', 'yogi', 'youth', 'zooaltrgnc']


### Create Document Term Matrix using Tf-idf (Term Frequency - Inverse Document Frequency)

In [17]:
# Re-weight the raw training counts with tf-idf; the transformer is fitted
# on the training matrix only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Shape, type and stored entries, one per line.
print(X_train_tfidf.shape, type(X_train_tfidf), X_train_tfidf, sep='\n')


(75, 309)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 234)	0.14716998869348072
  (0, 239)	0.5086743893851138
  (0, 128)	0.13561350346136428
  (0, 211)	0.46420088330235926
  (0, 86)	0.208044907246352
  (0, 274)	0.40817089198748824
  (0, 120)	0.24566085784306554
  (0, 109)	0.46420088330235926
  (1, 234)	0.19709709259503685
  (1, 128)	0.18162009446458496
  (1, 82)	0.27862369706954065
  (1, 218)	0.27862369706954065
  (1, 303)	0.27862369706954065
  (1, 185)	0.27862369706954065
  (1, 210)	0.2739599523888023
  (1, 182)	0.2739599523888023
  (1, 197)	0.2174915580551883
  (1, 123)	0.24118114236732835
  (1, 186)	0.2739599523888023
  (1, 219)	0.25665814049778024
  (1, 137)	0.25665814049778024
  (1, 124)	0.27862369706954065
  (1, 80)	0.27862369706954065
  (2, 234)	0.19709709259503685
  (2, 128)	0.18162009446458496
  :	:
  (72, 155)	0.28429026243298994
  (72, 292)	0.28429026243298994
  (72, 25)	0.28429026243298994
  (73, 120)	0.1440932660850486
  (73, 147)	0.29836480578905467
  (73, 252)	0.298364805

### Fitting Naive Bayes classifier

In [18]:
# Baseline multinomial naive Bayes model trained on the tf-idf features.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = MultinomialNB().fit(X_train_tfidf, y_train)
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

#### Pre-processing the test Data

In [19]:
# Apply the already-fitted vectorizer and tf-idf transformer to the test
# tweets. Only transform() is called, so nothing is learned from the test data.
X_test_counts = vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Shape, type and stored entries, one per line.
print(X_test_tfidf.shape, type(X_test_tfidf), X_test_tfidf, sep='\n')

(25, 309)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 303)	0.27862369706954065
  (0, 234)	0.19709709259503683
  (0, 219)	0.2566581404977802
  (0, 218)	0.27862369706954065
  (0, 210)	0.2739599523888023
  (0, 197)	0.21749155805518827
  (0, 186)	0.2739599523888023
  (0, 185)	0.27862369706954065
  (0, 182)	0.2739599523888023
  (0, 137)	0.2566581404977802
  (0, 128)	0.18162009446458494
  (0, 124)	0.27862369706954065
  (0, 123)	0.2411811423673283
  (0, 82)	0.27862369706954065
  (0, 80)	0.27862369706954065
  (1, 303)	0.27862369706954065
  (1, 234)	0.19709709259503683
  (1, 219)	0.2566581404977802
  (1, 218)	0.27862369706954065
  (1, 210)	0.2739599523888023
  (1, 197)	0.21749155805518827
  (1, 186)	0.2739599523888023
  (1, 185)	0.27862369706954065
  (1, 182)	0.2739599523888023
  (1, 137)	0.2566581404977802
  :	:
  (23, 219)	0.12519102664517115
  (23, 197)	0.1060866076048827
  (23, 130)	0.3032390179383435
  (23, 128)	0.08858946005498315
  (23, 117)	0.282626084597179
  (23, 112)	0.30323901793834

#### Testing the Test Data

In [20]:
# Predict labels for the held-out tweets and report the share predicted correctly.
# NOTE(review): the confusion matrix later in this notebook suggests most test
# tweets belong to class 0, so accuracy on its own may look better than the
# model really is - confirm against a majority-class baseline.
y_pred = clf.predict(X_test_tfidf)
baseline_accuracy = metrics.accuracy_score(y_test, y_pred)

print("Accuracy : ", baseline_accuracy)

Accuracy :  0.84


## Optimising the model in two steps
### A. Running the algorithm using a pipeline that includes the following steps
   ##### 1. creating a document-term matrix using the frequency of each word in a document
   ##### 2. re-weighting the document-term matrix using Tf-idf term weights
   ##### 3. fitting the classifier



In [21]:
# The same three stages as the manual cells above, bundled into one estimator:
# word counts -> tf-idf weighting -> multinomial naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]

# fit() returns the pipeline itself, so the fitted estimator is bound directly.
text_clf = Pipeline(pipeline_steps).fit(X_train, y_train)

### B. Optimising the algorithm by tuning the model parameters using **"GridSearchCV"**
##### Tuning of parameters includes:
###### 1. choosing whether to build the vocabulary from unigrams only or from unigrams plus bigrams
###### 2. choosing whether to enable inverse-document-frequency reweighting or not (enabled by default).
###### 3. adjusting the smoothing parameter (alpha, default value = 1) of the naive Bayes classifier

In [22]:
# Search grid; each key is "<pipeline step>__<parameter of that step>".
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],   # n-gram ranges to try for the vocabulary
    tfidf__use_idf=(True, False),         # with / without idf re-weighting
    clf__alpha=(1e-2, 1e-3),              # naive Bayes smoothing strengths
)

In [39]:
# Search over every combination in `parameters`, refitting the whole pipeline
# for each one; n_jobs=-1 lets the search use all available CPU cores.
# NOTE(review): no `cv` is passed, so the number of folds is whatever the
# installed scikit-learn uses by default - confirm if results must be comparable.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1)
gs_clf.fit(X_train, y_train)

warnings.filterwarnings('always')

In [24]:
# Parameter combination that the grid search selected as best.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}

### Fitting Model using optimal Tuned parameters

In [25]:
# Rebuild the feature extraction by hand with the settings the grid search
# selected. The values are read from gs_clf.best_params_ rather than copied in
# as literals, because the tweets are fetched live and a fresh batch can make
# the search pick different settings.
best_params = gs_clf.best_params_

vect1 = CountVectorizer(stop_words='english',
                        ngram_range=best_params['vect__ngram_range'])
# The vectorizer does not use labels, so only the texts are passed.
X_train_counts1 = vect1.fit_transform(X_train)

tfidf_transformer1 = TfidfTransformer(use_idf=best_params['tfidf__use_idf'])
X_train_tfidf1 = tfidf_transformer1.fit_transform(X_train_counts1)

# Shape, type and stored entries of the tuned training matrix.
print(X_train_tfidf1.shape)
print(type(X_train_tfidf1))
print(X_train_tfidf1)

(75, 309)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 109)	0.35355339059327373
  (0, 120)	0.35355339059327373
  (0, 274)	0.35355339059327373
  (0, 86)	0.35355339059327373
  (0, 211)	0.35355339059327373
  (0, 128)	0.35355339059327373
  (0, 239)	0.35355339059327373
  (0, 234)	0.35355339059327373
  (1, 80)	0.2581988897471611
  (1, 124)	0.2581988897471611
  (1, 137)	0.2581988897471611
  (1, 219)	0.2581988897471611
  (1, 186)	0.2581988897471611
  (1, 123)	0.2581988897471611
  (1, 197)	0.2581988897471611
  (1, 182)	0.2581988897471611
  (1, 210)	0.2581988897471611
  (1, 185)	0.2581988897471611
  (1, 303)	0.2581988897471611
  (1, 218)	0.2581988897471611
  (1, 82)	0.2581988897471611
  (1, 128)	0.2581988897471611
  (1, 234)	0.2581988897471611
  (2, 80)	0.2581988897471611
  (2, 124)	0.2581988897471611
  :	:
  (72, 293)	0.2672612419124244
  (72, 127)	0.2672612419124244
  (72, 234)	0.2672612419124244
  (73, 0)	0.2886751345948129
  (73, 230)	0.2886751345948129
  (73, 281)	0.2886751345948129
  (73, 2

In [26]:
# Naive Bayes model using the smoothing strength chosen by the grid search.
# Reading it from gs_clf.best_params_ keeps this cell in step with the search
# result instead of relying on a value copied from an earlier run.
clf1 = MultinomialNB(alpha=gs_clf.best_params_['clf__alpha'])
clf1.fit(X_train_tfidf1, y_train)

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

In [27]:
# Push the test tweets through the tuned vectorizer and transformer.
# Only transform() is called, so nothing is learned from the test data.
X_test_counts1 = vect1.transform(X_test)
X_test_tfidf1 = tfidf_transformer1.transform(X_test_counts1)

# Shape, type and stored entries, one per line.
print(X_test_tfidf1.shape, type(X_test_tfidf1), X_test_tfidf1, sep='\n')

(25, 309)
<class 'scipy.sparse.csr.csr_matrix'>
  (0, 80)	0.2581988897471611
  (0, 82)	0.2581988897471611
  (0, 123)	0.2581988897471611
  (0, 124)	0.2581988897471611
  (0, 128)	0.2581988897471611
  (0, 137)	0.2581988897471611
  (0, 182)	0.2581988897471611
  (0, 185)	0.2581988897471611
  (0, 186)	0.2581988897471611
  (0, 197)	0.2581988897471611
  (0, 210)	0.2581988897471611
  (0, 218)	0.2581988897471611
  (0, 219)	0.2581988897471611
  (0, 234)	0.2581988897471611
  (0, 303)	0.2581988897471611
  (1, 80)	0.2581988897471611
  (1, 82)	0.2581988897471611
  (1, 123)	0.2581988897471611
  (1, 124)	0.2581988897471611
  (1, 128)	0.2581988897471611
  (1, 137)	0.2581988897471611
  (1, 182)	0.2581988897471611
  (1, 185)	0.2581988897471611
  (1, 186)	0.2581988897471611
  (1, 197)	0.2581988897471611
  :	:
  (23, 128)	0.25
  (23, 130)	0.25
  (23, 197)	0.25
  (23, 219)	0.25
  (23, 234)	0.25
  (23, 247)	0.25
  (23, 249)	0.25
  (23, 253)	0.25
  (23, 258)	0.25
  (23, 272)	0.25
  (24, 80)	0.2581988897471611


### testing accuracy :

In [28]:
# Score the tuned model on the held-out tweets.
y_pred1 = clf1.predict(X_test_tfidf1)
tuned_accuracy = metrics.accuracy_score(y_test, y_pred1)

print("Accuracy :", tuned_accuracy)

Accuracy : 0.92


If the accuracy does not improve even after tuning the parameters, we probably need to collect more data.

### Analysing the result Obtained

In [29]:
# Counts of (true label, predicted label) pairs for the tuned model on the test set.
metrics.confusion_matrix(y_test, y_pred1)

array([[20,  1],
       [ 1,  3]])

In [30]:
# False negatives: the true label is greater than the predicted one
# (actual 1, predicted 0).
false_negative_mask = y_test.gt(y_pred1)
print(X_test[false_negative_mask])

44    @namo_office @narendramodi Yes every Indian ta...
Name: tweet, dtype: object


In [31]:
# Full, untruncated text of every false-negative tweet. The rows are selected
# with the misclassification mask rather than a fixed row label, because the
# label of the misclassified tweet changes whenever new tweets are fetched.
X_test[y_test > y_pred1].tolist()

'@namo_office @narendramodi Yes every Indian take responsibility all things first election commission take strict actions &amp; strict Rules'

In [32]:
# False positives: the true label is smaller than the predicted one
# (actual 0, predicted 1).
false_positive_mask = y_test.lt(y_pred1)
print(X_test[false_positive_mask])

82    @Shehla_Rashid Which congress you mean?\n1. In...
Name: tweet, dtype: object


In [33]:
# Full, untruncated text of every false-positive tweet. The rows are selected
# with the misclassification mask rather than a fixed row label, because the
# label of the misclassified tweet changes whenever new tweets are fetched.
X_test[y_test < y_pred1].tolist()

'@Shehla_Rashid Which congress you mean?\n1. Indian national congress \n2. Congress party of Kerala (Marxist)\n      Beâ€¦ https://t.co/oKbVF4lflE'

# Thank You !!!