## Importing relevant libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

## Reading of dataset

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [4]:
df['sentiment'].value_counts()

0    15000
1    15000
Name: sentiment, dtype: int64

## Training the model using SVM

In [5]:
# Function to fit the model and return the tfidf frequency parameter which is 
# used to convert the text data to numerical data. Function also fits the data to svm model and returns
# tfidf and fitted model component.
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer()

    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('Shape of X:', X.shape)

    svm_clf = LinearSVC()
    svm_clf.fit(X_train, y_train)

    y_pred = svm_clf.predict(X_test)

    print('Printing report')
    print(classification_report(y_test,y_pred)) # Classification report to evaluate performance metrics of the model.
    
    return tfidf, svm_clf

In [6]:
%%time
tfidf, clf= run_svm(df)

Shape of X: (30000, 40854)
Printing report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 821 ms, sys: 25.5 ms, total: 847 ms
Wall time: 851 ms


In [7]:
x = 'You are a good person. I hate you.'

clf.predict(tfidf.transform([x]))

array([1])

## Preprocessing using self built preprocessing package

In [8]:
import preprocess_kgptalkie as pp

In [9]:
pp.__version__

'0.10.3'

In [41]:
# help(pp)

In [11]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [12]:
df['twitts'] = df['twitts'].apply(lambda x: pp.cont_exp(x))

In [13]:
df['twitts'].sample(5)

2362     all i have to say right now is: *sigh* robert ...
24626    so my computer is in this city but i was not h...
13424    going to six flags nicole, lee, alicia, and as...
7287     @introversimondu sounds good.  and yeah, i tur...
24216    thank you lord for this another wonderful morn...
Name: twitts, dtype: object

In [14]:
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0


In [15]:
tfidf, clf = run_svm(df)

Shape of X: (30000, 40753)
Printing report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [16]:
# Preprocessing of data before evaluation. It is also called cleaning of data in simple terms.
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))

In [17]:
tfidf, clf = run_svm(df)

Shape of X: (30000, 42855)
Printing report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [18]:
x = 'You are a good person. I hate you.'

clf.predict(tfidf.transform([x]))

array([1])

## Fine tuning SVM model

In [19]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']
    
    # Tuning the parameters of tfidf vectorizer for more optimal results.
    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)

    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('Shape of X:', X.shape)

    svm_clf = LinearSVC()
    svm_clf.fit(X_train, y_train)

    y_pred = svm_clf.predict(X_test)

    print('Printing report')
    print(classification_report(y_test,y_pred))
    
    return tfidf, svm_clf

run_svm(df)

Shape of X: (30000, 5000)
Printing report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.76      0.74      0.75      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(max_features=5000, ngram_range=(1, 2), norm='l1'),
 LinearSVC())

## Saving and Loading the model using pickle

In [20]:
import pickle

In [21]:
pickle.dump(clf, open('clf.pkl','wb'))
pickle.dump(tfidf, open('tfidf.pkl','wb'))

In [22]:
del clf
del tfidf

In [23]:
clf = pickle.load(open('clf.pkl','rb'))
tfidf = pickle.load(open('tfidf.pkl','rb'))

In [24]:
clf

LinearSVC()

In [25]:
# tfidf.vocabulary_   it is to check the tokenization of words. It shows how each word is assigned a specific token. 

In [26]:
clf.predict(tfidf.transform([x]))

array([1])

## Real Time Twitter Sentiment Analysis

In [27]:
consumer_key = 'LEkrbfNx0DT2ehJsMI9tiQLIT'
consumer_secret = '9dmZSRWGF8SGjY4cp762k7Q0ZLw7exhD6pCS1h4CRRxADigtas'
access_token = '1360377026071457802-ZudusLqN3y7pPTifcLjtaUMArHkuE6'
access_token_secret = 'HJPg3bGA1lhvKUcz2cTCeSw8hifLiZtT0gyi6ZzPA93ct'

In [28]:
import tweepy

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline() #Collects the posts from your timeline

In [29]:
for tweet in public_tweets:
    print(tweet.text)

Deceased farmer’s nephew not entitled to €1m Kerry Group shares, court rules https://t.co/HAuUzOPFpk
🗣️ 🗣️ "We have never complained about the pitch in the past."

#TeamIndia's ace spinner @ashwinravi99 sets the reco… https://t.co/RsAWMCaQmH
RT @IrishTimesBiz: Get the latest business news and commentary from our expert business team every weekday with Business Today: a newslette…
RT @IrishTimesWorld: Ireland condemns Israeli destruction of Bedouin homes in West Bank https://t.co/XUNCHAXA6n via @IrishTimesWorld
Monkey at large after it escapes from its enclosure at Dublin Zoo https://t.co/sxdWZnVLHW
Gardaí examine phones of brothers involved in Cork murder-suicide https://t.co/SCZLjdSH1k
खिलौनों के क्षेत्र में भारत के पास Tradition भी है और Technology भी।
 
भारत के पास Concepts भी हैं, और Competence… https://t.co/FnioTnwheI
आजकल परिवारों में प्लेटाइम की जगह स्क्रीनटाइम ने ले ली है। लेकिन आपको खेल और खिलौनों की भूमिका को जरूर समझना चाहिए।… https://t.co/812bmNOINI
Roy Keane leaves his Ins

## Tracking Kewords

In [30]:
import json
from textblob import TextBlob
import csv

In [31]:
clf = pickle.load(open('clf.pkl','rb'))
tfidf = pickle.load(open('tfidf.pkl','rb'))

In [32]:
def predict_sentiment(x):
    x = [x]
    sent = clf.predict(tfidf.transform(x))
    return sent 

In [33]:
predict_sentiment('what i am not best')[0]

0

In [34]:
keyword = ['usa', 'china']

In [38]:
usa = 0
china = 0

# Creating a file in which marks the beginning of file by adding headers.
with open('sentiment.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=keyword)
    writer.writeheader()
    
class MyStreamListener(tweepy.StreamListener):
    
    def on_status(self, status):
        print(status.text)
    
    def on_data(self, data):
        raw_tweets = json.loads(data)
        #print(raw_tweets)
        try:
            x = str(raw_tweets['text']).lower()
            x = pp.cont_exp(x)
            x = pp.remove_emails(x)
            x = pp.remove_urls(x)
            x = pp.remove_html_tags(x)
            x = pp.remove_rt(x)
            x = pp.remove_special_chars(x)
            
            #blob = TextBlob(x)
#             sentiment = predict_sentiment(x)[0]
#             print(sentiment)
            
            global usa
            global china
            
            if 'usa' in x and 'china' not in x:
                sent = predict_sentiment(x)[0]
                usa = usa + sent
            elif 'china' in x and 'usa' not in x:
                sent = predict_sentiment(x)[0]
                china = china + sent
            else:
                pass
            
            print('usa: ',usa, 'china: ',china)
            
            with open('sentiment.csv', 'a') as file:
                writer = csv.DictWriter(file, fieldnames=keyword)
                info = {
                    'usa': usa,
                    'china': china
                }
                writer.writerow(info)
            
        except:
            pass
    
    def on_error(self, status_code):
        if status_code == 420:
            print('Error 420')
            #returning False in on_error disconnects the stream
            return False

In [39]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)

In [40]:
myStream.filter(track=keyword)

usa:  0 china:  1
usa:  0 china:  1
usa:  0 china:  1
usa:  0 china:  1
usa:  0 china:  1
usa:  0 china:  2
usa:  0 china:  2
usa:  0 china:  2
usa:  0 china:  3
usa:  0 china:  4
usa:  0 china:  4
usa:  0 china:  4
usa:  0 china:  4
usa:  0 china:  5
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  6
usa:  0 china:  7
usa:  0 china:  8
usa:  0 china:  9
usa:  0 china:  9
usa:  1 china:  9
usa:  1 china:  9
usa:  2 china:  9
usa:  2 china:  9
usa:  2 china:  10
usa:  2 china:  10
usa:  3 china:  10
usa:  3 china:  11
usa:  3 china:  11
usa:  3 china:  11
usa:  3 china:  12
usa:  3 china:  12
usa:  3 china:  13
usa:  3 china:  13
usa:  3 china:  13
usa:  3 china:  13
usa:  3 china:  13
usa:  3 china:  14
usa:  4 china:  14
usa:  4 china:  15
usa:  5 china:  15
usa:  6 china:  15
usa:  6 china:  16
usa:  6 china:  16
usa:  6 china:  16
usa:  6 china:  16
usa:  6 china:  17
usa:  6 china:  17
usa:  6 china:  17
usa

usa:  77 china:  87
usa:  77 china:  87
usa:  77 china:  88
usa:  77 china:  88
usa:  77 china:  88
usa:  77 china:  88
usa:  77 china:  89
usa:  77 china:  89
usa:  77 china:  89
usa:  77 china:  89
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  90
usa:  77 china:  91
usa:  78 china:  91
usa:  79 china:  91
usa:  80 china:  91
usa:  80 china:  91
usa:  80 china:  91
usa:  81 china:  91
usa:  81 china:  91
usa:  82 china:  91
usa:  82 china:  91
usa:  82 china:  92
usa:  82 china:  92
usa:  82 china:  92
usa:  82 china:  92
usa:  82 china:  92
usa:  82 china:  92
usa:  83 china:  92
usa:  83 china:  92
usa:  83 china:  92
usa:  83 china:  93
usa:  83 china:  93
usa:  83 china:  93
usa:  83 china:  93
usa:  83 china:  93
usa:  83 china:  94
usa:  83 china:  94
usa:  84 china:  94
usa:  84 china:  95
usa:  84 china:  95
usa:  84 china:  95
usa:  84 china:  95
usa:  84 china:  95


usa:  138 china:  164
usa:  138 china:  164
usa:  138 china:  165
usa:  138 china:  165
usa:  138 china:  165
usa:  138 china:  165
usa:  138 china:  165
usa:  139 china:  165
usa:  139 china:  165
usa:  139 china:  165
usa:  140 china:  165
usa:  140 china:  165
usa:  140 china:  165
usa:  141 china:  165
usa:  141 china:  165
usa:  141 china:  165
usa:  141 china:  165
usa:  141 china:  166
usa:  141 china:  166
usa:  141 china:  167
usa:  142 china:  167
usa:  142 china:  167
usa:  142 china:  168
usa:  142 china:  168
usa:  142 china:  169
usa:  142 china:  170
usa:  142 china:  170
usa:  142 china:  170
usa:  142 china:  170
usa:  142 china:  171
usa:  142 china:  171
usa:  143 china:  171
usa:  143 china:  171
usa:  144 china:  171
usa:  144 china:  171
usa:  144 china:  171
usa:  144 china:  171
usa:  144 china:  172
usa:  144 china:  172
usa:  145 china:  172
usa:  145 china:  172
usa:  145 china:  172
usa:  145 china:  173
usa:  145 china:  173
usa:  145 china:  173
usa:  145 

KeyboardInterrupt: 