In [1]:
import numpy as np
import pandas as pd
import json

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [3]:
df.isna().sum()

twitts       0
sentiment    0
dtype: int64

In [4]:
df.sentiment.value_counts()

0    15000
1    15000
Name: sentiment, dtype: int64

In [5]:
def run_svm(df):
    X=df['twitts']
    y=df['sentiment']
    
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(df['twitts'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)
                            
    print('shape of X: ', X.shape)                        
    
    #Train Model
    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf
    

In [6]:
%%time
tfidf, clf = run_svm(df)

shape of X:  (30000, 40854)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 542 ms, sys: 13.1 ms, total: 556 ms
Wall time: 553 ms


In [7]:
x = ['i am really happy. thanks a lot for coming with me']

In [8]:
clf.predict(tfidf.transform(x))

array([1])

# Data Cleaning and Retraining SVM

In [9]:
import preprocess_andy as pp

In [10]:
dir(pp)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'get_avg_wordlength',
 'get_characcounts',
 'get_cont_exp',
 'get_digit_counts',
 'get_emails',
 'get_hashtag_counts',
 'get_mentions_counts',
 'get_stopwords_counts',
 'get_uppercase_counts',
 'get_urls',
 'get_value_counts',
 'get_wordcount',
 'make_to_base',
 'remove_accented_chars',
 'remove_common_words',
 'remove_emails',
 'remove_html_tags',
 'remove_rarewords',
 'remove_rt',
 'remove_special_chars',
 'remove_stopwords',
 'remove_urls',
 'spelling_correction',
 'utils']

# Apply lower and Contractions

In [11]:
df['twitts'] = df['twitts'].apply(lambda x: x.lower())

In [12]:
df['twitts'] = df['twitts'].apply(lambda x: pp.get_cont_exp(x))

In [13]:
df.head()

Unnamed: 0,twitts,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0


In [14]:
run_svm(df)

shape of X:  (30000, 40846)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

# REMOVE EMAILS

In [15]:
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_emails(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_urls(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_rt(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_html_tags(x))
df['twitts'] = df['twitts'].apply(lambda x: pp.remove_special_chars(x))

In [16]:
df

Unnamed: 0,twitts,sentiment
0,robbiebronniman sounds like a great night,1
1,damn the person who stolde my wallet may karma...,1
2,greetings from the piano bench photo,1
3,drewryanscott i love it i love you haha forget...,1
4,kissthestars pretty pretty pretty please pakid...,0
...,...,...
29995,calumfan1 is it in any way related to photoshop,0
29996,swiz_nz really wow thats crap,0
29997,at the 2010 lexus hs250h press event again can...,0
29998,karmicunderpath ooooh now there is a nice thought,1


In [17]:
%%time
tfidf, clf = run_svm(df)

shape of X:  (30000, 42983)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 545 ms, sys: 4.67 ms, total: 550 ms
Wall time: 548 ms


In [18]:
clf.predict(tfidf.transform(x))

array([1])

# FINE TUNING MODEL

In [19]:
def run_svm(df):
    X = df['twitts']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

run_svm(df)

shape of X:  (30000, 5000)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.77      0.75      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(max_features=5000, ngram_range=(1, 2), norm='l1'),
 LinearSVC())

# SAVING AND LOADING THE MODEL

In [20]:
import pickle

In [21]:
pickle.dump(clf, open('clf.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [22]:
del clf
del tfidf

In [23]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [24]:
#tfidf.vocabulary_

In [25]:
clf.predict(tfidf.transform(x))

array([1])

# REAL TIME SENTIMENT ANALYSIS

In [26]:
consumer_key = 'i47xjSBBe07R8zGauaQbl7oBA'
consumer_secret = 'sZoeCAoTO5zrHi270dBhhQhL203I6kQUjfKSRABrvQTxeSH9Zj'

In [27]:
access_token = '232661448-abXPKetKXDIWIU7RHZwstkNxw7vlONjKgjhDBEM7'
access_token_secret = 'mxJbEvny2kv3UsP6DkwSQAG11VlYnZJy3O2s0ywdYOl53'

In [28]:
import tweepy

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline()
for tweet in public_tweets:
    print(tweet.text)

RT @alisher_ai: Last 1000 members to reach 10k members 🥳 Thank you for joining @__MLT__ and looking forward to learning and contributing to…
RT @into_AI: How can radiologists benefit from AI in 2021? - May 12, 2021 -- So let's say y https://t.co/X9vmDnKQpG #ai #intoAInews
RT @subex: Read our next blog to know the challenges faced by CIOs and how these challenges can be addressed by leveraging emerging technol…
RT @Rohit_kasture: For the development and testing of #autonomous vehicles, the #Cloud is the environment of choice. However, rapidly expan…
RT @Samu3lR0y: Pico4ML Brings Machine Learning To the Raspberry Pi Pico - Tom's Hardware

Read more here: https://t.co/fX08wxHVyr

#Artific…
RT @Clement_MENGUE: My Disruptive Tech Deal Daily! https://t.co/Vw7uyokFik Thanks to @grjenkin #artificialintelligence #machinelearning
RT @HarbRimah: #infographic Investing in Core Cybersecurity Technology is a must 

#AI #data #CyberSecurity #technology #tech #cybercrime #…
RT @datos_digital: Cisco Ac

# TRACKING KEYWORDS ON TWITTER

In [29]:
import json

In [30]:
track_keyword = ['usa','china']

In [31]:
class MyStreamListener(tweepy.StreamListener):
    def on_status(self, status):
        print(status.text)

    def on_data(self, data):
        raw_twitt = json.loads(data)
        try:
            print(raw_twitt['text'])
        except:
            pass

    def on_error(self, status_code):
        if status_code == 420:
            #returning False in on_error disconnects the stream
            return False

In [32]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)

In [None]:
myStream.filter(track=track_keyword)