In [1]:
import streamlit as st
import tweepy
# from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# Import API key
from config import consumerKey
from config import consumerSecret
from config import accessToken
from config import accessTokenSecret

In [2]:
#from Datasets.trainML import word_embedding

In [3]:
# Create the authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)

In [4]:
# Set the access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)

In [5]:
# Creating the API object while passing in the auth information
api = tweepy.API(authenticate, wait_on_rate_limit=True)
api

<tweepy.api.API at 0x280933e7588>

In [57]:
posts = api.search(
    q="@DanAndrewsmp -RT", result_type='recent', count=100, lang="en", tweet_mode="extended")

In [58]:
df = pd.DataFrame( [tweet.full_text for tweet in posts], columns=['Tweets'])
df

Unnamed: 0,Tweets
0,@TimWilsonMP @DanAndrewsMP Trying to will your...
1,@LastWave_ @JewsDownUnder @AlboMP @DanAndrewsM...
2,@JewsDownUnder @LastWave_ @AlboMP @DanAndrewsM...
3,@Carolewalker13 @TimWilsonMP @DanAndrewsMP Who...
4,So tell me something @DanAndrewsMP WHY is the ...
...,...
95,"@TimWilsonMP @DanAndrewsMP You haven't, so you..."
96,"@TimWilsonMP @DanAndrewsMP Cmon Tim, it's the ..."
97,@TimWilsonMP @DanAndrewsMP Do you do anything ...
98,@TimWilsonMP @DanAndrewsMP Oh shut up Tim. Ple...


In [59]:
#Create a function to clean the tweets:
def cleanTxt(text):
    # Removing @mentions
    text = re.sub('@[A-Za-z0–9]+', ' ', text)
    # Removing '#' hash tag symbol
    text = re.sub('#', '', text)
     # Removing ': hash tag symbol
    text = re.sub(':', '', text)
    # Removing RT re-tweet
    text = re.sub('RT[\s]+', '', text)
    # Removing hyperlink
    text = re.sub('https?:\/\/\S+', '', text)
    # Removing hyperlink
    text = re.sub('http:\/\/\S+', '', text)
    
    return text

In [60]:
df["Tweets"] = df["Tweets"].apply(cleanTxt)
df["Tweets"]

0            Trying to will yourself into relevance Tim
1      _         Australia?\nIn accordance with the ...
2        _       The power to dismiss a Premier when...
3      13     Who lives in America? I used to live i...
4     So tell me something   WHY is the 25km ‘rule’ ...
                            ...                        
95        You haven't, so you can sod off and try to...
96        Cmon Tim, it's the LNP loving racist MSM t...
97        Do you do anything except whine for attent...
98        Oh shut up Tim. Please just shut up and le...
99          Bit of both, ADS a major factor, relevan...
Name: Tweets, Length: 100, dtype: object

In [61]:
df= df.dropna()

In [62]:
df= df.drop_duplicates()

In [63]:
df.count()

Tweets    100
dtype: int64

In [64]:
df['Tweets'][75]

'    Aged care, Ruby Princess, Covidsafe app, sports rorts, Sydney airport. Holier than though art?'

In [65]:
##Get the independent features
X = df["Tweets"]

In [66]:
X.shape

(100,)

In [67]:
messages = X.copy()

In [68]:
import tensorflow as tf

In [69]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

## Pre-processing the tweets to fit our ML model

In [70]:
import nltk
import re
from nltk.corpus import stopwords

In [71]:
#In order to remove words that are not meanningful (e.g. the, a, then, often...), we need to download those words.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Babette\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [72]:
##Data Preprocessing/Cleaning
from nltk.stem.porter import PorterStemmer
#Initialise PorterStemmer for Stemming
ps = PorterStemmer()
#Create an empty list named corpus that will contain our cleaned sentences and words
corpus = []
#Create a loop to clean all the text in messages:
for i in range(0, len(messages)):
    #print index
    print(i)
    #use re (regular expressions) to substitute all characters except [a-zA-Z] by blank in message 'text'
    review = re.sub('[^a-zA-Z]', ' ', messages[i])
    #convert all the characters as lower case
    review = review.lower()
    #split all the words in each sentence to be able to later remove the stopwords
    review = review.split()
    
    #create a loop in review: for each word in review, keep only words that are not stopwords list and apply 'Stemming'
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    #join words with a space to build the review
    review = ' '.join(review)
    #append the review into the corpus
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [73]:
#Checking the new sentences in the corpus
for i in range(0,10):
    print(corpus[i])

tri relev tim
australia accord convent westminst system parliamentari govern nevertheless governor retain reserv power crown right dismiss premier
power dismiss premier lost confid parliament power dismiss premier act unlaw power refus dissolv hous assembl despit request premier http co hmkk imtql
live america use live us return back australia children month back
tell someth km rule still place medic reason piec asshol held account pain suffer caus friend famili forget http co azitydecew
governor dismiss governor ball kick
puppet hell bent destroy victoria orchestr nd wave guis incompet divis amongst commun public vicpol begin
read constitut
governor dismiss premier governor retain reserv power crown right dismiss premier
contract still ipa paper tim


## One-Hot representation of words/sentences

In [74]:
#Apply One_hot representation for each word in the corpus based on the voc_size - each word is allocated a number within the sentence.
voc_size = 5000
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[4031, 3275, 2367],
 [939,
  1549,
  4538,
  1964,
  1785,
  1825,
  2211,
  3184,
  2729,
  850,
  3403,
  4786,
  3751,
  130,
  1621,
  4307],
 [4786,
  1621,
  4307,
  589,
  2364,
  1478,
  4786,
  1621,
  4307,
  4081,
  2084,
  4786,
  3902,
  2085,
  1466,
  1112,
  65,
  2177,
  4307,
  1175,
  4350,
  3181,
  3008],
 [653, 4186, 4662, 653, 3035, 2878, 2452, 939, 1799, 378, 2452],
 [1661,
  1009,
  218,
  19,
  437,
  137,
  3667,
  4901,
  2227,
  3856,
  1368,
  4599,
  4602,
  1809,
  4146,
  1033,
  4468,
  2820,
  1175,
  4350,
  1251],
 [2729, 1621, 2729, 3359, 114],
 [1722,
  2944,
  4358,
  4458,
  2242,
  1181,
  2126,
  4323,
  4198,
  4109,
  1150,
  681,
  1267,
  1311,
  1517,
  1873],
 [1739, 3447],
 [2729, 1621, 4307, 2729, 850, 3403, 4786, 3751, 130, 1621, 4307],
 [1320, 437, 2523, 329, 2367],
 [2619,
  4000,
  4227,
  835,
  3570,
  4673,
  2333,
  98,
  130,
  644,
  4109,
  2980,
  649,
  65,
  2515,
  509,
  3823,
  1639,
  3250,
  3421,
  2411],
 [2173, 1

In [75]:
#Checking the maximum length of all sentences.
number_words=[]
for i in range(0,len(messages)):
    number_words.append(len(onehot_repr[i]))

In [76]:
import numpy as np

## Word Embedding

In [77]:
#Use pad sequencing to ensure all sentences are the same length.
#Set up the common length of each sentence. 31 words as per our trained modle
sent_length=31
#Embebbed each sentence as a matrix
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 4031 3275 2367]
 [   0    0    0 ...  130 1621 4307]
 [   0    0    0 ... 4350 3181 3008]
 ...
 [   0    0    0 ... 1442 3389 3556]
 [   0    0    0 ... 2980 1350 1966]
 [   0    0    0 ...  763 3678 1246]]


In [78]:
#Check first sentence embedded with 31 words
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0, 4031, 3275, 2367])

In [79]:
#Checking the shape of embedded_docs
len(embedded_docs)

100

In [80]:
#Storing embedded_docs into an array
X_final = np.array(embedded_docs)
X_final

array([[   0,    0,    0, ..., 4031, 3275, 2367],
       [   0,    0,    0, ...,  130, 1621, 4307],
       [   0,    0,    0, ..., 4350, 3181, 3008],
       ...,
       [   0,    0,    0, ..., 1442, 3389, 3556],
       [   0,    0,    0, ..., 2980, 1350, 1966],
       [   0,    0,    0, ...,  763, 3678, 1246]])

## Loading the model and predicting the values

In [81]:
# Load the model
from tensorflow.keras.models import load_model
model = load_model("Datasets/tweeter_ml_trained_50000.h5")

In [82]:
#Predict y values on X_final
y_pred=model.predict_classes(X_final)

In [83]:
df["Sentiment"] = y_pred
df.head(10)

Unnamed: 0,Tweets,Sentiment
0,Trying to will yourself into relevance Tim,1
1,_ Australia?\nIn accordance with the ...,0
2,_ The power to dismiss a Premier when...,0
3,13 Who lives in America? I used to live i...,0
4,So tell me something WHY is the 25km ‘rule’ ...,0
5,13 The Governor won’t dismiss \nThe G...,1
6,is a puppet and hell-bent on destroying Vi...,0
7,I’ve read the constitution...,0
8,The governor can dismiss the premier.. t...,0
9,Does your contract still have IPA on the p...,0


In [84]:
df["Sentiment"].value_counts()

1    55
0    45
Name: Sentiment, dtype: int64

In [87]:
for i in range (0,10):
    print(df["Tweets"][i])

    Trying to will yourself into relevance Tim
 _         Australia?
In accordance with the conventions of the Westminster system of parliamentary government, Nevertheless, the governor retains the reserve powers of the Crown, and has the right to dismiss the premier.
   _       The power to dismiss a Premier when he or she has lost the confidence of the Parliament;the power to dismiss a Premier or when he or she is acting unlawfully;the power to refuse to dissolve the house of Assembly despite a request from the Premier.
￼https//t.co/hMkk0IMtQl
 13     Who lives in America? I used to live in the US, but returned back to Australia with my children a few months back.
So tell me something   WHY is the 25km ‘rule’ still in place? There is no medical reason for it you piece of 💩 is it because of this 👇 - asshole! You will be held accountable for all the pain and suffering you have caused! My friends and family will not forget! https//t.co/azitYdECEw
 13       The Governor won’t dismiss   
