In [1]:
import streamlit as st
import tweepy
# from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# Import API key
from config import consumerKey
from config import consumerSecret
from config import accessToken
from config import accessTokenSecret

In [135]:
# Create the authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)

In [136]:
# Set the access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)

In [137]:
# Creating the API object while passing in the auth information
api = tweepy.API(authenticate, wait_on_rate_limit=True)
api

<tweepy.api.API at 0x24374d50208>

In [138]:
posts = api.search(
    q="#rogerfederer -RT", retweeted = "False", result_type='recent', count=100, lang="en", tweet_mode="extended")

In [139]:
text_input = "rogerfederer"
tweet_handle = text_input+" -RT"
tweet_handle

'rogerfederer -RT'

In [140]:
df = pd.DataFrame( [tweet.full_text for tweet in posts], columns=['Tweets'])
df.head()

Unnamed: 0,Tweets
0,"‘What Federer, Nadal, and Djokovic are doing i..."
1,“Have Him As A Tennis Reference”: Argentine Fo...
2,“Wasn't Very High In The Standings”: Andrey Ru...
3,Federer's comeback season...\nhttps://t.co/Flk...
4,Can you guess the brand without Googling it? \...


In [141]:
#Create a function to clean the tweets:
def cleanTxt(text):
    # Removing @mentions
    text = re.sub('@[A-Za-z0–9]+', ' ', text)
    # Removing '#' hash tag symbol
    text = re.sub('#', '', text)
     # Removing ': hash tag symbol
    text = re.sub(':', '', text)
    # Removing RT re-tweet
    text = re.sub('RT[\s]+', '', text)
    # Removing hyperlink
    text = re.sub('https?:\/\/\S+', '', text)
    # Removing hyperlink
    text = re.sub('http:\/\/\S+', '', text)
    
    return text

In [142]:
df["Tweets"] = df["Tweets"].apply(cleanTxt)
df["Tweets"]

0     ‘What Federer, Nadal, and Djokovic are doing i...
1     “Have Him As A Tennis Reference” Argentine Foo...
2     “Wasn't Very High In The Standings” Andrey Rub...
3     Federer's comeback season...\nhttps//t.co/Flka...
4     Can you guess the brand without Googling it? \...
                            ...                        
86    Big sis helping little bro. Aw! 😍😂🥰❤ rogerfede...
87    Federer fans after knowing that Rafael Nadal h...
88    I want to see a 21st title, Rog RogerFederer R...
89    Hey Roger, we're about to be neighbors! 👑🎾 Her...
90    RafaelNadal defeats NovakDjokovic to win Frenc...
Name: Tweets, Length: 91, dtype: object

In [143]:
df= df.dropna()

In [144]:
df= df.drop_duplicates()

In [145]:
df.count()

Tweets    91
dtype: int64

In [146]:
df['Tweets']

0     ‘What Federer, Nadal, and Djokovic are doing i...
1     “Have Him As A Tennis Reference” Argentine Foo...
2     “Wasn't Very High In The Standings” Andrey Rub...
3     Federer's comeback season...\nhttps//t.co/Flka...
4     Can you guess the brand without Googling it? \...
                            ...                        
86    Big sis helping little bro. Aw! 😍😂🥰❤ rogerfede...
87    Federer fans after knowing that Rafael Nadal h...
88    I want to see a 21st title, Rog RogerFederer R...
89    Hey Roger, we're about to be neighbors! 👑🎾 Her...
90    RafaelNadal defeats NovakDjokovic to win Frenc...
Name: Tweets, Length: 91, dtype: object

In [147]:
##Get the independent features
X = df["Tweets"]

In [148]:
X.shape

(91,)

In [149]:
messages = X.copy()

In [150]:
import tensorflow as tf

In [151]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

## Pre-processing the tweets to fit our ML model

In [152]:
import nltk
import re
from nltk.corpus import stopwords

In [153]:
#In order to remove words that are not meanningful (e.g. the, a, then, often...), we need to download those words.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Babette\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [154]:
##Data Preprocessing/Cleaning
from nltk.stem.porter import PorterStemmer
#Initialise PorterStemmer for Stemming
ps = PorterStemmer()
#Create an empty list named corpus that will contain our cleaned sentences and words
corpus = []
#Create a loop to clean all the text in messages:
for i in range(0, len(messages)):
    #print index
    print(i)
    #use re (regular expressions) to substitute all characters except [a-zA-Z] by blank in message 'text'
    review = re.sub('[^a-zA-Z]', ' ', messages[i])
    #convert all the characters as lower case
    review = review.lower()
    #split all the words in each sentence to be able to later remove the stopwords
    review = review.split()
    
    #create a loop in review: for each word in review, keep only words that are not stopwords list and apply 'Stemming'
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    #join words with a space to build the review
    review = ' '.join(review)
    #append the review into the corpus
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90


In [155]:
#Checking the new sentences in the corpus
for i in range(0,10):
    print(corpus[i])

feder nadal djokov normal say former world number eight marco baghdati follow daili updat regard tenni rogerfeder novakdjokov http co p adk v w
tenni refer argentin footbal admir roger feder http co v nx f tw rogerfeder tenni musktreetenni footbal
high stand andrey rublev recal preciou moment roger feder http co wgrxewgwzq rogerfeder andreyrublev tenni musktreetenni
feder comeback season http co flka hnxgz tenni atp feder rogerfeder http co fgz e
guess brand without googl rogerfeder tenni http co qvojphcckv
new bg rogerfeder http co c amzktm
deni shapovalov take huge dig atp karen khachanov lost cool european open tennisnew women men wta atp tenni frenchopen franc pari grandslam ashbarti rogerfeder cocogauff naomiosaka rafaelnad serenawilliam http co f ectjfzvk
also read kitchen new york illustri eleven madison park chef rogerfeder find place chef floyd cardoz wing scrollchefofthemonth octob done read interview http co iwgtmongip scrollfood
roger vs rafa kind rivalri true tenni fan may

## One-Hot representation of words/sentences

In [156]:
voc_size = 10000

In [157]:
#Apply One_hot representation for each word in the corpus based on the voc_size - each word is allocated a number within the sentence.
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

[[9661,
  6672,
  2731,
  9739,
  1835,
  8928,
  579,
  1763,
  2422,
  7665,
  9282,
  4958,
  5270,
  8190,
  816,
  7420,
  7344,
  9059,
  7255,
  4109,
  1623,
  3504,
  2282,
  8327],
 [7420,
  702,
  9430,
  3896,
  9631,
  2545,
  9661,
  7255,
  4109,
  2282,
  3690,
  5199,
  4764,
  7344,
  7420,
  9925,
  3896],
 [4454,
  8036,
  4045,
  6849,
  869,
  5727,
  26,
  2545,
  9661,
  7255,
  4109,
  2012,
  7344,
  9138,
  7420,
  9925],
 [9661,
  4227,
  4375,
  7255,
  4109,
  9233,
  8686,
  7420,
  7395,
  9661,
  7344,
  7255,
  4109,
  1648,
  4497],
 [4105, 7352, 3249, 2898, 7344, 7420, 7255, 4109, 5967],
 [5033, 5047, 7344, 7255, 4109, 9497, 6299],
 [3395,
  1347,
  9434,
  6343,
  9152,
  7395,
  7628,
  6205,
  6757,
  8127,
  872,
  3394,
  7770,
  5132,
  5691,
  7532,
  7395,
  7420,
  8965,
  3930,
  1081,
  929,
  7608,
  7344,
  7902,
  9266,
  9344,
  2395,
  7255,
  4109,
  5199,
  7412],
 [7610,
  8516,
  9394,
  5033,
  6693,
  9664,
  6064,
  4896,
  242

In [158]:
#Checking the maximum length of all sentences.
number_words=[]
for i in range(0,len(messages)):
    number_words.append(len(onehot_repr[i]))

In [159]:
import numpy as np

## Word Embedding

In [160]:
#Use pad sequencing to ensure all sentences are the same length.
#Set up the common length of each sentence. 31 words as per our trained modle
sent_length=31
#Embebbed each sentence as a matrix
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 3504 2282 8327]
 [   0    0    0 ... 7420 9925 3896]
 [   0    0    0 ... 9138 7420 9925]
 ...
 [   0    0    0 ... 5406 8646 8725]
 [   0    0    0 ... 4109 9117 2893]
 [   0    0    0 ... 7255 4109  539]]


In [161]:
#Check first sentence embedded with 31 words
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0, 9661, 6672, 2731, 9739,
       1835, 8928,  579, 1763, 2422, 7665, 9282, 4958, 5270, 8190,  816,
       7420, 7344, 9059, 7255, 4109, 1623, 3504, 2282, 8327])

In [162]:
#Checking the shape of embedded_docs
len(embedded_docs)

91

In [163]:
#Storing embedded_docs into an array
X_final = np.array(embedded_docs)
X_final

array([[   0,    0,    0, ..., 3504, 2282, 8327],
       [   0,    0,    0, ..., 7420, 9925, 3896],
       [   0,    0,    0, ..., 9138, 7420, 9925],
       ...,
       [   0,    0,    0, ..., 5406, 8646, 8725],
       [   0,    0,    0, ..., 4109, 9117, 2893],
       [   0,    0,    0, ..., 7255, 4109,  539]])

## Loading the model and predicting the values

In [164]:
# Load the model
from tensorflow.keras.models import load_model
model = load_model("Datasets/Tweets_dataset/tweeter_ml_trained_50000.h5")

In [165]:
#Predict y values on X_final
y_pred=model.predict_classes(X_final)

In [166]:
df["Sentiment"] = y_pred
df

Unnamed: 0,Tweets,Sentiment
0,"‘What Federer, Nadal, and Djokovic are doing i...",0
1,“Have Him As A Tennis Reference” Argentine Foo...,1
2,“Wasn't Very High In The Standings” Andrey Rub...,1
3,Federer's comeback season...\nhttps//t.co/Flka...,1
4,Can you guess the brand without Googling it? \...,0
...,...,...
86,Big sis helping little bro. Aw! 😍😂🥰❤ rogerfede...,0
87,Federer fans after knowing that Rafael Nadal h...,1
88,"I want to see a 21st title, Rog RogerFederer R...",0
89,"Hey Roger, we're about to be neighbors! 👑🎾 Her...",0


In [168]:
df["Sentiment"].value_counts()

0    51
1    40
Name: Sentiment, dtype: int64

In [171]:
for i in range (87,88):
    print(df["Tweets"][i])

Federer fans after knowing that Rafael Nadal has equaled Federer's Grand Slam record and now they have no argument left to not consider Rafael Nadal as the GOAT.
RafaelNadal RogerFederer Tennis FrenchOpen RolandGarros RolandGarros2020 Nadal Federer https//t.co/sucAQieZVn
