In [1]:
import streamlit as st
import tweepy
# from wordcloud import WordCloud
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
# Import API key
from config import consumerKey
from config import consumerSecret
from config import accessToken
from config import accessTokenSecret

In [2]:
# Create the authentication object
authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)

In [3]:
# Set the access token and access token secret
authenticate.set_access_token(accessToken, accessTokenSecret)

In [4]:
# Creating the API object while passing in the auth information
api = tweepy.API(authenticate, wait_on_rate_limit=True)
api

<tweepy.api.API at 0x24369ef7438>

In [64]:
posts = api.search(
    q="#rogerfederer -RT", retweeted = "False", result_type='recent', count=100, lang="en", tweet_mode="extended")

In [73]:
text_input = "rogerfederer"
tweet_handle = text_input+" -RT"
tweet_handle

'rogerfederer -RT'

In [65]:
 df = pd.DataFrame( [tweet.full_text for tweet in posts], columns=['Tweets'])
df.head()

Unnamed: 0,Tweets
0,"‘What Federer, Nadal, and Djokovic are doing i..."
1,“Have Him As A Tennis Reference”: Argentine Fo...
2,“Wasn't Very High In The Standings”: Andrey Ru...
3,Federer's comeback season...\nhttps://t.co/Flk...
4,Can you guess the brand without Googling it? \...


In [66]:
#Create a function to clean the tweets:
def cleanTxt(text):
    # Removing @mentions
    text = re.sub('@[A-Za-z0–9]+', ' ', text)
    # Removing '#' hash tag symbol
    text = re.sub('#', '', text)
     # Removing ': hash tag symbol
    text = re.sub(':', '', text)
    # Removing RT re-tweet
    text = re.sub('RT[\s]+', '', text)
    # Removing hyperlink
    text = re.sub('https?:\/\/\S+', '', text)
    
    return text

In [67]:
df["Tweets"] = df["Tweets"].apply(cleanTxt)
df["Tweets"]

0     ‘What Federer, Nadal, and Djokovic are doing i...
1     “Have Him As A Tennis Reference” Argentine Foo...
2     “Wasn't Very High In The Standings” Andrey Rub...
3     Federer's comeback season...\nhttps//t.co/Flka...
4     Can you guess the brand without Googling it? \...
                            ...                        
86    Big sis helping little bro. Aw! 😍😂🥰❤ rogerfede...
87    Federer fans after knowing that Rafael Nadal h...
88    I want to see a 21st title, Rog RogerFederer R...
89    Hey Roger, we're about to be neighbors! 👑🎾 Her...
90    RafaelNadal defeats NovakDjokovic to win Frenc...
Name: Tweets, Length: 91, dtype: object

In [68]:
df= df.dropna()

In [69]:
df= df.drop_duplicates()

In [70]:
df.count()

Tweets    91
dtype: int64

In [71]:
df['Tweets']

0     ‘What Federer, Nadal, and Djokovic are doing i...
1     “Have Him As A Tennis Reference” Argentine Foo...
2     “Wasn't Very High In The Standings” Andrey Rub...
3     Federer's comeback season...\nhttps//t.co/Flka...
4     Can you guess the brand without Googling it? \...
                            ...                        
86    Big sis helping little bro. Aw! 😍😂🥰❤ rogerfede...
87    Federer fans after knowing that Rafael Nadal h...
88    I want to see a 21st title, Rog RogerFederer R...
89    Hey Roger, we're about to be neighbors! 👑🎾 Her...
90    RafaelNadal defeats NovakDjokovic to win Frenc...
Name: Tweets, Length: 91, dtype: object

In [7]:
##Get the independent features
X = df["Tweets"]

In [8]:
X.shape

(100,)

In [9]:
messages = X.copy()

In [26]:
df

Unnamed: 0,Tweets
0,73_2 if you miss ...
1,73_2 if you miss ...
2,"Pretty high, in exhibition matches"
3,Only a fool would write off the Maestro. \n\...
4,Two legends faced off at Wimbledon in 2001. P...
...,...
95,Analysing ATP players' forehands a thread ✨\...
96,Tennis icon 🤝 Pop music icon\n\nThrowback to...
97,Analysing ATP players' forehands a thread ✨\...
98,Analysing ATP players' forehands a thread ✨\...


In [15]:
import tensorflow as tf

In [16]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

## Pre-processing the tweets to fit our ML model

In [17]:
import nltk
import re
from nltk.corpus import stopwords

In [18]:
#In order to remove words that are not meanningful (e.g. the, a, then, often...), we need to download those words.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Babette\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [38]:
##Data Preprocessing/Cleaning
from nltk.stem.porter import PorterStemmer
#Initialise PorterStemmer for Stemming
ps = PorterStemmer()
#Create an empty list named corpus that will contain our cleaned sentences and words
corpus = []
#Create a loop to clean all the text in messages:
for i in range(0, len(messages)):
    #print index
    print(i)
    # Removing hyperlink
    review = re.sub('https?:\/\/\S+', '', messages[i])
    #use re (regular expressions) to substitute all characters except [a-zA-Z] by blank in message 'text'
    review = re.sub('[^a-zA-Z]', ' ', messages[i])
    #convert all the characters as lower case
    review = review.lower()
    #split all the words in each sentence to be able to later remove the stopwords
    review = review.split()
    
    #create a loop in review: for each word in review, keep only words that are not stopwords list and apply 'Stemming'
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    #join words with a space to build the review
    review = ' '.join(review)
    #append the review into the corpus
    corpus.append(review)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [39]:
#Checking the new sentences in the corpus
for i in range(0,5):
    print(corpus[i])

rt camerlengo rt miss rogerfeder http co zwndtluqwq
rt camerlengo rt miss rogerfeder http co zwndtluqwq
tenni rogerfeder pretti high exhibit match
rt tenni fool would write maestro rate rogerfeder chanc http co smnnzft id
two legend face wimbledon pete sampra vs year old rogerfeder shock champ pete wimbledon titl roger tenni atptour http co sgjqg jnff
