# NBA Free Agency

This notebook aims to determine whether each of the top 10 NBA free agents (or players who hold a player option in their contracts) is likely to stay with or leave their team during free agency or at the mid-season trade deadline. This will be determined by applying NLP techniques to phrases extracted from tweets, news, interviews, polls, basketball stats, etc. 

#### Top 10 NBA Free Agents 2019
According to SBNation and ESPN these players are: <br>
Link: https://www.sbnation.com/nba/2018/7/30/17616436/nba-free-agency-2019-list-kevin-durant-kyrie-irving

1. Kevin Durant
2. Kawhi Leonard
3. Kyrie Irving
4. Jimmy Butler 
5. Klay Thompson
6. DeMarcus Cousins
7. Al Horford
8. Kemba Walker
9. Khris Middleton
10. Eric Bledsoe

### Importing Libraries

In [1]:
#Web Scraping
from bs4 import BeautifulSoup
import requests
import urllib
import string

#Text Processing
#Download package for word_tokenize and lemmatize NLTK functions 
#  1. punkt
#  2. stopwords
#  3. wordnet
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
#Fetch the three NLTK data packages listed above (punkt, stopwords, wordnet);
#nltk reports a package as "already up-to-date" when it is already installed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raghavagovil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raghavagovil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raghavagovil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Links

In [2]:
#Scraping the result links from a Google search
def return_links(user_query):
    '''
    Run a Google search for `user_query` and collect the result links.

    Parameters:
        user_query (str): free-text search terms, e.g. "Lebron James Twitter"

    Returns:
        list of str: the href of every result heading with its first 7
        characters removed (presumably Google's "/url?q=" redirect prefix -
        confirm against a live page). The links are otherwise raw and are
        expected to be passed through clean_links(). An empty list is
        returned when the search request does not succeed.

    Side effects:
        When Google answers with a non-success status, the query is appended
        to Error-Logs/query_errors.txt (the directory is created if missing).
    '''
    import os

    #Initialize empty list to store all links scraped from the google query
    links = []
    google_search = "https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1"
    #Hand the query to requests via `params` so it is URL-encoded; pasting it
    #onto the URL by hand breaks for queries containing '&', '#', '+' etc.
    r = requests.get(google_search, params={"q": user_query})
    #If google query is valid and returns a success status
    if r.ok:
        #Parse the returned object into an HTML format that bs4 understands
        soup = BeautifulSoup(r.text, "html.parser")
        #Go through each result heading and keep the link it points to
        for item in soup.find_all('h3', attrs={'class': 'r'}):
            anchor = item.a
            #Some headings carry no anchor/href; skip them instead of crashing
            if anchor is None or not anchor.get('href'):
                continue
            links.append(anchor['href'][7:])
    #If status fails/404 error/page does not load correctly
    else:
        #Save the failing query in an error log
        os.makedirs("Error-Logs", exist_ok=True)
        with open("Error-Logs/query_errors.txt", "a+", encoding="utf-8") as f:
            f.write("\n")
            f.write(user_query)
    return links

#Cleaning the extracted links so they become valid URLs
def clean_links(links):
    '''
    Clean a list of links scraped from a Google results page.

    Each link is cut at its first '&' or '%' (whichever comes first), which
    drops the extension appended to the scraped URL. Entries that start with
    '?' after this trimming are leftovers of the search query itself, not
    real URLs, and are removed.

    Parameters:
        links (list of str): raw links; the list is updated in place

    Returns:
        list of str: the same list object, now holding the cleaned links

    Side effects:
        Prints every remaining link with its 1-based position.
    '''
    cleaned = []
    for link in links:
        #Keep only the part before the first '&' / '%'
        for marker in ('&', '%'):
            link = link.partition(marker)[0]
        #Only a link that *starts* with '?' is the invalid search-query link;
        #a '?' further along is an ordinary query string and must be kept
        if link.startswith('?'):
            continue
        cleaned.append(link)
    #Write back into the caller's list so in-place users still see the result
    links[:] = cleaned
    for position, link in enumerate(links, start=1):
        print(position, link)
    return links

### Text

In [3]:
#Data Processing:
def clean_text(text):
    '''
    Turn raw page text into a list of meaningful, normalized words.

    Data Processing consists of:
    1. Converting text to lowercase
    2. Tokenizing and keeping only purely alphabetic tokens
       (this is what drops punctuation, numbers and markup symbols)
    3. Lemmatizing each token as a verb
    4. Removing English stop words
    5. Removing common HTML markup words

    Parameters:
        text (str): the raw text to process

    Returns:
        list of str: the processed words, in their original order
    '''
    #1. Convert to lower case and split into tokens
    tokens = word_tokenize(text.lower())
    #2. Keep alphabetic tokens only; tokens with digits or punctuation go
    alpha_tokens = [token for token in tokens if token.isalpha()]
    #3. Lemmatize after tokenizing so that words which had punctuation
    #   attached (e.g. "running,") are normalized too
    lemma = WordNetLemmatizer()
    lemmas = [lemma.lemmatize(token, pos = "v") for token in alpha_tokens]
    #4. Stop words such as "at" and "the" don't contribute to meaning
    stop_words = set(stopwords.words('english'))
    #5. Common html markup words left over from scraped pages
    html_words = {'html','http','https','.com','.org','.edu',
                  'img', 'href', 'span', 'b', 'u'}
    #Filter the processed words themselves (not the stop-word list)
    return [word for word in lemmas
            if word not in stop_words and word not in html_words]

#Writing the processed data to csv files
#Running total of excerpt files written; it persists across save_text calls
#so file names keep increasing and the printed total is cumulative
count = 0
def save_text(links):
    '''
    Download every link, process the text of its <p> tags and save it.

    For each page that loads successfully, the link followed by the output
    of clean_text() is written to Excerpts/excerpt<count>.csv and the
    module-level `count` is incremented. Links that cannot be fetched, or
    that return an unsuccessful status, are appended to
    Error-Logs/text_saving_errors.txt and processing continues with the
    next link.

    Parameters:
        links (list of str): the URLs to download

    Side effects:
        Writes files (creating the Excerpts / Error-Logs directories when
        missing), updates the global `count`, and prints the running total
        of saved files.
    '''
    import os
    global count
    for link in links:
        try:
            r = requests.get(link)
        except requests.RequestException:
            #A malformed or unreachable URL is logged like any other failure
            #instead of aborting the remaining links
            r = None
        if r is not None and r.ok:
            soup = BeautifulSoup(r.content, "html.parser")
            fragments = []
            for item in soup.find_all('p'):
                #str() of the contents list looks like "[...]";
                #drop the surrounding brackets
                str_contents = str(item.contents)
                fragments.append(str_contents[1:-1])
            text = clean_text("".join(fragments))
            os.makedirs("Excerpts", exist_ok=True)
            with open("Excerpts/excerpt{}.csv".format(count), "w+", encoding="utf-8") as f:
                f.write(str(link)+"\n\n")
                f.write(str(text))
            count += 1
        else:
            os.makedirs("Error-Logs", exist_ok=True)
            with open("Error-Logs/text_saving_errors.txt", "a+", encoding="utf-8") as f:
                f.write("\n")
                f.write(link)
    print("\n{} files saved.".format(count))

### Main Program

In [4]:
#Scrape the Google results for the query and clean the URLs in one step
a = clean_links(return_links("Lebron James Twitter"))
#Download each page and write its processed text to csv files
save_text(a)

1 https://twitter.com/kingjames
2 https://twitter.com/kingjames/status/1052698329795026944
3 https://twitter.com/hashtag/lebronjames
4 https://twitter.com/FOXSports/status/1060580256933793792
5 https://twitter.com/hashtag/lebron
6 https://www.usatoday.com/story/sports/nba/lakers/2018/10/29/lebron-james-michael-jordan-matchup/1804629002/
7 https://www.nbcsports.com/philadelphia/the700level/ben-simmons-sets-twitter-ablaze-tweet-lebron-james
8 https://www.cbsnews.com/pictures/the-biggest-twitter-accounts-on-earth/4/
9 https://www.instagram.com/kingjames/

9 files saved.
