# NBA Free Agency

This notebook aims to determine whether the top 10 NBA free agents (or players who opt for player option in their contracts) are likely stay or leave a team during free agency or the mid-season trade deadline. This will be determined using NLP techniques on phrases extracted from tweets, news, interviews, polls, basketball stats, etc. 

#### Top 10 NBA Free Agents 2019
According to SBNation and ESPN these players are: <br>
Link: https://www.sbnation.com/nba/2018/7/30/17616436/nba-free-agency-2019-list-kevin-durant-kyrie-irving

1. Kevin Durant
2. Kawhi Leonard
3. Kyrie Irving
4. Jimmy Butler 
5. Klay Thompson
6. DeMarcus Cousins
7. Al Horford
8. Kemba Walker
9. Khris Middleton
10. Eric Bledsoe

### Importing Libraries

In [1]:
#Web Scraping
from bs4 import BeautifulSoup
import requests
import urllib
import urllib3
import string
import time 

#Text Processing
#Download package for word_tokenize and lemmatize NLTK functions 
#  1. punkt
#  2. stopwords
#  3. wordnet
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/arnavgarg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arnavgarg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arnavgarg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Links

In [2]:
start = time.time()

def return_links(user_query):
    '''
    Function takes in a string as a query, 
    and scrapes the links of the top 10 websites 
    for this query on Google.
    
    Returns: A list of URLs
    '''
    #Initialize empty array to store all links scraped from google query
    links = []
    google_search = "https://www.google.com/search?sclient=psy-ab&client=ubuntu&hs=k5b&channel=fs&biw=1366&bih=648&noj=1&q=" + user_query
    r = requests.get(google_search)
    #If google query is valid and returns 200 status (True)
    if r.ok:
        #Parse the returned object into an HTML format that bs4 understands
        soup = BeautifulSoup(r.text, "html.parser")
        #Go through each item in the google page, and save the green link beneath the
        #page header and append it to the empty list
        for item in soup.find_all('h3', attrs={'class' : 'r'}):
            links.append(item.a['href'][7:])
    #If status fails/404 error/page does not load correctly/invalid URL retrieved
    else:
        #Find query_errors and save the URL in an error log
        f = open("Error-Logs/query_errors.txt","a+")
        f.write("\n")
        f.write(user_query)
        f.close()
    return links

def clean_links(links):
    '''
    This function takes a list of links, cleans each link 
    so that they become valid URLs. It also eliminates 
    most invalid links/URLs.
    
    Returns: A list of valid URLs.
    '''
    #Go through all the links and remove the extensions of the link
        #after the '&' and '%' symbols within each URL. 
    #Remove the initial invalid google search query link
    for i in range(0, len(links)-1):
        #Returns index where character is found
        x = links[i].find('&')
        y = links[i].find('%')
        flag = False
        if x != -1:
            print("Found x:",x)
            links[i] = links[i][:x]
            flag = True
        if y != -1 and not flag:
            print("Found y:",y)
            links[i] = links[i][:y]
        #Finding link that starts with '?' after cleaning
        test = re.findall(r"\?", links[i])
        #Converting list to string
        str1 = ''.join(test)
        if(str1 == '?'):
            links.remove(links[i])
        #Print link to output
        print(i+1,links[i])
    return links

### Text

In [3]:
def cleanhtml(raw_html):
    '''
    Helper function to clean html tags.
    
    Input: Block of raw HTML text in lower case.
    Returns: Text almost entirely free of HTML tags.
    '''
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = re.sub('https', '', cleantext)
    return cleantext

def clean_text(text):
    '''
    Function takes a text extract, and processes it.
    
    Data Processing consists of: 
    1. Converting text to lowercase
    2. Extracting alpha-numeric
    3. Removing stop words
    4. Removes punctuations
    5. Lemmatizes the given text
    6. Stems the given text
    7. Removes potential HTML markup tags
    8. Remove white spaces
    
    Returns: A list of tokenized, processed words.
    
    Optionally add stemming:
    #Stem words to root words if/where possible
    #porter = PorterStemmer()
    #stemmed = [porter.stem(word) for word in punc_free]
    #Remove common html markup words
    '''
    #Convert to lower case
    text = text.lower()
    #Remove HTML tags
    text = cleanhtml(text)
    #Split text, and lemmatize each word
    lemma = WordNetLemmatizer()
    normalized = " ".join(lemma.lemmatize(word, pos = "v") for word in text.split())
    #Replace all digits with blank spaces
    normalized = normalized.replace('\d+', '')
    #Remove all white spaces (strip removes white spaces by default)
    normalized = normalized.strip()
    #Tokenize the text 
    tokenized = word_tokenize(normalized)
    #Extract words that are alpha and removes punctuations
    cleaned = [word for word in tokenized if word.isalpha()]
    #Create a dictionary of stem-words such as "at" and "the"
    #that don't contribute to meaning and remove them from the list
    stop_words = set(stopwords.words('english'))
    words = [w for w in cleaned if not w in stop_words]
    return words

count = 0
def save_text(links):
    '''
    Function takes in a set of links and extracts the text
    content from each website. It then processes this text
    content and stores into a csv file titled "excerpt".
    
    Returns: N/A
    '''
    global count
    for i in range(0, len(links)):
        r = requests.get(links[i])
        if r.ok:
            soup = BeautifulSoup(r.content, "html.parser")
            text = soup.find_all('p')
            page_text = ""
            for item in text:
                str_contents = str(item.contents)
                len_contents = len(str_contents)
                page_text += str_contents[1:len_contents-1]
            text = str(clean_text(page_text))
            f = open("Excerpts/excerpt{}.csv".format(count),"w+")
            f.write(str(links[i])+"\n\n"+text)
            f.close()
            page_text = ""
            count += 1
        else:
            f = open("Error-Logs/text_saving_errors.txt","a+")
            f.write(str(links[i])+"\n")
            f.close()
    print("\n{} files saved.".format(count))

### Main Program

In [4]:
#Extracting links
a = return_links("Lebron James")
#Cleaning Data
a = clean_links(a)
#Writing to csv files
save_text(a)

end = time.time()

print("Runtime:",end-start)

Found x: 15
1 https://en.wikipedia.org/wiki/LeBron_James&sa=U&ved=0ahUKEwjGnpne-creAhXiFzQIHUNvCw8QFggdMAM&usg=AOvVaw2ytH8Siwq29xGmfpEwMOZt
Found x: 53
2 http://www.espn.com/nba/player/_/id/1966/lebron-james
Found x: 61
3 https://www.basketball-reference.com/players/j/jamesle01.html
Found x: 27
4 http://www.lebronjames.com/
Found x: 29
5 https://twitter.com/kingjames
Found x: 71
6 https://sports.yahoo.com/rumor-lebron-james-called-favor-150028976.html
Found x: 42
7 https://sports.yahoo.com/nba/players/3704/
Found x: 68
8 https://www.cbssports.com/nba/players/playerpage/400553/lebron-james
Found x: 130
9 https://www.express.co.uk/sport/othersport/1043601/NBA-news-Golden-State-Warriors-DeMarcus-Cousins-LeBron-James-Los-Angeles-Lakers

8 files saved.
Runtime: 8.629467964172363
