## Basketball Injury Data

In [3]:
import pandas as pd 
import numpy as np

csv_url = "https://raw.githubusercontent.com/anly501/dsan-5000-project-rennyd123/main/dsan-website/5000-website/data/raw/injuries_2010-2020.csv"

bball_injury_data = pd.read_csv(csv_url)
print(bball_injury_data.head())

         Date     Team Acquired   Relinquished  \
0  2010-10-03    Bulls      NaN  Carlos Boozer   
1  2010-10-06  Pistons      NaN  Jonas Jerebko   
2  2010-10-06  Pistons      NaN  Terrico White   
3  2010-10-08  Blazers      NaN     Jeff Ayres   
4  2010-10-08     Nets      NaN    Troy Murphy   

                                               Notes  
0  fractured bone in right pinky finger (out inde...  
1      torn right Achilles tendon (out indefinitely)  
2  broken fifth metatarsal in right foot (out ind...  
3          torn ACL in right knee (out indefinitely)  
4             strained lower back (out indefinitely)  


In [4]:
#Changing the Date column class to be datetime
bball_injury_data["Date"] = pd.to_datetime(bball_injury_data["Date"])
print(bball_injury_data.dtypes)


#Dropping the acquired column because we are only concerned with the players who were placed on the IL
bball_injury_data = bball_injury_data.drop(columns="Acquired")
bball_injury_data = bball_injury_data.dropna()


#Extracting injury status into a new column when applicable
bball_injury_data['InjuryStatus'] = bball_injury_data['Notes'].str.extract(r'\((.*?)\)')
bball_injury_data['Notes'] = bball_injury_data['Notes'].str.replace(r'\([^)]*\)', '').str.strip()


Date            datetime64[ns]
Team                    object
Acquired                object
Relinquished            object
Notes                   object
dtype: object


In [5]:
file_path = '../../data/01-modified-data/basketball_injury_data.csv'  
bball_injury_data.to_csv(file_path, index=False) 

## NFL Concussion Data

This data comes from [this link](https://sportsandsociety.osu.edu/sports-data-sets).


In [6]:
file_path = "../../data/00-raw-data/Concussion Injuries 2012-2014.csv"

concussions = pd.read_csv(file_path)
concussions.head()

Unnamed: 0,ID,Player,Team,Game,Date,Opposing Team,Position,Pre-Season Injury?,Winning Team?,Week of Injury,Season,Weeks Injured,Games Missed,Unknown Injury?,Reported Injury Type,Total Snaps,Play Time After Injury,Average Playtime Before Injury
0,Aldrick Robinson - Washington Redskins vs. Tam...,Aldrick Robinson,Washington Redskins,Washington Redskins vs. Tampa Bay Buccaneers (...,30/09/2012,Tampa Bay Buccaneers,Wide Receiver,No,Yes,4,2012/2013,1,1.0,No,Head,0,14 downs,37.00 downs
1,D.J. Fluker - Tennessee Titans vs. San Diego C...,D.J. Fluker,San Diego Chargers,Tennessee Titans vs. San Diego Chargers (22/9/...,22/09/2013,Tennessee Titans,Offensive Tackle,No,No,3,2013/2014,1,1.0,No,Concussion,0,78 downs,73.50 downs
2,Marquise Goodwin - Houston Texans vs. Buffalo ...,Marquise Goodwin,Buffalo Bills,Houston Texans vs. Buffalo Bills (28/9/2014),28/09/2014,Houston Texans,Wide Receiver,No,No,4,2014/2015,1,1.0,No,Concussion,0,25 downs,17.50 downs
3,Bryan Stork - New England Patriots vs. Buffalo...,Bryan Stork,New England Patriots,New England Patriots vs. Buffalo Bills (12/10/...,12/10/2014,Buffalo Bills,Center,No,Yes,6,2014/2015,1,1.0,No,Head,0,82 downs,41.50 downs
4,Lorenzo Booker - Chicago Bears vs. Indianapoli...,Lorenzo Booker,Chicago Bears,Chicago Bears vs. Indianapolis Colts (9/9/2012),9/09/2012,Indianapolis Colts,Running Back,Yes,Yes,1,2012/2013,0,,No,Head,0,Did not return from injury,


In [7]:
cols_drop = ["Game", "ID"]
concussions = concussions.drop(columns=cols_drop)
concussions.head()

Unnamed: 0,Player,Team,Date,Opposing Team,Position,Pre-Season Injury?,Winning Team?,Week of Injury,Season,Weeks Injured,Games Missed,Unknown Injury?,Reported Injury Type,Total Snaps,Play Time After Injury,Average Playtime Before Injury
0,Aldrick Robinson,Washington Redskins,30/09/2012,Tampa Bay Buccaneers,Wide Receiver,No,Yes,4,2012/2013,1,1.0,No,Head,0,14 downs,37.00 downs
1,D.J. Fluker,San Diego Chargers,22/09/2013,Tennessee Titans,Offensive Tackle,No,No,3,2013/2014,1,1.0,No,Concussion,0,78 downs,73.50 downs
2,Marquise Goodwin,Buffalo Bills,28/09/2014,Houston Texans,Wide Receiver,No,No,4,2014/2015,1,1.0,No,Concussion,0,25 downs,17.50 downs
3,Bryan Stork,New England Patriots,12/10/2014,Buffalo Bills,Center,No,Yes,6,2014/2015,1,1.0,No,Head,0,82 downs,41.50 downs
4,Lorenzo Booker,Chicago Bears,9/09/2012,Indianapolis Colts,Running Back,Yes,Yes,1,2012/2013,0,,No,Head,0,Did not return from injury,


In [8]:
file_path = '../../data/01-modified-data/nfl_concussions.csv'  
concussions.to_csv(file_path, index=False) 

## NFL Game Injury Data 

The data came from [here](https://www.kaggle.com/competitions/nfl-playing-surface-analytics/data).

In [9]:
file_path = "../../data/00-raw-data/InjuryRecord.csv"

nfl_injuries = pd.read_csv(file_path)

cols_drop = ["GameID", "PlayKey", "DM_M1", "DM_M7", "DM_M28", "DM_M42"]
nfl_injuries = nfl_injuries.drop(columns=cols_drop)
print(nfl_injuries)

     PlayerKey BodyPart    Surface
0        39873     Knee  Synthetic
1        46074     Knee    Natural
2        36557    Ankle  Synthetic
3        46646    Ankle    Natural
4        43532    Ankle  Synthetic
..         ...      ...        ...
100      44423     Knee  Synthetic
101      31933     Knee  Synthetic
102      47285     Knee    Natural
103      37068     Knee    Natural
104      36696     Knee  Synthetic

[105 rows x 3 columns]


In [10]:
file_path = "../../data/00-raw-data/PlayList.csv"

play_list = pd.read_csv(file_path)

cols_drop = ["GameID", "PlayKey", "PlayerDay", "PlayerGame", "Position", "PositionGroup", "PlayType", "PlayerGamePlay"]
play_list = play_list.drop(columns=cols_drop)
print(play_list)

        PlayerKey RosterPosition StadiumType  FieldType  Temperature  \
0           26624    Quarterback     Outdoor  Synthetic           63   
1           26624    Quarterback     Outdoor  Synthetic           63   
2           26624    Quarterback     Outdoor  Synthetic           63   
3           26624    Quarterback     Outdoor  Synthetic           63   
4           26624    Quarterback     Outdoor  Synthetic           63   
...           ...            ...         ...        ...          ...   
267000      47888     Cornerback     Outdoor  Synthetic           33   
267001      47888     Cornerback     Outdoor  Synthetic           33   
267002      47888     Cornerback     Outdoor  Synthetic           33   
267003      47888     Cornerback     Outdoor  Synthetic           33   
267004      47888     Cornerback     Outdoor  Synthetic           33   

               Weather  
0       Clear and warm  
1       Clear and warm  
2       Clear and warm  
3       Clear and warm  
4       Cl

In [11]:
nfl_injuries = pd.merge(nfl_injuries, play_list, on="PlayerKey", how="outer")
nfl_injuries.head()

Unnamed: 0,PlayerKey,BodyPart,Surface,RosterPosition,StadiumType,FieldType,Temperature,Weather
0,39873,Knee,Synthetic,Linebacker,Indoors,Synthetic,85,Mostly Cloudy
1,39873,Knee,Synthetic,Linebacker,Indoors,Synthetic,85,Mostly Cloudy
2,39873,Knee,Synthetic,Linebacker,Indoors,Synthetic,85,Mostly Cloudy
3,39873,Knee,Synthetic,Linebacker,Indoors,Synthetic,85,Mostly Cloudy
4,39873,Knee,Synthetic,Linebacker,Indoors,Synthetic,85,Mostly Cloudy


In [12]:
nfl_injuries = nfl_injuries.drop(columns="Surface")
nfl_injuries = nfl_injuries.drop_duplicates(keep="first")

In [13]:
nfl_injuries = nfl_injuries.dropna()

In [14]:
nfl_injuries.tail()

Unnamed: 0,PlayerKey,BodyPart,RosterPosition,StadiumType,FieldType,Temperature,Weather
95784,36696,Knee,Cornerback,Outdoors,Synthetic,49,Mostly sunny
95820,36696,Knee,Cornerback,Outdoor,Natural,37,Mostly Cloudy
95888,36696,Knee,Cornerback,Outdoor,Natural,29,Cloudy
95937,36696,Knee,Cornerback,Outdoor,Natural,45,Fair
96002,36696,Knee,Cornerback,Outside,Natural,50,Partly Cloudy


In [33]:
indoor_variations = ['Indoors', 'Indoor']
outdoors_variations = ['Outdoors',   'Outdoor', 'Heinz Field', 'Outddors', 'Cloudy','Oudoor', 'Outside', 'Open', 'Ourdoor', 'Outdor']
retractable_variations = ['Retractable Roof', 'Domed, open', 'Domed, Open', 'Retr. Roof-Closed',  'Retr. Roof Closed', 'Open',   'Bowl', 'Dome', 'Retr. Roof - Closed', 'Retr. Roof-Open',  'Indoor, Open Roof', 'Indoor, Roof Closed', 'Closed Dome', 'Domed', 'Domed, ', 'Retr. Roof - Open']

nfl_injuries['StadiumType'] = nfl_injuries['StadiumType'].replace(outdoors_variations, 'outdoor')
nfl_injuries['StadiumType'] = nfl_injuries['StadiumType'].replace(indoor_variations, 'indoor')
nfl_injuries['StadiumType'] = nfl_injuries['StadiumType'].replace(retractable_variations, 'retractable roof')

In [34]:
file_path = "../../data/01-modified-data/nfl_injuries.csv"
nfl_injuries.to_csv(file_path, index=False)

## News Data 

In [16]:
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
baseURL = "https://newsapi.org/v2/everything?"
total_requests = 2
verbose = True

API_KEY = "8307bbe570f142dc8f6446969fdb7622"
TOPIC = "soccer injury"

In [18]:
URLpost = {'apiKey': API_KEY, 
           'q':'+'+TOPIC,
           "sortBy": 'relevancy',
           'totalRequests': 1}

print(baseURL)

response = requests.get(baseURL, URLpost)
response = response.json()

print(json.dumps(response, indent=2))

from datetime import datetime 
timestamp = datetime.now().strftime("%Y-%m-%d-H%H-M%M-S%S")

with open(timestamp+"-newapi-raw-data.json", "w") as outfile:
    json.dump(response, outfile, indent=4)

https://newsapi.org/v2/everything?
{
  "status": "ok",
  "totalResults": 407,
  "articles": [
    {
      "source": {
        "id": null,
        "name": "CNET"
      },
      "author": "Adam Oram",
      "title": "Man United vs. Crystal Palace Livestream: How to Watch Premier League Soccer From Anywhere - CNET",
      "description": "It's the second meeting this week between these two teams at Old Trafford.",
      "url": "https://www.cnet.com/tech/services-and-software/man-united-vs-crystal-palace-livestream-how-to-watch-premier-league-soccer-from-anywhere/",
      "urlToImage": "https://www.cnet.com/a/img/resize/544c671965e7ac8596064918365dfee0cdf97403/hub/2023/09/29/7202e833-8752-4b3a-8332-4c0a79e17ca0/gettyimages-1608232965.jpg?auto=webp&fit=crop&height=675&width=1200",
      "publishedAt": "2023-09-30T12:02:09Z",
      "content": "Just four days after these two sides locked horns in the Carabao Cup, Manchester United and Crystal Palace go head-to-head once more at Old Trafford in

In [19]:
def string_cleaner(input_string):
    try:
        out = re.sub(r"""
                   [,.:@#?!&$-]+
                   \ *
                   """, 
                   " ", 
                   input_string, flags=re.VERBOSE)
        out = re.sub('[’.]+', '', input_string)
        out = re.sub(r'\s+', ' ', out)
        out = out.lower()
    except: 
        print("ERROR")
        out = ''
    return out 

In [20]:
article_list = response["articles"]
article_keys = article_list[0].keys()
print("AVAILABLE KEYS:")
print(article_keys)
index=0
cleaned_data = [];
for article in article_list:
    tmp=[]
    if(verbose):
        print("#--------------------------------")
        print("#", index)
        print("#--------------------------------")
    for key in article_keys:
        if(verbose):
            print("--------------------")
            print(key)
            print(article[key])
            print("--------------------")

        if(key=="source"):
            src=string_cleaner(article[key]['name'])
            tmp.append(src)
        
        if(key=="author"):
            author=string_cleaner(article[key])
            if(src in author):
                print(" AUTHOR ERROR:", author); author='NA'
            tmp.append(author)
        
        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        if(key=='description'):
            tmp.append(string_cleaner(article[key]))

        if(key=="publishedAt"):
            ref = re.compile('.*-.*-.*T.*:.*:.*Z')
            date = article[key]
            if(not ref.match(date)):
                print(" DATE ERROR:", date); date="NA"
            tmp.append(date)

    cleaned_data.append(tmp)
    index+=1


AVAILABLE KEYS:
dict_keys(['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'])
#--------------------------------
# 0
#--------------------------------
--------------------
source
{'id': None, 'name': 'CNET'}
--------------------
--------------------
author
Adam Oram
--------------------
--------------------
title
Man United vs. Crystal Palace Livestream: How to Watch Premier League Soccer From Anywhere - CNET
--------------------
--------------------
description
It's the second meeting this week between these two teams at Old Trafford.
--------------------
--------------------
url
https://www.cnet.com/tech/services-and-software/man-united-vs-crystal-palace-livestream-how-to-watch-premier-league-soccer-from-anywhere/
--------------------
--------------------
urlToImage
https://www.cnet.com/a/img/resize/544c671965e7ac8596064918365dfee0cdf97403/hub/2023/09/29/7202e833-8752-4b3a-8332-4c0a79e17ca0/gettyimages-1608232965.jpg?auto=webp&fit=crop&height=6

In [21]:
df = pd.DataFrame(cleaned_data)
print(df.head())
df.to_csv('cleaned.csv', index=False)

                0                             1  \
0            cnet                     adam oram   
1            cnet                     adam oram   
2        huffpost                            ap   
3  slate magazine  josh levin and stefan fatsis   
4        deadspin                      sam fels   

                                                   2  \
0  man united vs crystal palace livestream: how t...   
1  man city vs nottingham forest livestream: how ...   
2  spanish soccer star accuses federation of thre...   
3  is there any part of aaron rodgers first game ...   
4  mls is playing pretty fast and loose with lion...   

                                                   3                     4  
0  it's the second meeting this week between thes...  2023-09-30T12:02:09Z  
1  pep guardiola's men look to maintain their 100...  2023-09-23T12:00:09Z  
2  “nothing has changed,” said jenni hermoso, as ...  2023-09-19T11:28:10Z  
3               cmon, the new york jets are invo

In [22]:
import pandas as pd 

df = pd.read_csv("cleaned.csv")
df.columns

rename_map = {
    '0': 'source',
    '1': 'author',
    '2': 'title',
    '3': 'description',
    '4': 'publish_date'
}

df.rename(columns=rename_map, inplace=True)
df


Unnamed: 0,source,author,title,description,publish_date
0,cnet,adam oram,man united vs crystal palace livestream: how t...,it's the second meeting this week between thes...,2023-09-30T12:02:09Z
1,cnet,adam oram,man city vs nottingham forest livestream: how ...,pep guardiola's men look to maintain their 100...,2023-09-23T12:00:09Z
2,huffpost,ap,spanish soccer star accuses federation of thre...,"“nothing has changed,” said jenni hermoso, as ...",2023-09-19T11:28:10Z
3,slate magazine,josh levin and stefan fatsis,is there any part of aaron rodgers first game ...,"cmon, the new york jets are involved",2023-09-18T21:43:37Z
4,deadspin,sam fels,mls is playing pretty fast and loose with lion...,the problem with hinging an entire leagues rep...,2023-10-03T11:27:00Z
...,...,...,...,...,...
95,cbs sports,chuck booth,"us open cup final, inter miami vs houston dyna...",significant questions around messi's health li...,2023-09-27T13:21:00Z
96,the conversation africa,"philip anloague, adjunct professor of physical...",aaron rodgers' season-ending achilles tear res...,"two days after rodgers injury, the nfl players...",2023-09-22T12:30:44Z
97,nakedcapitalismcom,yves smith,"‘honey, i bought my football team! the world o...",confirming that investors in football (as in t...,2023-10-17T09:17:17Z
98,the punch,agency report,miami put four past toronto despite messis ear...,inter miami shrugged off an injury concern to ...,2023-09-21T01:00:29Z


In [23]:
df["combined"] = df["title"] + df["description"]

In [25]:
cols_drop = ["title", "description"]
df.drop(columns=cols_drop, inplace=True)

In [27]:
df.head()

Unnamed: 0,source,author,publish_date,combined
0,cnet,adam oram,2023-09-30T12:02:09Z,man united vs crystal palace livestream: how t...
1,cnet,adam oram,2023-09-23T12:00:09Z,man city vs nottingham forest livestream: how ...
2,huffpost,ap,2023-09-19T11:28:10Z,spanish soccer star accuses federation of thre...
3,slate magazine,josh levin and stefan fatsis,2023-09-18T21:43:37Z,is there any part of aaron rodgers first game ...
4,deadspin,sam fels,2023-10-03T11:27:00Z,mls is playing pretty fast and loose with lion...


In [28]:
df.to_csv('cleaned.csv', index=False)