In [2]:
import os
import pandas as pd
import re

In [8]:
# Function to process a file
def process_file(input_file, output_folder):
    # Extract year from file name
    file_name = os.path.basename(input_file)
    year_match = re.search(r'(\d{4})_', file_name)
    if year_match:
        year = int(year_match.group(1))
    else:
        year = None
    
    # Read CSV file
    data = pd.read_csv(input_file)
    
    # Add Year column
    data['Game_Year'] = year
    data['Game_Year'] = data['Game_Year'].astype(int)

    # Replace values in "Selection Type" column
    data['Selection Type'] = data['Selection Type'].replace({
        'Eastern All-Star Fan Vote Selection': 'East All-Star Fans Selection',
        'Western All-Star Fan Vote Selection': 'West All-Star Fans Selection',
        'Eastern All-Star Coaches Selection': 'East All-Star Coaches Selection',
        'Western All-Star Coaches Selection': 'West All-Star Coaches Selection',
        'Eastern All-Star Replacement Selection': 'East All-Star Replacement Selection',
        'Western All-Star Replacement Selection': 'West All-Star Replacement Selection',
    })

    # Split the "Selection Type" column based on specified values
    selection_type_splits = data['Selection Type'].str.split(expand=True)
    
    # Assign splits to respective columns
    data['Region'] = selection_type_splits[0]
    data['Selection_Type'] = selection_type_splits[2]
    
    # Split "HT" column into "Height_Feet" and "Height_Inch"
    data[['Height_Feet', 'Height_Inch']] = data['HT'].str.split('-', expand=True)
    
    # Calculate "Height_Inches" column
    data['Height_In_Inches'] = data['Height_Feet'].astype(int) * 12 + data['Height_Inch'].astype(int)

    # Split "NBA Draft Status" column into three columns: NBA_Draft_Year, NBA_Draft_Round, NBA_Draft_Pick
    draft_status_splits = data['NBA Draft Status'].str.extract(r'(\d{4}) Rnd (\d) Pick (\d+)', expand=True)
    draft_status_splits.columns = ['NBA_Draft_Year', 'NBA_Draft_Round', 'NBA_Draft_Pick']

    # Concatenate draft status columns with the original data
    data = pd.concat([data, draft_status_splits], axis=1)

    # Convert to integer values, ignore NaN
    columns_to_convert = ['NBA_Draft_Year', 'NBA_Draft_Round', 'NBA_Draft_Pick']
    data[columns_to_convert] = data[columns_to_convert].fillna(0)
    data[columns_to_convert] = data[columns_to_convert].astype(int)
    data[columns_to_convert] = data[columns_to_convert].replace(0, "-")
    
    # Rename columns
    data = data.rename(columns={
        'WT': 'Weight',
        'Pos': 'Position'
    })

    # Remove column
    data = data.drop(columns=['Selection Type', 'HT', 'NBA Draft Status'])
    
    # Save the processed data
    output_file = os.path.join(output_folder, os.path.basename(input_file).replace('.csv', '_processed.csv'))
    data.to_csv(output_file, index=False)
    
    print(f"Processed data saved as {output_file}")
    # Save the processed data
    output_file = os.path.join(output_folder, os.path.basename(input_file).replace('.csv', '_processed.csv'))
    data.to_csv(output_file, index=False)
    
    print(f"Processed data saved as {output_file}")

In [9]:

# Directory where the CSV files are located
input_folder = "../Web-Scraping-Player-Details/raw_player_details_data"
output_folder = "../Web-Scraping-Player-Details/cleaned_player_details_data/"

# List all CSV files in the folder
for file in os.listdir(input_folder):
    if file.endswith(('Player_Details_1.csv', 'Player_Details_2.csv')):
        input_file_path = os.path.join(input_folder, file)
        process_file(input_file_path, output_folder)


Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/1987_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/1987_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/2006_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/2006_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/2003_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/2003_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/1982_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-Details/cleaned_player_details_data/1982_Player_Details_1_processed.csv
Processed data saved as ../Web-Scraping-Player-D

In [12]:
# Merging dataset into 1 large dataset
folder_path = "../Web-Scraping-Player-Details/cleaned_player_details_data/"

# List all files
processed_files = [file for file in os.listdir(folder_path)]

# Initialize empty df
combined_data = []

# Loop through files & concatenate them
for file in processed_files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path)
    combined_data.append(data)

# Outer join combined data
merged_data = pd.concat(combined_data, axis=0, ignore_index=True, sort=False)

# Preview data
print(merged_data.head)
print(merged_data.dtypes)

# Save merged data to new file
merged_file_path = "../Web-Scraping-Player-Details/cleaned_player_details_data/combined_dataset.csv"
merged_data.to_csv(merged_file_path, index=False)

<bound method NDFrame.head of                   Player Position  Weight                    Team  \
0    Kareem Abdul-Jabbar        C     225      Los Angeles Lakers   
1           Tom Chambers       PF     220            Phoenix Suns   
2          Clyde Drexler       SG     210  Portland Trail Blazers   
3        Kevin Duckworth        C     275  Portland Trail Blazers   
4             Mark Eaton        C     275               Utah Jazz   
..                   ...      ...     ...                     ...   
756        Dirk Nowitzki       PF     245        Dallas Mavericks   
757     Shaquille O'Neal        C     325      Los Angeles Lakers   
758          Gary Payton        G     180     Seattle SuperSonics   
759      Peja Stojakovic        F     229        Sacramento Kings   
760         Chris Webber      F-C     245        Sacramento Kings   

       Nationality  Game_Year Region Selection_Type  Height_Feet  Height_Inch  \
0    United States       1989   West    Replacement         