In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Load the first dataset
df1 = pd.read_csv('badmintondata.csv')

# Load the second dataset
df2 = pd.read_csv('badmintondata2.csv')

# Combine both datasets
df_combined = pd.concat([df1, df2], ignore_index=True)

# Count rows before cleaning
rows_before_cleaning = df_combined.shape[0]

# Handling missing values
df_combined = df_combined.dropna()  # Drop rows with missing values

# Handling outliers (Using z-score)
z_scores_combined = (df_combined - df_combined.mean()) / df_combined.std()  # Calculate z-scores
threshold = 3  # Set a threshold for outliers
outliers_combined = (z_scores_combined.abs() > threshold).any(axis=1)  # Find rows with outliers
df_combined_cleaned = df_combined[~outliers_combined].copy()  # Filter out rows with outliers

# Identify consecutive rows of all zeros
mask = (df_combined == 0).all(axis=1)
groups = mask.cumsum()

# Filter out consecutive rows of all zeros
df_combined_cleaned = df_combined[~mask]
df_combined_cleaned.to_csv('cleaned_dataset.csv', index = False)

# Count rows after cleaning
rows_after_cleaning = df_combined_cleaned.shape[0]

print("Number of rows before cleaning:", rows_before_cleaning)
print("Number of rows after cleaning:", rows_after_cleaning)


Number of rows before cleaning: 315200
Number of rows after cleaning: 168842


In [None]:
# Group the data before each group of zeros
grouped_df = df_combined_cleaned.groupby(groups, group_keys=False)

# Function to add a Time field and SERVE_ID to a group
def add_time_and_serve_id(group):
    group['TIME'] = np.arange(len(group)) * 10  # Multiply by 10 to get time in ms
    group['SERVE_ID'] = group.name  # Assign the group name as the SERVE_ID

    # Reset time to 0 at the start of each group
    group.loc[group.index[0], 'TIME'] = 0
    
    return group


# Apply a TIME-based transformation and add SERVE_ID to each group
grouped_df = grouped_df.apply(add_time_and_serve_id)

# Extract unique groups
serve_ids = grouped_df['SERVE_ID'].unique()

# Split the groups into testing set and sample set (split into a testing set and a sample set using a 70:30 ratio)
train_groups, test_groups = train_test_split(serve_ids, test_size=0.3, random_state=42)

# Create the training set
train_df = pd.concat([grouped_df[grouped_df['SERVE_ID'] == group] for group in train_groups])

# Create the testing set
test_df = pd.concat([grouped_df[grouped_df['SERVE_ID'] == group] for group in test_groups])

# Print two unique groups
for group in serve_ids[:2]:
    group_data = grouped_df[grouped_df['SERVE_ID'] == group]
    print(f"Group: {group}")
    print(group_data)
    print()


In [None]:
# Define the non-feature columns
non_feature_cols = ["HUMAN PLAYER POSITION (X) metres", "HUMAN PLAYER POSITION (Y) metres",
                    "INITITAL VELOCITY OF SHUTTELCOCK(m/s)", "INITIAL SHUTTELCOCK FIRING ANGLE (DEGREE)",
                    "SHUTTELCOCK SLANT ANGLE TO SIDELINE(DEGREE)", "TIME"]

# Define the feature columns
feature_cols = ["SHUTTLECOCK POSITIION IN AIR(X) metres", "SHUTTLECOCK POSITIION IN AIR(Y) metres",
                "SHUTTLECOCK POSITIION IN AIR(Z) metres"]

# Create the training feature DataFrame
train_features = train_df[feature_cols].copy()

# Create the training non-feature DataFrame
train_non_features = train_df[non_feature_cols].copy()

# Create the testing feature DataFrame
test_features = test_df[feature_cols].copy()

# Create the testing non-feature DataFrame
test_non_features = test_df[non_feature_cols].copy()

# Combine the feature and non-feature DataFrames for training and testing
X_train = pd.concat([train_features, train_non_features], axis=1)
X_test = pd.concat([test_features, test_non_features], axis=1)
y_train = train_df["SHUTTLECOCK POSITIION IN AIR(Z) metres"]
y_test = test_df["SHUTTLECOCK POSITIION IN AIR(Z) metres"]

# Train the model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("RMSE:", rmse)

# Evaluate other metrics or perform additional analysis if needed
