In [None]:
!pip install fancyimpute

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import pickle
from fancyimpute import IterativeImputer
import matplotlib.pyplot as plt

# Read a subset of columns from the results CSV.
# The names below are passed to `usecols` verbatim, so spelling, case and
# whitespace are kept exactly as written (note the leading space in ' age' --
# presumably that is how the CSV header spells it; verify against the file).
columns_to_select = [
    'Distance', 'Last Finish pos', 'Last-1 Finish pos',
    'Weight', 'Wins', 'Places', 'win%', 'Place%',
    'Course Wins', 'Course Places', 'Course Starts',
    'Distance Wins', 'Distance Places',
    'Good Wins', 'Good Places', 'Good Starts',
    ' age',
    '1Up Wins', '1Up Places', '1Up Starts',
    '2Up Wins', '2Up Places', '2Up Starts',
    '3Up Wins', '3Up Places', '3Up Starts',
    '4Up Wins', '4Up Places', '4Up Starts',
    'Last Base Rating', 'Last-1 Base Rating', 'Last-1 WFA Rating',
    'Joc Ovrl12m Wins', 'Joc Ovrl12m Places', 'Joc Ovrl12m Starts',
    'Trn Loc12m Wins', 'Trn Loc12m Places', 'Trn Loc12m Starts',
    'Trn Loc5y Wins', 'Trn Loc5y Places', 'Trn Loc5y Starts',
    'Trn Ovrl12m Wins', 'Trn Ovrl12m Places', 'Trn Ovrl12m Starts',
    'Joc/Trn Wins', 'Joc/Trn Starts',
    'Last Dist', 'Last Margin', 'Last Weight',
    'Last-1 Dist', 'Last-1 Margin', 'Last-1 Weight',
    'Position',
]
file_path = './2019-WFResultsMerged.csv'

# Only the first 30000 data rows are read.
data = pd.read_csv(
    file_path,
    usecols=columns_to_select,
    nrows=30000,
    low_memory=True,
)

# Show which columns were actually loaded and the resulting (rows, cols) shape.
display(data.columns.tolist())
print(data.shape)




In [None]:
# Keep only the rows whose 'Position' is not 99.0.
# The boolean filter preserves the original row labels, so the index of
# `data` is no longer contiguous after this step.
keep_mask = data['Position'].ne(99.0)
data = data.loc[keep_mask]

# Share of missing values per column, in percent, before any imputation.
# Computed once and reused for both the display and the stored variable.
missing_percentage = data.isnull().sum().div(len(data)).mul(100)

print("Percentage of missing values before imputation:")
display(missing_percentage)

In [None]:

# Extract features and target variable
X = data.drop('Position', axis=1)
y = data['Position']

# Perform MICE imputation (at most 10 rounds, fixed seed for repeatability)
imputer = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imputer.fit_transform(X)

# Create a DataFrame with imputed data.
# Re-attach X's row labels explicitly: `data` had rows filtered out earlier, so
# its index has gaps. Without `index=X.index` the new frame would get a fresh
# 0..n-1 index, and the index-aligned assignment of `y` below would pair
# targets with the wrong rows and leave NaN where a label is missing.
data_imputed = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)
data_imputed['Position'] = y  # Adding the target variable back (same index as X)

# Display the percentage of missing values in each column after imputation
print("\nPercentage of missing values after imputation:")
print((data_imputed.isnull().sum() / len(data_imputed)) * 100)



In [None]:
# Bar chart of how many rows fall on each 'Position' value,
# with the bars ordered by position rather than by frequency.
position_counts = data_imputed['Position'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
position_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Position')
ax.set_ylabel('Count')
ax.set_title('Distribution of Position Values')
plt.show()

In [None]:
# Standardize the features
# NOTE(review): the scaler and the PCA below are fitted on every row, including
# rows that are later held out as the test split, so test-set statistics leak
# into the transform. Fitting them on the training split only would avoid that.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

print(X_imputed.shape)

# Apply PCA, keeping as many components as needed to retain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print(X_pca.shape)

# Save PCA model
with open('./pca_model.pkl', 'wb') as pca_file:
    pickle.dump(pca, pca_file)

# Save the fitted scaler alongside the PCA: the PCA was fitted on standardized
# features, so new data must go through this exact scaler before the saved PCA
# can be applied to it.
with open('./scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Fit the target encoder on the complete label set before splitting.
# With only 5% of rows held out, a rare 'Position' value can end up exclusively
# in the test split; an encoder fitted on y_train alone would then raise
# "y contains previously unseen labels" when transforming y_test.
label_encoder = LabelEncoder()
label_encoder.fit(y)

# Split the data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.05, random_state=42)

# Encode the target variable
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Train and save SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train_encoded)
with open('./svm_model.pkl', 'wb') as svm_file:
    pickle.dump(svm_model, svm_file)

# Save the encoder too: the SVM predicts encoded class indices, and this
# mapping is required to turn them back into 'Position' values.
with open('./label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)
