In [None]:
# load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error

In [None]:
# load the data
path = '/Users/amankaur/Downloads/final_processed.csv'
df_csv = pd.read_csv(path)
df_csv.sample(5)

Unnamed: 0.1,Unnamed: 0,userId,gender,sport,id,longitude,latitude,altitude,timestamp,time_elapsed,heart_rate,derived_speed,distance,tar_heart_rate,tar_derived_speed,since_begin,since_last
24421958,24421958,5607830,male,run,352474425,-43.233297,-22.911036,-2.082819,1401985144,-0.052134,-4.898841,0.255093,-2.456982,115.575608,18.44718,-14.805054,-0.01158
25722941,25722941,7654809,male,run,452337496,5.23271,51.017632,-1.90436,1419950993,-0.104291,1.965995,-1.874954,-2.548111,147.991284,13.092428,-14.805054,-0.01158
14530035,14530035,1428766,male,bike,234717496,0.465231,49.484027,-2.440054,1377205546,-0.062387,-5.677717,3.210605,11.665921,111.897767,25.877074,-14.805054,-0.01158
9454492,9454492,159106,male,mountain bike,110372707,10.804173,55.212033,-2.233086,1354138850,0.0076,1.142551,2.009675,3.960602,144.10299,22.858042,-14.805054,-0.01158
28156562,28156562,9882712,male,bike,355398485,-75.806707,42.296954,1.139509,1402437307,-0.094929,-7.108062,1.509746,13.714082,105.143694,21.601267,-14.805054,-0.01158


In [None]:
df_csv.info()
clean_df = df_csv.drop('Unnamed: 0', axis=1) # drop the csv created column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30897600 entries, 0 to 30897599
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Unnamed: 0         int64  
 1   userId             int64  
 2   gender             object 
 3   sport              object 
 4   id                 int64  
 5   longitude          float64
 6   latitude           float64
 7   altitude           float64
 8   timestamp          int64  
 9   time_elapsed       float64
 10  heart_rate         float64
 11  derived_speed      float64
 12  distance           float64
 13  tar_heart_rate     float64
 14  tar_derived_speed  float64
 15  since_begin        float64
 16  since_last         float64
dtypes: float64(11), int64(4), object(2)
memory usage: 3.9+ GB


In [None]:
len_userid = len(clean_df['userId'].unique())
len_id = len(clean_df['id'].unique())
print(f'length of userIds are: {len_userid}')
print(f'lenth of ids are: {len_id}')
clean_df['gender'].unique()
clean_df = clean_df[clean_df['gender'] != 'unknown']

length of userIds are: 1038
lenth of ids are: 102992


In [None]:
clean_df['tar_heart_rate'] = zscore(clean_df['tar_heart_rate'])
clean_df['tar_heart_rate'].describe()

count    3.064290e+07
mean    -8.287294e-15
std      1.000000e+00
min     -3.999437e+00
25%     -5.979219e-01
50%      9.705827e-02
75%      6.922186e-01
max      2.898423e+00
Name: tar_heart_rate, dtype: float64

In [None]:
clean_df.columns

Index(['userId', 'gender', 'sport', 'id', 'longitude', 'latitude', 'altitude',
       'timestamp', 'time_elapsed', 'heart_rate', 'derived_speed', 'distance',
       'tar_heart_rate', 'tar_derived_speed', 'since_begin', 'since_last'],
      dtype='object')

Split Data into training/testing

In [None]:
clean_df_check = clean_df[clean_df.groupby(['sport']).cumcount() < 30000]
clean_df_check['sport'].value_counts()

sport
bike                       30000
core stability training    30000
fitness walking            30000
run                        30000
skate                      30000
walk                       30000
roller skiing              30000
hiking                     30000
orienteering               30000
cross-country skiing       30000
indoor cycling             30000
mountain bike              30000
bike (transport)           30000
kayaking                   18000
circuit training           15900
rowing                     13800
weight training             9000
downhill skiing             7200
soccer                      6300
snowshoeing                 4500
golf                        3300
horseback riding            2100
badminton                   1500
climbing                    1500
treadmill running           1500
tennis                      1500
basketball                  1200
table tennis                 900
rugby                        900
elliptical                   600
snow

In [None]:
# using train test split to split ds
# features = ['userId', 'id', 'longitude', 'latitude', 'altitude', 'timestamp',
#        'time_elapsed', 'heart_rate', 'derived_speed', 'distance',
#        'tar_heart_rate', 'tar_derived_speed', 'since_begin', 'since_last',
#        'gender_female', 'gender_male']
features = ['userId', 'tar_heart_rate']

# X = clean_df.loc[:, features]
# y = clean_df.loc[:, ['sport']]

X = clean_df_check.loc[:, features]
y = clean_df_check.loc[:, ['sport']]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = .80, shuffle=False)

In [None]:
print(f'X.shape = {X.shape}')
print(f'y.shape = {y.shape}')

print(f'X_train.shape = {X_train.shape}')
print(f'X_test.shape = {X_test.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'y_test.shape = {y_test.shape}')

X.shape = (482700, 2)
y.shape = (482700, 1)
X_train.shape = (386160, 2)
X_test.shape = (96540, 2)
y_train.shape = (386160, 1)
y_test.shape = (96540, 1)


In [None]:
small_df = clean_df_check[:10000]
X_small = small_df.loc[:, features]
y_small = small_df.loc[:, ['sport']]

Model: SVC

In [None]:
X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_small, y_small, random_state=42, train_size = .80, shuffle=False)

In [None]:
# load library
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(X_small_train, y_small_train)
svc_predict = svc_model.predict(X_small_test) # check performance

  y = column_or_1d(y, warn=True)


In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_small_test, svc_predict)
print(f'accuracy: {accuracy}')

accuracy: 0.85


trying on whole data set

In [None]:
# trying on whole data set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = .80,  shuffle=False)

In [None]:
# load library
from sklearn.svm import SVC

svc_model = SVC()
svc_model.fit(X_train, y_train)
svc_predict = svc_model.predict(X_test) # check performance

  y = column_or_1d(y, warn=True)


In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, svc_predict)
print(f'accuracy: {accuracy}')

Model: Linear SVC

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = .80,  shuffle=True)

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 386160 entries, 15768836 to 433958
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   userId          386160 non-null  int64  
 1   tar_heart_rate  386160 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 8.8 MB


In [None]:
# load library
from sklearn.svm import LinearSVC

lsvc_model = LinearSVC(verbose=0)
lsvc_model.fit(X_train, y_train)
lsvc_predict = lsvc_model.predict(X_test) # check performance

  y = column_or_1d(y, warn=True)


In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, lsvc_predict)
print(f'accuracy: {accuracy}')

accuracy: 0.09130930184379532


Model: MLPClassifier

In [None]:
X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(X_small, y_small, random_state=42, train_size = .80, shuffle=True)

In [None]:
from sklearn.neural_network import MLPClassifier

# Create the classifier
clf = MLPClassifier()

# Fit the classifier to the training data
clf.fit(X_small_train, y_small_train)

# Predict the labels of the test data
y_pred = clf.predict(X_small_test)

  y = column_or_1d(y, warn=True)


In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_small_test, y_pred)
print(f'accuracy: {accuracy}')

accuracy: 0.6615


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size = .80,  shuffle=True)

In [None]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 386160 entries, 15768836 to 433958
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   userId          386160 non-null  int64  
 1   tar_heart_rate  386160 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 8.8 MB


In [None]:
from sklearn.neural_network import MLPClassifier

# Create the classifier
clf = MLPClassifier()


In [None]:
y_train.value_counts()

sport                  
skate                      24101
bike (transport)           24076
run                        24076
cross-country skiing       24075
roller skiing              24037
orienteering               24010
indoor cycling             23987
bike                       23986
core stability training    23970
mountain bike              23961
walk                       23946
hiking                     23907
fitness walking            23888
kayaking                   14423
circuit training           12655
rowing                     11079
weight training             7221
downhill skiing             5743
soccer                      5062
snowshoeing                 3542
golf                        2643
horseback riding            1692
badminton                   1212
tennis                      1202
climbing                    1193
treadmill running           1187
basketball                   962
rugby                        731
table tennis                 694
snowboarding       

In [None]:

# Fit the classifier to the training data
clf.fit(X_train, y_train)


# Predict the labels of the test data
y_pred_ = clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [None]:
# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred_)
print(f'accuracy: {accuracy}')

accuracy: 0.061363165527242594


Model: DecisionTreeRegressor
Note: Need to one hot encode the values

In [None]:
from sklearn.tree import DecisionTreeRegressor

reg = DecisionTreeRegressor(max_depth = 4, random_state = 0)
reg.fit(X_train, y_train)


In [None]:
score = reg.score(X_test, y_test)
print(score)

0.016178888322821716
