In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.impute import SimpleImputer
from sklearn import preprocessing
import pandas as pd
import numpy as np
np.random.seed(0)

In [22]:
data = pd.read_csv("Cristano_Ronaldo_Final_v1/data.csv")
data = data.drop(data.columns[0],axis=1)
# data.head()

# purged features:
data = data.drop(['shot_id_number'],axis=1)
data = data.drop(['date_of_game'],axis=1)
data = data.drop(['team_id'],axis=1)
# data = data.drop(['type_of_shot'],axis=1)
# data = data.drop(['game_season'], axis=1)
# data = data[data['shot_id_number'].notnull()]

data = pd.get_dummies(data)
data = data.drop(["type_of_shot_shot - 37","home/away_MANU @ NOK","shot_basics_Right Corner","home/away_MANU vs. NOP","lat/lng_40.324211, -111.674849","type_of_shot_shot - 33",
"home/away_MANU vs. VAN",
"type_of_combined_shot_shot - 2",
"shot_basics_Left Corner",
"type_of_shot_shot - 34",
"home/away_MANU vs. CHH",
"home/away_MANU @ VAN",
"lat/lng_49.250068, -123.114646",
"lat/lng_30.055498, -89.960838",
"lat/lng_35.205878, -80.841194",
"home/away_MANU @ NOP",
"home/away_MANU @ CHH",
"home/away_MANU vs. PHO",
"range_of_shot_Back Court Shot",
"area_of_shot_Mid Ground(MG)",
"game_season_2013-14",
"home/away_MANU vs. SAN",
"lat/lng_33.513157, -112.082793",
"home/away_MANU @ PHO",
"home/away_MANU @ BKN",
"lat/lng_40.623199, -73.951223",
"shot_basics_Mid Ground Line",
"home/away_MANU @ UTH",
"home/away_MANU vs. BKN"],axis=1)

In [None]:
#######LOCAL TRAIN-VALIDATION SPLIT########
#Rescale all feature values:
mm_scaler = preprocessing.MinMaxScaler()

#global train-test split:
train = data[data['is_goal'].notnull()]
y = train['is_goal']
test = data[data['is_goal'].isnull()]

# local training / validation performance:
train['local_train']=np.random.uniform(0,1,len(train))<=0.85
local_train, local_validation = train[train['local_train']==True], train[train['local_train']==False]
y_local_train = local_train['is_goal']
y_local_validation=local_validation['is_goal']
local_train=local_train.drop(['is_goal'],axis=1)
local_validation =local_validation.drop(['is_goal'],axis=1)

features = local_train.columns[:-1]
# print(features)
local_train = local_train[features]
local_validation = local_validation[features]
print("local train size:", len(local_train))
print("local validation size:", len(local_validation))
print("test size:", len(test))

# preprocessing local-train+validation:
imp= SimpleImputer(missing_values=np.nan,strategy='mean')
imp = imp.fit(local_train)
local_train_modified = imp.transform(local_train)
imp = imp.fit(local_validation)
local_validation_modified = imp.transform(local_validation)
# local_train_modified = mm_scaler.fit_transform(local_train_modified)
# local_validation_modified = mm_scaler.fit_transform(local_train_modified)

In [None]:
#RF Grid Search:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 40, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(local_train_modified, y_local_train)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 13, 16, 20, 23, 26, 30, 33, 36, 40, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.8min


In [27]:
#RF classifier for train-validation perf:
clf = RandomForestClassifier(n_jobs=-1,
                             min_samples_leaf=5,
                             max_depth = 30,
                             n_estimators=500, random_state=0)
clf.fit(local_train_modified, y_local_train)
p = clf.predict_proba(local_validation_modified)
y_validation_pred_binary = clf.predict(local_validation_modified)
y_validation_pred_prob = []
for x,y in p:
    y_validation_pred_prob.append(y)
count_match = 0
count_error = 0
deviation = 0.0
# print("type,",type(y_validation_pred),type(y_local_validation))
assert(len(y_validation_pred_prob)==len(y_local_validation))
validation_gtruth=np.asarray(y_local_validation)
for i in range(len(y_local_validation)):
    deviation +=abs(y_validation_pred_prob[i]-validation_gtruth[i])
    if (int(y_validation_pred_binary[i])==int(validation_gtruth[i])):
        count_match+=1
    else:
        count_error+=1
validation_accuracy = count_match/(count_match+count_error)*100.0
print("validation a/c:", validation_accuracy)
print("score:", 1.0/(1.0+deviation*1.0/(count_match+count_error)))
max_depth = list()
for tree in clf.estimators_:
    max_depth.append(tree.tree_.max_depth)
#     print(tree.tree_.max_depth)
print("avg max depth %0.1f" % (sum(max_depth) / len(max_depth)))
features_imp = pd.DataFrame(clf.feature_importances_, index=features,columns=['importance']).sort_values('importance', ascending=False)
print(features_imp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


local train size: 20798
local validation size: 3631
test size: 6268
validation a/c: 63.45359405122556
score: 0.6874831216810208
avg max depth 30.0
                                importance
location_y                        0.063466
distance_of_shot                  0.058811
match_id                          0.056026
match_event_id                    0.055620
distance_of_shot.1                0.054441
location_x                        0.053219
remaining_sec.1                   0.051266
remaining_sec                     0.049765
remaining_min.1                   0.042080
remaining_min                     0.033367
power_of_shot.1                   0.031065
type_of_shot_shot - 39            0.026403
shot_basics_Goal Area             0.026123
knockout_match.1                  0.025260
type_of_combined_shot_shot - 1    0.024743
range_of_shot_Less Than 8 ft.     0.021055
power_of_shot                     0.018011
type_of_shot_shot - 44            0.015320
type_of_combined_shot_shot - 3    0.

In [None]:
#######ACTUAL TRAIN-TEST PERDICTION########
y_train = train['is_goal']
train=train.drop(['is_goal'],axis=1)
test =test.drop(['is_goal'],axis=1)
features = local_train.columns[:-1]
train = train[features]
test = test[features]
print("Train size:", len(train))
print("Test size:", len(test))

# preprocessing local-train+validation:
imp= SimpleImputer(missing_values=np.nan,strategy='mean')
imp = imp.fit(train)
train_modified = imp.transform(train)
imp = imp.fit(test)
test_modified = imp.transform(test)

#RF classifier for train-validation perf:
clf2 = RandomForestClassifier(n_jobs=2, n_estimators=100, random_state=0)
clf2.fit(train_modified, y_train)
# y_test_pred = clf2.predict(test_modified)
p = clf2.predict_proba(test_modified)
prediction = []
for x,y in p:
    prediction.append(y)
count_match = 0
count_error = 0

# assert(len(y_test_pred)==len(y_local_validation))
# validation_gtruth=np.asarray(y_local_validation)
# for i in range(len(y_local_validation)):
#     if (int(y_validation_pred[i])==int(validation_gtruth[i])):
#         count_match+=1
#     else:
#         count_error+=1
# validation_accuracy = count_match/(count_match+count_error)*100.0
# print("validation a/c:", validation_accuracy)

In [None]:
#write outputs:
# shot_arr = np.asarray(test['shot_id_number'])
test_rows = data.index[data.is_goal.isnull()]
count = 0
f = open("submissionAP.csv","w+")
print("shot_id_number,is_goal", file=f)
# print("shot_id_number,is_goal", file=f)
for i in range(len(test_rows)):
    print(str(int(test_rows[i]+1))+","+str(prediction[i]), file=f)
    count+=1
print(count)
f.close()

In [None]:
print(validation_gtruth)

In [None]:
print(len(test))

In [None]:
# list(zip(train[features],clf2.feature_importances_))
features_imp = pd.DataFrame(clf.feature_importances_, index=features,columns=['importance']).sort_values('importance', ascending=False)
print(features_imp)

In [None]:

print(prediction)
    

In [None]:
print(test_rows)