In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sn
import matplotlib.pyplot as plt
import smogn

In [8]:
df = pd.read_csv('data/auction_stats.csv')

In [9]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train.reset_index(inplace=True)

In [10]:
df_train_smogn = smogn.smoter(
    
    ## main arguments
    data = df_train,           ## pandas dataframe
    y = 'Amount',          ## string ('header name')
    k = 9,                    ## positive integer (k < n)
    samp_method = 'extreme',  ## string ('balance' or 'extreme')

    ## phi relevance arguments
    rel_thres = 0.80,         ## positive real number (0 < R < 1)
    rel_method = 'auto',      ## string ('auto' or 'manual')
    rel_xtrm_type = 'high',   ## string ('low' or 'both' or 'high')
    rel_coef = 2.25           ## positive real number (0 < R)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
dist_matrix: 100%|####################################################################| 16/16 [00:00<00:00, 103.50it/s]
synth_matrix: 100%|####################################################################| 16/16 [00:01<00:00, 11.89it/s]
r_index: 100%|##########################################################################| 3/3 [00:00<00:00, 115.69it/s]


In [11]:
X_train = df_train_smogn.drop(columns=['index', 'player_pkey', 'Player', 'Team', 'Year', 'Amount'])
y_train = df_train_smogn['Amount']
X_test = df_test.drop(columns=['player_pkey', 'Player', 'Team', 'Year', 'Amount'])
y_test = df_test['Amount']

In [12]:
X_test

Unnamed: 0,isOverseas,isBatsman,isBowler,isWicketKeeper,isCapped,UncappedAndPerformed,runs_per_match,matches_bat,strike_rate_bat,average_bat,wickets_per_match,matches_bowl,economy_rate,strike_rate_bowl,average_bowl,bat_powerplay,bat_death,bowl_powerplay,bowl_death,base
180,1,0,1,0,1,0,9.880000,25,150.609756,14.529412,1.090909,44,7.782562,25.104167,19.354167,60.526000,134.543381,13.802980,14.892003,2.00
154,1,1,1,0,0,0,0.000000,1,0.000000,0.000000,0.500000,4,8.500000,59.500000,42.000000,0.000000,0.000000,29.368421,15.896104,0.50
111,1,0,1,0,1,0,33.477273,44,148.637740,38.763158,0.000000,0,0.000000,0.000000,0.000000,418.607701,148.671465,0.000000,0.000000,2.00
247,0,0,1,0,0,0,5.916667,12,94.666667,8.875000,0.794872,39,7.300920,29.870968,24.548387,28.920667,44.182526,19.651624,10.165289,0.50
60,1,1,0,0,1,0,33.871429,70,134.486670,41.596491,0.000000,1,12.000000,0.000000,0.000000,647.945352,186.134405,0.000000,0.000000,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218,0,1,1,0,0,0,6.000000,3,78.260870,9.000000,1.000000,3,6.625000,17.666667,16.000000,0.000000,0.000000,0.000000,0.000000,0.20
104,0,0,1,0,1,0,6.727273,22,116.535433,12.333333,1.066667,60,8.553797,28.156250,19.750000,0.000000,57.927043,15.313051,14.934981,0.75
302,0,1,1,0,0,1,17.875000,8,115.322581,17.875000,0.000000,1,7.000000,0.000000,0.000000,83.214529,0.000000,0.000000,0.000000,1.00
194,0,0,1,0,1,0,9.800000,30,134.246575,14.000000,1.133333,60,7.521531,23.117647,18.441176,0.000000,62.617417,15.916490,13.377955,2.00


In [13]:
result = pd.DataFrame()

models = [LinearRegression(), BayesianRidge(), SVR(C=1.0, epsilon=0.2), 
          XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8),
         RandomForestRegressor()]

for model in models:
    model.fit(X_train, y_train)
    a = model.score(X_train, y_train)
    b = model.score(X_test, y_test)
    result = result.append({'name': str(model), 'Train': a, 'Test': b}, ignore_index=True)

In [14]:
result

Unnamed: 0,name,Train,Test
0,LinearRegression(),0.681867,-0.033162
1,BayesianRidge(),0.681003,-0.022702
2,SVR(epsilon=0.2),0.339685,-1.240414
3,"XGBRegressor(base_score=0.5, booster='gbtree',...",0.989183,0.254736
4,RandomForestRegressor(),0.969659,0.410449
