In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn import linear_model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.svm import SVR

In [None]:
from sklearn.multioutput import MultiOutputRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

**BUILDING DUMMY DATASET**

In [None]:
# Empty frame with three feature columns per side (h = home, a = away — presumably;
# the prefixes are not explained in this notebook) followed by the two goal targets.
df = pd.DataFrame(
    columns=[f"{side}_feat_{k}" for side in ("h", "a") for k in (1, 2, 3)]
    + ["h_goals", "a_goals"]
)

In [None]:
# Dummy data: 8 rows (6 features + 2 targets) x 50 samples, integers drawn
# uniformly from 0..7. A fixed seed is used so that "Restart & Run All"
# regenerates exactly the same dataset (and therefore the same downstream
# correlations and model inputs) on every run.
feats = np.random.default_rng(seed=42).integers(0, 8, size=(8, 50))

In [None]:
# Row k of `feats` becomes the k-th column listed below: six feature columns
# first, then the two goal targets.
column_order = (
    "h_feat_1", "h_feat_2", "h_feat_3",
    "a_feat_1", "a_feat_2", "a_feat_3",
    "h_goals", "a_goals",
)
for row_idx, column in enumerate(column_order):
    df[column] = feats[row_idx]

In [None]:
# Preview the first rows of the dummy dataset to confirm the columns were filled.
df.head()

Unnamed: 0,h_feat_1,h_feat_2,h_feat_3,a_feat_1,a_feat_2,a_feat_3,h_goals,a_goals
0,2,4,4,2,1,4,2,3
1,6,6,6,3,5,1,4,3
2,6,2,4,6,0,4,1,5
3,2,5,1,1,3,3,1,7
4,7,3,2,1,1,4,3,4


**SPLITTING DATA INTO X AND Y**

In [None]:
# Targets: the two goal columns. Features: every remaining column.
y = df.loc[:, ["h_goals", "a_goals"]]
X = df.drop(["h_goals", "a_goals"], axis=1)

In [None]:
# Convert both frames to plain NumPy arrays for scikit-learn.
X_np, y_np = (np.array(frame) for frame in (X, y))

In [None]:
# Hold out 20% of the samples for testing. `random_state` pins the shuffle so
# the same rows land in train/test on every run; without it each execution
# produces a different split and the model outputs below are not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y_np, test_size=0.2, random_state=42
)

In [None]:
# Show the shape of each split, one per line: X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(40, 6)
(10, 6)
(40, 2)
(10, 2)


**INSIGHTS**

CORRELATION

In [None]:
# Transpose so each row holds one feature (or target) across all samples.
X_t, y_t = X_np.transpose(), y_np.transpose()

In [None]:
# Sanity check: one row per feature, one column per sample.
X_t.shape

(6, 50)

In [None]:
# Sanity check: one row per target (h_goals, a_goals), one column per sample.
y_t.shape

(2, 50)

In [None]:
# Correlation (pandas `Series.corr`, Pearson by default) of every feature with
# each target. corrs_1[k] is feature k vs. h_goals (row 0 of y_t) and
# corrs_2[k] is feature k vs. a_goals (row 1 of y_t).
# The target Series do not change between features, so they are built once,
# and the loop runs over the actual rows of X_t instead of a hard-coded count.
temp_y1 = pd.Series(y_t[0])
temp_y2 = pd.Series(y_t[1])
feature_series = [pd.Series(feature_row) for feature_row in X_t]
corrs_1 = [temp_y1.corr(feature) for feature in feature_series]
corrs_2 = [temp_y2.corr(feature) for feature in feature_series]

In [None]:
# Correlation of each of the six features with h_goals, in feature-column order.
corrs_1

[-0.16591898873880306,
 0.16491626674072948,
 0.10788988552112636,
 -0.025304949438879128,
 -0.07651794438563209,
 0.22868296790971385]

In [None]:
# Correlation of each of the six features with a_goals, in feature-column order.
corrs_2

[0.2602650136878104,
 0.15526966484187646,
 -0.056994145277039714,
 -0.02840539938050075,
 0.019005101860027112,
 -0.03084143665330193]

More resources/ideas for things to do - 
https://realpython.com/pandas-python-explore-dataset/, 
https://towardsdatascience.com/a-beginners-guide-to-data-analysis-in-python-188706df5447

**LINEAR REGRESSION**

In [None]:
# Ordinary least-squares baseline, fitted on the two-column target
# (h_goals, a_goals) in a single call.
regr = linear_model.LinearRegression()
# Left as the last expression so the notebook displays the fitted estimator.
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Predict (h_goals, a_goals) for the held-out samples.
preds = regr.predict(X_test)
# Round to the nearest whole goal before casting. A bare `astype(int)`
# truncates toward zero (2.9 -> 2), which systematically under-predicts.
# `np.rint` rounds exact halves to the nearest even integer.
np.rint(preds).astype(int)

array([[3, 1],
       [3, 2],
       [3, 3],
       [3, 1],
       [3, 4],
       [3, 2],
       [4, 2],
       [3, 3],
       [5, 1],
       [4, 2]])

In [None]:
# Actual (h_goals, a_goals) for the held-out samples, for comparison with the predictions.
y_test

array([[3, 7],
       [4, 3],
       [3, 4],
       [1, 7],
       [1, 1],
       [4, 4],
       [6, 0],
       [4, 1],
       [7, 6],
       [6, 4]])

**SUPPORT VECTOR REGRESSOR**

In [None]:
# SVR is wrapped in MultiOutputRegressor so that both target columns
# (h_goals, a_goals) can be fitted and predicted together.
svr = SVR(epsilon=0.2)
mor = MultiOutputRegressor(svr)
mor = mor.fit(X_train, y_train)
preds = mor.predict(X_test)
# Round to the nearest whole goal before casting. A bare `astype(int)`
# truncates toward zero (2.9 -> 2), which systematically under-predicts.
# `np.rint` rounds exact halves to the nearest even integer.
np.rint(preds).astype(int)

array([[3, 1],
       [4, 2],
       [3, 3],
       [3, 2],
       [4, 2],
       [2, 2],
       [4, 2],
       [4, 2],
       [4, 2],
       [4, 2]])

**RANDOM FOREST REGRESSOR**

In [None]:
# Random forest with max_depth=2 and a fixed random_state, fitted directly on
# the two-column target (no MultiOutputRegressor wrapper is used here).
# NOTE: this rebinds `regr`, replacing the LinearRegression model from above.
regr = RandomForestRegressor(max_depth=2, random_state=0)
# Left as the last expression so the notebook displays the fitted estimator.
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
# Predict (h_goals, a_goals) for the held-out samples with the random forest.
preds = regr.predict(X_test)
# Round to the nearest whole goal before casting. A bare `astype(int)`
# truncates toward zero (2.9 -> 2), which systematically under-predicts.
# `np.rint` rounds exact halves to the nearest even integer.
np.rint(preds).astype(int)

array([[3, 2],
       [3, 3],
       [3, 4],
       [3, 2],
       [3, 3],
       [3, 2],
       [3, 3],
       [4, 2],
       [3, 2],
       [3, 2]])