# About
This notebook contains a first simple ML model

In [88]:
# Necessary to import custom modules
import os
os.chdir("/home/jovyan/work")

from neomodel import db
import pandas as pd

from src.utils import init_connection

In [89]:
init_connection()

In [90]:
cypher = """
MATCH 
    (t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season), (arena_game:Arena), (arena_t:Arena)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	(g)-[:LOCATED_IN]->(arena_game)
AND
	(t)-[:HAVE_HOME_COURT_AT]->(arena_t)
RETURN 
    t.name as team, 
    sum(
    	CASE WHEN s.score > s2.score 
        AND g.game_type = "regular_season" 
        AND arena_game.name = arena_t.name
        THEN 1 ELSE 0 END) 
    as wins_reg_home,
    sum(
    	CASE WHEN s.score > s2.score 
        AND g.game_type = "regular_season" 
        AND arena_game.name <> arena_t.name
        THEN 1 ELSE 0 END) 
    as wins_reg_away,
    sum(
    	CASE WHEN s.score < s2.score 
        AND g.game_type = "regular_season" 
        AND arena_game.name = arena_t.name
        THEN 1 ELSE 0 END) 
    as losses_reg_home,
    sum(
    	CASE WHEN s.score < s2.score
        AND g.game_type = "regular_season" 
        AND arena_game.name <> arena_t.name
        THEN 1 ELSE 0 END) as losses_reg_away,
	sea.name as season
ORDER BY 
    team, season
"""

In [91]:
data, columns = db.cypher_query(cypher)

In [92]:
df_results = pd.DataFrame(data=data, columns=columns)

In [93]:
df_test = df_results.pivot(index="team", columns="season")

In [94]:
cols = []
for a, b in zip(df_test.columns.get_level_values(0), df_test.columns.get_level_values(1)):
    cols.append(a+"_"+b)

In [95]:
cols

['wins_reg_home_2015/2016',
 'wins_reg_home_2016/2017',
 'wins_reg_home_2017/2018',
 'wins_reg_away_2015/2016',
 'wins_reg_away_2016/2017',
 'wins_reg_away_2017/2018',
 'losses_reg_home_2015/2016',
 'losses_reg_home_2016/2017',
 'losses_reg_home_2017/2018',
 'losses_reg_away_2015/2016',
 'losses_reg_away_2016/2017',
 'losses_reg_away_2017/2018']

In [96]:
df_test.columns = cols

In [97]:
df_test.head()

Unnamed: 0_level_0,wins_reg_home_2015/2016,wins_reg_home_2016/2017,wins_reg_home_2017/2018,wins_reg_away_2015/2016,wins_reg_away_2016/2017,wins_reg_away_2017/2018,losses_reg_home_2015/2016,losses_reg_home_2016/2017,losses_reg_home_2017/2018,losses_reg_away_2015/2016,losses_reg_away_2016/2017,losses_reg_away_2017/2018
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Atlanta Hawks,27,23,16,21,20,8,14,18,25,20,21,33
Boston Celtics,28,30,27,20,23,28,13,11,14,21,18,13
Brooklyn Nets,14,13,15,7,7,13,27,28,26,34,34,28
Charlotte Hornets,30,22,21,18,14,15,11,19,20,23,27,26
Chicago Bulls,26,25,17,16,16,10,15,16,24,25,25,31


#### Extract Matchup

```
MATCH 
	(t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	sea.name = "2017/2018"
AND
	g.game_type = "regular_season"
WITH g, collect(t.name) as teams, collect(s.score) as scores
RETURN
	HEAD(teams) as teamA, HEAD(TAIL(scores)) as scoreA, HEAD(TAIL(teams)) as teamB, HEAD(scores) as scoreB
```

In [152]:
cypher = """
MATCH 
	(t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	sea.name = "2017/2018"
AND
	g.game_type = "regular_season"
WITH g, collect(t.name) as teams, collect(s.score) as scores
RETURN
	HEAD(teams) as teamA, HEAD(TAIL(scores)) as scoreA, HEAD(TAIL(teams)) as teamB, HEAD(scores) as scoreB
"""

In [153]:
data, cols = db.cypher_query(cypher)

In [154]:
df_matches = pd.DataFrame(data=data, columns=cols)

In [155]:
df_matches["teamAWinner"] =df_matches["scoreA"] > df_matches["scoreB"]

In [156]:
df_test.head().reset_index()

Unnamed: 0,team,wins_reg_home_2015/2016,wins_reg_home_2016/2017,wins_reg_home_2017/2018,wins_reg_away_2015/2016,wins_reg_away_2016/2017,wins_reg_away_2017/2018,losses_reg_home_2015/2016,losses_reg_home_2016/2017,losses_reg_home_2017/2018,losses_reg_away_2015/2016,losses_reg_away_2016/2017,losses_reg_away_2017/2018
0,Atlanta Hawks,27,23,16,21,20,8,14,18,25,20,21,33
1,Boston Celtics,28,30,27,20,23,28,13,11,14,21,18,13
2,Brooklyn Nets,14,13,15,7,7,13,27,28,26,34,34,28
3,Charlotte Hornets,30,22,21,18,14,15,11,19,20,23,27,26
4,Chicago Bulls,26,25,17,16,16,10,15,16,24,25,25,31


In [157]:
df_matches.head()

Unnamed: 0,teamA,scoreA,teamB,scoreB,teamAWinner
0,Miami Heat,79,Cleveland Cavaliers,98,False
1,Atlanta Hawks,117,Milwaukee Bucks,106,True
2,Utah Jazz,106,Brooklyn Nets,114,False
3,Indiana Pacers,95,Cleveland Cavaliers,97,False
4,Cleveland Cavaliers,111,Memphis Grizzlies,116,False


In [158]:
pd.merge(df_matches, df_test.reset_index(), left_on="teamA", right_on="team").head()

Unnamed: 0,teamA,scoreA,teamB,scoreB,teamAWinner,team,wins_reg_home_2015/2016,wins_reg_home_2016/2017,wins_reg_home_2017/2018,wins_reg_away_2015/2016,wins_reg_away_2016/2017,wins_reg_away_2017/2018,losses_reg_home_2015/2016,losses_reg_home_2016/2017,losses_reg_home_2017/2018,losses_reg_away_2015/2016,losses_reg_away_2016/2017,losses_reg_away_2017/2018
0,Miami Heat,79,Cleveland Cavaliers,98,False,Miami Heat,28,23,26,20,18,18,13,18,15,21,23,23
1,Miami Heat,141,Denver Nuggets,149,False,Miami Heat,28,23,26,20,18,18,13,18,15,21,23,23
2,Miami Heat,96,Boston Celtics,90,True,Miami Heat,28,23,26,20,18,18,13,18,15,21,23,23
3,Miami Heat,109,New Orleans Pelicans,94,True,Miami Heat,28,23,26,20,18,18,13,18,15,21,23,23
4,Miami Heat,120,Indiana Pacers,95,True,Miami Heat,28,23,26,20,18,18,13,18,15,21,23,23


In [159]:
df_ml = pd.merge(
    pd.merge(df_matches, df_test.reset_index(), left_on="teamA", right_on="team"),
    df_test.reset_index(), 
    left_on="teamB", 
    right_on="team",
    suffixes=["_A", "_B"]
).drop(
    ["scoreA", "scoreB", "teamA", "teamB", "team_A", "team_B", "wins_reg_home_2017/2018_A", "wins_reg_away_2017/2018_A", "wins_reg_home_2017/2018_B", "wins_reg_away_2017/2018_B", "losses_reg_home_2017/2018_A", "losses_reg_away_2017/2018_A", "losses_reg_home_2017/2018_B", "losses_reg_away_2017/2018_B"], 
    axis=1)

In [160]:
df_ml.head()

Unnamed: 0,teamAWinner,wins_reg_home_2015/2016_A,wins_reg_home_2016/2017_A,wins_reg_away_2015/2016_A,wins_reg_away_2016/2017_A,losses_reg_home_2015/2016_A,losses_reg_home_2016/2017_A,losses_reg_away_2015/2016_A,losses_reg_away_2016/2017_A,wins_reg_home_2015/2016_B,wins_reg_home_2016/2017_B,wins_reg_away_2015/2016_B,wins_reg_away_2016/2017_B,losses_reg_home_2015/2016_B,losses_reg_home_2016/2017_B,losses_reg_away_2015/2016_B,losses_reg_away_2016/2017_B
0,False,28,23,20,18,13,18,21,23,33,31,24,20,8,10,17,21
1,True,27,23,21,20,14,18,20,21,33,31,24,20,8,10,17,21
2,True,27,23,21,20,14,18,20,21,33,31,24,20,8,10,17,21
3,False,24,29,16,22,17,12,25,19,33,31,24,20,8,10,17,21
4,False,26,29,19,13,15,12,22,28,33,31,24,20,8,10,17,21


In [28]:
from sklearn.linear_model import LogisticRegression

In [29]:
lr = LogisticRegression()

In [30]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    df_ml.drop("teamAWinner", axis=1), 
    df_ml["teamAWinner"]
)

lr.fit(X_train, y_train)

lr.score(X_test, y_test)

0.62987012987012991

# Average Win Margin per Team 

## Validierung der eigentlichen Cypher-Query

In [161]:
cypher = """
MATCH 
	(t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	sea.name = "2017/2018"
AND
	g.game_type = "regular_season"
WITH g, collect(t.name) as teams, collect(s.score) as scores
RETURN
	HEAD(teams) as teamA, HEAD(TAIL(scores)) as scoreA, HEAD(TAIL(teams)) as teamB, HEAD(scores) as scoreB
"""

In [162]:
data, cols = db.cypher_query(cypher)

In [163]:
df_matchesX = pd.DataFrame(data=data, columns=cols)

In [164]:
df_matchesX.head()

Unnamed: 0,teamA,scoreA,teamB,scoreB
0,Miami Heat,79,Cleveland Cavaliers,98
1,Atlanta Hawks,117,Milwaukee Bucks,106
2,Utah Jazz,106,Brooklyn Nets,114
3,Indiana Pacers,95,Cleveland Cavaliers,97
4,Cleveland Cavaliers,111,Memphis Grizzlies,116


In [165]:
df_matches2 = df_matchesX

In [166]:
df_matches2["scoreMargin"] = abs(df_matches2["scoreA"]-df_matches2["scoreB"])

In [167]:
df_matches2.head()

Unnamed: 0,teamA,scoreA,teamB,scoreB,scoreMargin
0,Miami Heat,79,Cleveland Cavaliers,98,19
1,Atlanta Hawks,117,Milwaukee Bucks,106,11
2,Utah Jazz,106,Brooklyn Nets,114,8
3,Indiana Pacers,95,Cleveland Cavaliers,97,2
4,Cleveland Cavaliers,111,Memphis Grizzlies,116,5


In [168]:
a = ["Atlanta Hawks"]

In [169]:
df_onlyBOSmatches = df_matches2[(df_matches2['teamA'].isin(a)) | (df_matches2['teamB'].isin(a)) ]

In [170]:
df_onlyBOSmatches["scoreMargin"].mean()

10.865853658536585

## Actual Cypher Query

In [171]:
cypher = """
MATCH 
    (t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	g.game_type= "regular_season"
RETURN 
    t.name as team, 
    avg(abs(s.score - s2.score)) as scoreMargin,
    sea.name as season
ORDER BY 
    team, season

"""

In [172]:
data, cols = db.cypher_query(cypher)

In [173]:
df_scoreMargin = pd.DataFrame(data=data, columns=cols)

In [174]:
df_scoreMargin.head()

Unnamed: 0,team,scoreMargin,season
0,Atlanta Hawks,11.634146,2015/2016
1,Atlanta Hawks,11.170732,2016/2017
2,Atlanta Hawks,10.865854,2017/2018
3,Boston Celtics,10.695122,2015/2016
4,Boston Celtics,9.195122,2016/2017


In [175]:
df_scoreMargin_test = df_scoreMargin.pivot(index="team", columns="season")

In [176]:
df_scoreMargin_test.head()

Unnamed: 0_level_0,scoreMargin,scoreMargin,scoreMargin
season,2015/2016,2016/2017,2017/2018
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Atlanta Hawks,11.634146,11.170732,10.865854
Boston Celtics,10.695122,9.195122,9.219512
Brooklyn Nets,10.743902,12.195122,10.231707
Charlotte Hornets,11.134146,10.195122,11.768293
Chicago Bulls,9.865854,12.036585,12.231707


In [177]:
cols = []
for a, b in zip(df_scoreMargin_test.columns.get_level_values(0), df_scoreMargin_test.columns.get_level_values(1)):
    cols.append(a+"_"+b)

In [178]:
cols

['scoreMargin_2015/2016', 'scoreMargin_2016/2017', 'scoreMargin_2017/2018']

In [179]:
df_scoreMargin_test.columns = cols

In [180]:
df_scoreMargin_test.head()

Unnamed: 0_level_0,scoreMargin_2015/2016,scoreMargin_2016/2017,scoreMargin_2017/2018
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atlanta Hawks,11.634146,11.170732,10.865854
Boston Celtics,10.695122,9.195122,9.219512
Brooklyn Nets,10.743902,12.195122,10.231707
Charlotte Hornets,11.134146,10.195122,11.768293
Chicago Bulls,9.865854,12.036585,12.231707


In [181]:
df_ml_1 = pd.merge(
    pd.merge(df_matches, df_test.reset_index(), left_on="teamA", right_on="team"),
    df_test.reset_index(), 
    left_on="teamB", 
    right_on="team",
    suffixes=["_A", "_B"]
)

In [182]:
df_ml_2 = pd.merge(
    pd.merge(df_ml_1, df_scoreMargin_test.reset_index(), left_on="teamA", right_on="team"),
    df_scoreMargin_test.reset_index(), 
    left_on="teamB", 
    right_on="team",
    suffixes=["_A", "_B"]
).drop(
    ["scoreA", "scoreB", "teamA", "teamB", "team_A", "team_B", "wins_reg_2017/2018_A", "losses_reg_2017/2018_A", "wins_reg_2017/2018_B", "losses_reg_2017/2018_B", "scoreMargin_2017/2018_A", "scoreMargin_2017/2018_B" ], 
    axis=1)

In [183]:
df_ml_2.head()

Unnamed: 0,teamAWinner,wins_reg_home_2015/2016_A,wins_reg_home_2016/2017_A,wins_reg_home_2017/2018_A,wins_reg_away_2015/2016_A,wins_reg_away_2016/2017_A,wins_reg_away_2017/2018_A,losses_reg_home_2015/2016_A,losses_reg_home_2016/2017_A,losses_reg_home_2017/2018_A,...,losses_reg_home_2015/2016_B,losses_reg_home_2016/2017_B,losses_reg_home_2017/2018_B,losses_reg_away_2015/2016_B,losses_reg_away_2016/2017_B,losses_reg_away_2017/2018_B,scoreMargin_2015/2016_A,scoreMargin_2016/2017_A,scoreMargin_2015/2016_B,scoreMargin_2016/2017_B
0,False,28,23,26,20,18,18,13,18,15,...,8,10,12,17,21,20,11.060976,9.890244,11.707317,12.134146
1,True,27,23,16,21,20,8,14,18,25,...,8,10,12,17,21,20,11.634146,11.170732,11.707317,12.134146
2,True,27,23,16,21,20,8,14,18,25,...,8,10,12,17,21,20,11.634146,11.170732,11.707317,12.134146
3,False,24,29,28,16,22,20,17,12,13,...,8,10,12,17,21,20,11.378049,11.207317,11.707317,12.134146
4,False,26,29,27,19,13,21,15,12,14,...,8,10,12,17,21,20,9.658537,11.707317,11.707317,12.134146


## Score Margin splitted into home and away

In [184]:
cypher = """
MATCH 
    (t:Team)-[:SCORED]->(s:Score)-[:IN_GAME]->(g:Game)<-[:IN_GAME]-(s2:Score)<-[:SCORED]-(t2:Team), (sea:Season)
WHERE 
    (g)-[:TOOK_PLACE_IN]->(sea)
AND
	g.game_type= "regular_season"
RETURN 
    t.name as team, 
    sea.name as season,
    CASE WHEN s.score > s2.score THEN avg(abs(s.score - s2.score)) ELSE 0 END as scoreMarginWins,
    CASE WHEN s.score < s2.score THEN avg(abs(s.score - s2.score)) ELSE 0 END as scoreMarginLosses

ORDER BY 
    team, season

"""

In [185]:
data, cols = db.cypher_query(cypher)

In [186]:
df_scoreMargin = pd.DataFrame(data=data, columns=cols)

In [187]:
df_scoreMargin.head()

Unnamed: 0,team,season,scoreMarginWins,scoreMarginLosses
0,Atlanta Hawks,2015/2016,0.0,9.676471
1,Atlanta Hawks,2015/2016,13.020833,0.0
2,Atlanta Hawks,2016/2017,0.0,12.641026
3,Atlanta Hawks,2016/2017,9.837209,0.0
4,Atlanta Hawks,2017/2018,0.0,11.534483


In [188]:
df_scoreMargin_home_away = df_scoreMargin.groupby(["team", "season"]).sum().reset_index()

In [189]:
df_scoreMargin_test = df_scoreMargin_home_away.pivot(index="team", columns="season")

In [190]:
cols = []
for a, b in zip(df_scoreMargin_test.columns.get_level_values(0), df_scoreMargin_test.columns.get_level_values(1)):
    cols.append(a+"_"+b)

In [191]:
df_scoreMargin_test.columns = cols

## Validation of numbers

In [192]:
df_scoreMargin_home_away[df_scoreMargin_home_away["team"] == "Houston Rockets"]

Unnamed: 0,team,season,scoreMarginWins,scoreMarginLosses
30,Houston Rockets,2015/2016,10.268293,9.878049
31,Houston Rockets,2016/2017,13.490909,9.962963
32,Houston Rockets,2017/2018,13.076923,9.117647


(4+5+1+18+7+11+2+3+17+13)

(6+10+6+10+16+8+8)

In [193]:
test = [14,5,1,18,7,11,2,3,17,13,6,10,6,10,16,8,8]

In [194]:
import numpy as np
np.mean(test)

9.117647058823529

# Models

In [195]:
from sklearn.linear_model import LogisticRegression

In [196]:
lr = LogisticRegression()

In [197]:
from sklearn.model_selection import train_test_split

In [205]:
X_train, X_test, y_train, y_test = train_test_split(
    df_ml_2.drop("teamAWinner", axis=1), 
    df_ml_2["teamAWinner"]
)

lr.fit(X_train, y_train)

lr.score(X_test, y_test)

0.7142857142857143

In [202]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [200]:
X_train, X_test, y_train, y_test = train_test_split(
    df_ml_2.drop("teamAWinner", axis=1), 
    df_ml_2["teamAWinner"]
)
clf.fit(X_train, y_train) 
clf.score(X_test, y_test)

0.59740259740259738