### Task: Explore the datasets and segregate the forwards from other players based on position

### Import the libraries

In [1]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pymongo import MongoClient
import matplotlib.pyplot as plt

In [18]:
# Display the value of every top-level expression in a cell, not only the last one.
# NOTE(review): the execution counter shows this cell was run late (In [18]),
# after the cells that follow it here — confirm the intended position so that
# Restart & Run All reproduces the same outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Set up the MongoDB connection in Python

In [2]:
# Open a client against the MongoDB server on this machine (default port)
# and take a handle on the 'Wyscout' database used by the rest of the notebook.
client = MongoClient(host='localhost', port=27017)
db = client.get_database('Wyscout')

### Select only forwards from player_advance_stats and write them to a DataFrame

In [5]:
# Position codes that this notebook treats as forward roles.
FORWARD_POSITION_CODES = ["lwf", "rwf", "rw", "lw", "ss", "cf"]

# Pipeline producing one document per (player record, position) pair whose
# position code is one of the forward codes above.
forward_pipeline = [
    # Emit one document per entry of `positions`.
    {"$unwind": "$positions"},
    # Filter before projecting so fewer documents reach the next stage;
    # $in replaces the chain of $or equality tests on the same field.
    {"$match": {"positions.position.code": {"$in": FORWARD_POSITION_CODES}}},
    # Keep only the identifying fields plus the matched position's code and name.
    {"$project": {
        "_id": 0,
        "playerId": 1,
        "competitionId": 1,
        "seasonId": 1,
        "positions.position.code": 1,
        "positions.position.name": 1,
    }},
]

forward_details = db.player_advance_stats.aggregate(forward_pipeline)
forward_details_df = pd.DataFrame(forward_details)

In [6]:
# Preview the first five forward rows (head() defaults to n=5).
forward_details_df.head()

Unnamed: 0,playerId,competitionId,seasonId,positions
0,25413,364,185727,"{'position': {'name': 'Striker', 'code': 'cf'}}"
1,25867,364,185727,"{'position': {'name': 'Striker', 'code': 'cf'}}"
2,247671,364,185727,"{'position': {'name': 'Right Wing Forward', 'c..."
3,247671,364,185727,"{'position': {'name': 'Striker', 'code': 'cf'}}"
4,346158,364,185727,"{'position': {'name': 'Striker', 'code': 'cf'}}"


### consider only the required columns

In [7]:
# Keep only the identifier columns; the nested `positions` column is dropped.
forward_details_df1 = forward_details_df.loc[:, ['playerId', 'competitionId', 'seasonId']]

In [8]:
# Row count before de-duplication.
forward_details_df1.shape[0]

213

### Remove duplicate records & compare the number of records

In [9]:
# A player listed under several forward positions appears once per position
# after the unwind; keep the first occurrence of each identical row.
forward_details_df1 = forward_details_df1.drop_duplicates(keep='first')

In [10]:
# Row count after de-duplication.
forward_details_df1.shape[0]

150

#### Merging raised the error 'trying to merge on object and int64 columns', so I first tried converting all the relevant columns to 'int32'.
#### That did not work, so instead each DataFrame is written to CSV and read back as a DataFrame. Reference: https://stackoverflow.com/questions/50649853/trying-to-merge-2-dataframes-but-get-valueerror

In [11]:
# Write the de-duplicated forwards to disk (index included) so they can be
# re-read with freshly inferred column dtypes.
forward_details_df1.to_csv(path_or_buf='forward_details_df1.csv', index=True)

In [12]:
# Read the forwards back; the first CSV column is the saved index.
forward_details_df2 = pd.read_csv(filepath_or_buffer='forward_details_df1.csv', index_col=0)

In [13]:
# Abandoned dtype-conversion attempt, kept for reference only.
# Note: astype() returns a new Series rather than modifying the column, so
# this line would not change the dtype unless its result were assigned back.
# forward_details_df1.playerId.astype(int)

#### Read the 'matches_events_scaling_v02' collection into a DataFrame and, as mentioned above, write it to CSV and read it back into a DataFrame

In [19]:
# Pull every event document, leaving out Mongo's _id field, load the result
# into a DataFrame and preview the first five rows.
event_details = db['matches_events_scaling_v02'].find(filter={}, projection={"_id": 0})
event_details_df = pd.DataFrame(event_details)
event_details_df.head()

Unnamed: 0,index,id,playerId,teamId,matchId,matchPeriod,eventSec,eventId,eventName,subEventId,...,y start,x stop,y stop,tags,match_id,team_id,x start length (meters),y start length (meters),x stop length (meters),y stop length(meters)
0,0,613566210,346158,1609,2829960,1H,0.811034,8,Pass,85,...,51,28.0,63.0,1801.0,2829960,1609,52.5,34.68,29.4,42.84
1,1,613566221,20612,1609,2829960,1H,3.969.135,8,Pass,83,...,63,72.0,6.0,1801.0,2829960,1609,29.4,42.84,75.6,4.08
2,2,613566223,25867,1609,2829960,1H,7.277.111,7,Others on the ball,72,...,6,77.0,0.0,1302.0,2829960,1609,75.6,4.08,80.85,0.0
3,3,613565944,0,1625,2829960,1H,9.668.171,5,Interruption,50,...,100,,,,2829960,1609,24.15,68.0,0.0,0.0
4,4,613565945,8277,1625,2829960,1H,16.161.527,3,Free Kick,36,...,100,3.0,50.0,1801.0,2829960,1609,17.85,68.0,3.15,34.0


In [20]:
# Same CSV round trip as for the forwards: write the events (index included)
# so they can be re-read with freshly inferred column dtypes.
event_details_df.to_csv(path_or_buf='event_details_df.csv', index=True)

In [21]:
# Replace the in-memory events frame with the copy re-read from CSV;
# the first CSV column is the saved index.
event_details_df = pd.read_csv(filepath_or_buffer='event_details_df.csv', index_col=0)

In [22]:
# Quick look at the first two re-loaded event rows.
event_details_df.iloc[:2]

Unnamed: 0,index,id,playerId,teamId,matchId,matchPeriod,eventSec,eventId,eventName,subEventId,...,y start,x stop,y stop,tags,match_id,team_id,x start length (meters),y start length (meters),x stop length (meters),y stop length(meters)
0,0,613566210,346158,1609,2829960,1H,0.811034,8,Pass,85,...,51,28.0,63.0,1801,2829960,1609,52.5,34.68,29.4,42.84
1,1,613566221,20612,1609,2829960,1H,3.969.135,8,Pass,83,...,63,72.0,6.0,1801,2829960,1609,29.4,42.84,75.6,4.08


### Merge both the dataframes on 'playerId' : Final Result

In [23]:
# Left join: keep every forward row and attach that player's events;
# forwards without any event get NaN in the event columns.
# NOTE(review): forward_details_df2 is de-duplicated on
# (playerId, competitionId, seasonId), not on playerId alone, so a player
# present under several competitions/seasons would have each event repeated
# once per such row — confirm this is intended.
ForwardEventDetails = pd.merge(
    forward_details_df2,
    event_details_df,
    how='left',
    on='playerId',
)

In [24]:
# Allow up to 24 columns in DataFrame displays so all merged columns are shown.
pd.options.display.max_columns = 24
ForwardEventDetails.iloc[:3]

Unnamed: 0,playerId,competitionId,seasonId,index,id,teamId,matchId,matchPeriod,eventSec,eventId,eventName,subEventId,subEventName,x start,y start,x stop,y stop,tags,match_id,team_id,x start length (meters),y start length (meters),x stop length (meters),y stop length(meters)
0,25413,364,185727,1266.0,613567906.0,1609.0,2829960.0,2H,1.591.887.326,8.0,Pass,85.0,Simple pass,38.0,61.0,29.0,46.0,1801,2829960.0,1609.0,39.9,41.48,30.45,31.28
1,25413,364,185727,1291.0,613568133.0,1609.0,2829960.0,2H,1.791.359.704,1.0,Duel,12.0,Ground defending duel,34.0,74.0,37.0,87.0,5047011802,2829960.0,1609.0,35.7,50.32,38.85,59.16
2,25413,364,185727,1341.0,613567939.0,1609.0,2829960.0,2H,1.959.299.888,1.0,Duel,11.0,Ground attacking duel,36.0,9.0,35.0,9.0,7031801,2829960.0,1609.0,37.8,6.12,36.75,6.12


In [26]:
# Is there at least one missing value anywhere in the merged frame?
ForwardEventDetails.isna().to_numpy().any()
# Total number of missing cells across all columns.
ForwardEventDetails.isna().sum().sum()

True

5064

In [27]:
# Substitute 0 for every NaN in the merged frame (all columns, including the
# text ones), then preview the first five rows.
ForwardEventDetails = ForwardEventDetails.replace(to_replace=np.nan, value=0)
ForwardEventDetails.head()

Unnamed: 0,playerId,competitionId,seasonId,index,id,teamId,matchId,matchPeriod,eventSec,eventId,eventName,subEventId,subEventName,x start,y start,x stop,y stop,tags,match_id,team_id,x start length (meters),y start length (meters),x stop length (meters),y stop length(meters)
0,25413,364,185727,1266.0,613567906.0,1609.0,2829960.0,2H,1.591.887.326,8.0,Pass,85.0,Simple pass,38.0,61.0,29.0,46.0,1801,2829960.0,1609.0,39.9,41.48,30.45,31.28
1,25413,364,185727,1291.0,613568133.0,1609.0,2829960.0,2H,1.791.359.704,1.0,Duel,12.0,Ground defending duel,34.0,74.0,37.0,87.0,5047011802,2829960.0,1609.0,35.7,50.32,38.85,59.16
2,25413,364,185727,1341.0,613567939.0,1609.0,2829960.0,2H,1.959.299.888,1.0,Duel,11.0,Ground attacking duel,36.0,9.0,35.0,9.0,7031801,2829960.0,1609.0,37.8,6.12,36.75,6.12
3,25413,364,185727,1351.0,613567949.0,1609.0,2829960.0,2H,2.010.140.716,8.0,Pass,85.0,Simple pass,25.0,43.0,22.0,29.0,14011801,2829960.0,1609.0,26.25,29.24,23.1,19.72
4,25413,364,185727,1417.0,613567995.0,1609.0,2829960.0,2H,2.843.618.148,8.0,Pass,85.0,Simple pass,73.0,17.0,75.0,25.0,1802,2829960.0,1609.0,76.65,11.56,78.75,17.0
