# Anna: Google Colab Import

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Import and read the final merged dataset file
from google.colab import files
 
 
uploaded = files.upload()

Saving Both_Sources_CleanedCombined.csv to Both_Sources_CleanedCombined.csv


In [18]:
# Import and read the dataset
import pandas as pd 
Merged_data_df = pd.read_csv("Both_Sources_CleanedCombined.csv")
Merged_data_df.head()

Unnamed: 0.1,Unnamed: 0,match_number,date,home_team,away_team,home_team_continent,away_team_continent,home_team_fifa_rank,away_team_fifa_rank,tournament,...,home_team_result,short_name,age,dob,height_cm,weight_kg,nationality_name,preferred_foot,body_type,player_traits
0,121946,23365,2022-01-12,Iceland,Uganda,Europe,Africa,62,82,Friendly,...,Draw,B. Sævarsson,36,11/11/84,187,74,Iceland,Right,Lean (185+),
1,121947,23365,2022-01-12,Iceland,Uganda,Europe,Africa,62,82,Friendly,...,Draw,H. Halldórsson,37,4/27/84,193,88,Iceland,Right,Normal (185+),Cautious With Crosses
2,121948,23365,2022-01-12,Iceland,Uganda,Europe,Africa,62,82,Friendly,...,Draw,K. Árnason,38,10/13/82,191,82,Iceland,Right,Lean (185+),"Long Throw-in, Leadership"
3,121949,23365,2022-01-12,Iceland,Uganda,Europe,Africa,62,82,Friendly,...,Draw,B. Bjarnason,33,5/27/88,183,77,Iceland,Right,Normal (170-185),"Long Throw-in, Power Header"
4,121950,23365,2022-01-12,Iceland,Uganda,Europe,Africa,62,82,Friendly,...,Draw,A. Sampsted,23,4/6/98,180,72,Iceland,Right,Lean (170-185),


In [19]:
Merged_data_df.drop(["date", "match_number", "short_name", "Unnamed: 0"], axis=1, inplace=True)

# Bess: Import from Postgres Code

In [None]:
# Import Password for Database & dependencies
from config import db_password
import pandas as pd
from sqlalchemy import create_engine

In [None]:
# Create database connection string
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/fifa_world_cup"

In [None]:
# Create the database engine
engine = create_engine(db_string)

In [None]:
# Connect to Postgres Database
# dataframe name = pd.read_sql_query('select * from "table_name"',con=engine)
Merged_data_df = pd.read_sql_query('select * from "combined_table"',con=engine)

In [None]:
Merged_data_df.drop(["date", "match_number", "short_name"], axis=1, inplace=True)

# Anna: Machine Learning Code

In [20]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [21]:
# Determine the number of unique values in each column
Merged_data_df.nunique()

home_team               31
away_team               67
home_team_continent      5
away_team_continent      6
home_team_fifa_rank     42
away_team_fifa_rank     67
tournament               5
country                 33
home_team_result         3
age                     23
dob                    649
height_cm               36
weight_kg               40
nationality_name        31
preferred_foot           2
body_type               10
player_traits          281
dtype: int64

In [23]:
# Generate our categorical variable list
home_team_result = Merged_data_df.dtypes[Merged_data_df.dtypes == "object"].index.tolist()

In [24]:
# Check the number of unique values in each column
Merged_data_df[home_team_result].nunique()

home_team               31
away_team               67
home_team_continent      5
away_team_continent      6
tournament               5
country                 33
home_team_result         3
dob                    649
nationality_name        31
preferred_foot           2
body_type               10
player_traits          281
dtype: int64

In [25]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(Merged_data_df[home_team_result]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(home_team_result)
encode_df.head()



Unnamed: 0,home_team_Argentina,home_team_Australia,home_team_Austria,home_team_Belgium,home_team_Brazil,home_team_Canada,home_team_China PR,home_team_Czech Republic,home_team_Denmark,home_team_England,...,"player_traits_Speed Dribbler (AI), Outside Foot Shot","player_traits_Speed Dribbler (AI), Power Header, Team Player","player_traits_Speed Dribbler (AI), Team Player","player_traits_Speed Dribbler (AI), Technical Dribbler (AI)",player_traits_Team Player,"player_traits_Team Player, Comes For Crosses","player_traits_Team Player, Rushes Out Of Goal, Comes For Crosses","player_traits_Team Player, Technical Dribbler (AI)",player_traits_Technical Dribbler (AI),player_traits_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
# Merge one-hot encoded features and drop the originals
Merged_data_df = Merged_data_df.merge(encode_df,left_index=True, right_index=True)
Merged_data_df = Merged_data_df.drop(home_team_result,1)
Merged_data_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,home_team_fifa_rank,away_team_fifa_rank,age,height_cm,weight_kg,home_team_Argentina,home_team_Australia,home_team_Austria,home_team_Belgium,home_team_Brazil,...,"player_traits_Speed Dribbler (AI), Outside Foot Shot","player_traits_Speed Dribbler (AI), Power Header, Team Player","player_traits_Speed Dribbler (AI), Team Player","player_traits_Speed Dribbler (AI), Technical Dribbler (AI)",player_traits_Team Player,"player_traits_Team Player, Comes For Crosses","player_traits_Team Player, Rushes Out Of Goal, Comes For Crosses","player_traits_Team Player, Technical Dribbler (AI)",player_traits_Technical Dribbler (AI),player_traits_nan
0,62,82,36,187,74,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,62,82,37,193,88,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,62,82,38,191,82,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,62,82,33,183,77,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,62,82,23,180,72,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [32]:
training_data, testing_data = train_test_split(Merged_data_df, test_size=0.2, random_state=50)

print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 1950
No. of testing examples: 488


In [33]:
# Split our preprocessed data into our features and target arrays
y = Merged_data_df["home_team_result_Win"].values
X = Merged_data_df.drop(["home_team_result_Lose", "home_team_result_Win"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50)

  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 100
hidden_nodes_layer2 = 80
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 100)               112800    
                                                                 
 dense_5 (Dense)             (None, 80)                8080      
                                                                 
 dense_6 (Dense)             (None, 10)                810       
                                                                 
 dense_7 (Dense)             (None, 1)                 11        
                                                                 
Total params: 121,701
Trainable params: 121,701
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [37]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [38]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

20/20 - 0s - loss: 0.0029 - accuracy: 0.9984 - 124ms/epoch - 6ms/step
Loss: 0.002855140483006835, Accuracy: 0.9983606338500977
