# Data Cleaning
This is the full data cleaning report.

## Import Libraries

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math

## Import CSV Files

In [27]:
inter_mth=pd.read_csv('international_matches.csv')
wld_mth=pd.read_csv('world_cup_matches.csv')
fifa_rank=pd.read_csv('2022_world_cup_groups.csv')

## Brief Overview

There are 3 csv files in this analysis and I imported them all. Let me break it down:<br>

1st file: international_matches.csv => This is a table of international soccer matches since 1872.<br>
2nd file: 2022_world_cup_matches.csv => This is a table of the scheduled matches for the 2022 World Cup.<br>
3rd file: 2022_world_cup_groups.csv => This is a table of the current groups for the 2022 World Cup.<br>

In [28]:
inter_mth.head(5)

Unnamed: 0,ID,Tournament,Date,Home Team,Home Goals,Away Goals,Away Team,Winning Team,Losing Team,Win Conditions,Home Stadium
0,1,Friendly,1872-11-30,Scotland,0,0,England,,,,True
1,2,Friendly,1873-03-08,England,4,2,Scotland,England,Scotland,,True
2,3,Friendly,1874-03-07,Scotland,2,1,England,Scotland,England,,True
3,4,Friendly,1875-03-06,England,2,2,Scotland,,,,True
4,5,Friendly,1876-03-04,Scotland,3,0,England,Scotland,England,,True


In [29]:
wld_mth.head(5)

Unnamed: 0,ID,Year,Date,Stage,Home Team,Home Goals,Away Goals,Away Team,Winning Team,Losing Team,Win Conditions,Host Team
0,1,1930,1930/7/13,Group stage,France,4,1,Mexico,France,Mexico,,False
1,2,1930,1930/7/13,Group stage,United States,3,0,Belgium,United States,Belgium,,False
2,3,1930,1930/7/14,Group stage,Yugoslavia,2,1,Brazil,Yugoslavia,Brazil,,False
3,4,1930,1930/7/14,Group stage,Romania,3,1,Peru,Romania,Peru,,False
4,5,1930,1930/7/15,Group stage,Argentina,1,0,France,Argentina,France,,False


In [30]:
fifa_rank.head(5)

Unnamed: 0,Group,Team,FIFA Ranking
0,A,Qatar,50
1,A,Ecuador,44
2,A,Senegal,18
3,A,Netherlands,8
4,B,England,5


## Evaluate data types across all 3 imported files

In [31]:
inter_mth.dtypes

ID                 int64
Tournament        object
Date              object
Home Team         object
Home Goals         int64
Away Goals         int64
Away Team         object
Winning Team      object
Losing Team       object
Win Conditions    object
Home Stadium        bool
dtype: object

In [32]:
wld_mth.dtypes

ID                 int64
Year               int64
Date              object
Stage             object
Home Team         object
Home Goals         int64
Away Goals         int64
Away Team         object
Winning Team      object
Losing Team       object
Win Conditions    object
Host Team           bool
dtype: object

In [33]:
fifa_rank.dtypes

Group           object
Team            object
FIFA Ranking     int64
dtype: object

## Dropping Column
I will remove the Win Condition column as it has null data in both international_matches.csv & 2022_world_cup_matches.csv files.

In [34]:
inter_mth = inter_mth.drop(columns=['Win Conditions'])
inter_mth.head()

Unnamed: 0,ID,Tournament,Date,Home Team,Home Goals,Away Goals,Away Team,Winning Team,Losing Team,Home Stadium
0,1,Friendly,1872-11-30,Scotland,0,0,England,,,True
1,2,Friendly,1873-03-08,England,4,2,Scotland,England,Scotland,True
2,3,Friendly,1874-03-07,Scotland,2,1,England,Scotland,England,True
3,4,Friendly,1875-03-06,England,2,2,Scotland,,,True
4,5,Friendly,1876-03-04,Scotland,3,0,England,Scotland,England,True


In [35]:
wld_math = wld_mth.drop(columns=['Win Conditions'])
wld_mth.head()

Unnamed: 0,ID,Year,Date,Stage,Home Team,Home Goals,Away Goals,Away Team,Winning Team,Losing Team,Win Conditions,Host Team
0,1,1930,1930/7/13,Group stage,France,4,1,Mexico,France,Mexico,,False
1,2,1930,1930/7/13,Group stage,United States,3,0,Belgium,United States,Belgium,,False
2,3,1930,1930/7/14,Group stage,Yugoslavia,2,1,Brazil,Yugoslavia,Brazil,,False
3,4,1930,1930/7/14,Group stage,Romania,3,1,Peru,Romania,Peru,,False
4,5,1930,1930/7/15,Group stage,Argentina,1,0,France,Argentina,France,,False


## Converting Date Column into Year

In [36]:
inter_mth['Year']=pd.DatetimeIndex(inter_mth['Date']).year

In [37]:
grp=fifa_rank.groupby('Group')
Fifa_grp_2022=list(fifa_rank['Team'].unique())
Fifa_grp_2022

['Qatar',
 'Ecuador',
 'Senegal',
 'Netherlands',
 'England',
 'Iran',
 'United States',
 'Wales',
 'Argentina',
 'Saudi Arabia',
 'Mexico',
 'Poland',
 'France',
 'Australia',
 'Denmark',
 'Tunisia',
 'Spain',
 'Costa Rica',
 'Germany',
 'Japan',
 'Belgium',
 'Canada',
 'Morocco',
 'Croatia',
 'Brazil',
 'Serbia',
 'Switzerland',
 'Cameroon',
 'Portugal',
 'Ghana',
 'Uruguay',
 'South Korea']

## Final Shape of the Data Frames:

In [40]:
inter_mth.shape

(17769, 11)

In [41]:
wld_math.shape

(900, 11)