# Data Inconsistencies / Anomalies

In [40]:
import pandas as pd

In [53]:
# Toy dataset that deliberately mixes date layouts, country aliases,
# misspelled names and missing sales figures (None).
data = dict(
    date=['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021'],
    country=['USA', 'U.S.A.', 'America', 'United States'],
    name=['Zeeshan', 'Zeeeshan', 'Ali', 'alii'],
    sales_2020=[100, 200, None, 200],
    sales_2021=[None, 150, 300, 150],
)

# One column per dict key, in insertion order.
df = pd.DataFrame.from_dict(data)

In [54]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Zeeshan,100.0,
1,01-12-2022,U.S.A.,Zeeeshan,200.0,150.0
2,2022/12/01,America,Ali,,300.0
3,12-01-2021,United States,alii,200.0,150.0


## Standardize date format

In [55]:

# Parse each value on its own because the column mixes several layouts;
# dayfirst=True resolves ambiguous day/month strings as day-month.
# NOTE(review): how year-first strings such as '2021-12-01' are read under
# dayfirst=True may differ between pandas versions — confirm the displayed result.
parsed_dates = pd.to_datetime(
    df['date'].map(lambda raw: pd.to_datetime(raw, dayfirst=True, errors='coerce'))
)
# Format through the .dt accessor: unparseable values were coerced to NaT, and
# calling .strftime() on a NaT scalar raises, whereas .dt.strftime keeps them missing.
df['date'] = parsed_dates.dt.strftime('%d-%m-%Y')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,12-01-2021,USA,Zeeshan,100.0,
1,01-12-2022,U.S.A.,Zeeeshan,200.0,150.0
2,12-01-2022,America,Ali,,300.0
3,12-01-2021,United States,alii,200.0,150.0


## Naming Convention

In [56]:
# Every known alias collapses onto the single canonical spelling.
mapping = dict.fromkeys(['USA', 'U.S.A.', 'America'], 'United States')

# Values that are not keys of the mapping pass through unchanged.
df['country'] = df['country'].replace(to_replace=mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,12-01-2021,United States,Zeeshan,100.0,
1,01-12-2022,United States,Zeeeshan,200.0,150.0
2,12-01-2022,United States,Ali,,300.0
3,12-01-2021,United States,alii,200.0,150.0


## Typo Errors

In [57]:
# Map each known misspelling to its corrected form; other names are left as-is.
df['name'] = df['name'].replace(
    to_replace={
        'Zeeeshan': 'Zeeshan',
        'alii': 'Ali',
    }
)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,12-01-2021,United States,Zeeshan,100.0,
1,01-12-2022,United States,Zeeshan,200.0,150.0
2,12-01-2022,United States,Ali,,300.0
3,12-01-2021,United States,Ali,200.0,150.0


## Duplicates

In [52]:
# Keep only the first row seen for each name; later repeats are discarded
# regardless of what their other columns contain.
df = df.drop_duplicates(subset=['name'], keep='first')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,12-01-2021,United States,Zeeshan,100.0,
2,12-01-2022,United States,Ali,,300.0


In [59]:
# Remove rows whose 2021 sales are less than or equal to their 2020 sales.
# A comparison involving a missing value evaluates to False, so rows with a
# missing figure in either year are retained.
# A boolean mask is used rather than dropping by index label, so the filter
# stays row-accurate even when the index contains repeated labels.
df = df.loc[~df['sales_2021'].le(df['sales_2020'])]
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,12-01-2021,United States,Zeeshan,100.0,
2,12-01-2022,United States,Ali,,300.0
