# Data Validation

In [78]:
# Import Libraries

# data manipulation and analysis
import pandas as pd

# multi-dimensional arrays and matrices
# mathematical functions
import numpy as np

# parsing & processing Python Source Code
# convert strings of Python code into executable code
import ast # Abstract Syntax Trees (AST) module

# data visualization
import matplotlib.pyplot as plt # creating static, animated, and interactive visualizations
import seaborn as sns # interface for drawing & statistical graphics
import mplcursors # interactive data cursors
import plotly.express as px

# database adapter for Python
import psycopg2 # allows interaction with PostgreSQL
import pandas as pd

# probability distributions and statistical functions
from scipy.stats import norm 
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [79]:
# Load Data
df_1 = pd.read_csv(r'C:\Users\ashwi\Documents\Data Analytics\Portfolio\IGN VIdeo Game Rarings\SQL\sql_cleaned_games.csv', index_col=False, delimiter=',')

## Data Inspection

### Score

In [80]:
# find min score min and max
min_score = df_1['score'].min()
max_score = df_1['score'].max()

print(f'minimum score: {min_score}')
print(f'maximum score: {max_score}')

minimum score: 0.5
maximum score: 10.0


In [81]:
## Data Integrity

invalid_score = df_1[df_1['score'] < 0]
if not invalid_score.empty:
    print("Invalid score values found:")
    print(invalid_score)

### Release Year

In [82]:
# find min release year and max
min_rel_yr = df_1['release_year'].min()
max_rel_yr = df_1['release_year'].max()

print(f'minimum release year: {min_rel_yr}')
print(f'maximum release year: {max_rel_yr}')

minimum release year: 1970
maximum release year: 2016


In [83]:
invalid_release_year = df_1[df_1['release_year'] < 0]
if not invalid_release_year.empty:
    print("Invalid release_year values found:")
    print(invalid_release_year)

### Release Month

In [84]:
# find min release year and max
min_rel_mon = df_1['release_month'].min()
max_rel_mon = df_1['release_month'].max()

print(f'minimum release month: {min_rel_mon}')
print(f'maximum release month: {max_rel_mon}')

minimum release month: 1
maximum release month: 12


In [85]:
invalid_release_month = df_1[df_1['release_month'] < 0]
if not invalid_release_month.empty:
    print("Invalid release_month values found:")
    print(invalid_release_month)

### Release Day

In [86]:
# find min release year and max
min_rel_day = df_1['release_day'].min()
max_rel_day = df_1['release_day'].max()

print(f'minimum release day: {min_rel_day}')
print(f'maximum release day: {max_rel_day}')

minimum release day: 1
maximum release day: 31


In [87]:
invalid_release_day = df_1[df_1['release_day'] < 0]
if not invalid_release_day.empty:
    print("Invalid release_day values found:")
    print(invalid_release_day)

## Duplicates

In [88]:
# Count duplicate rows
num_duplicates = df_1.duplicated().sum()

num_duplicates

48

In [89]:
# Find duplicate rows
duplicate_rows = df_1[df_1.duplicated(keep=False)]
duplicate_rows

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day
173,Call of Duty: Modern Warfare 2 (Hardened Edition),9.5,Amazing,PlayStation 3,Shooter,2009,11,10
174,Call of Duty: Modern Warfare 2 (Hardened Edition),9.5,Amazing,PlayStation 3,Shooter,2009,11,10
1662,Steel Talons,9.0,Amazing,Lynx,"Flight, Action",1999,7,6
1663,Steel Talons,9.0,Amazing,Lynx,"Flight, Action",1999,7,6
1835,WipEout XL,9.0,Amazing,PlayStation,Racing,1996,11,26
...,...,...,...,...,...,...,...,...
18109,Raven Project,3.0,Awful,PlayStation,Action,1996,11,25
18355,Big League Sports,2.3,Painful,Wii,"Sports, Compilation",2008,11,25
18356,Big League Sports,2.3,Painful,Wii,"Sports, Compilation",2008,11,25
18616,Revolution X,1.0,Unbearable,PlayStation,Action,1996,11,25


In [90]:
# Drop duplicate rows within the duplicate_rows DataFrame
unique_within_duplicates = duplicate_rows.drop_duplicates()

# Find rows that are unique within the duplicate_rows DataFrame
# These are the rows that appear only once after dropping duplicates
unique_rows_from_duplicates = unique_within_duplicates[
    ~unique_within_duplicates.duplicated(keep=False)
]

# Display the result
unique_rows_from_duplicates

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day
173,Call of Duty: Modern Warfare 2 (Hardened Edition),9.5,Amazing,PlayStation 3,Shooter,2009,11,10
1662,Steel Talons,9.0,Amazing,Lynx,"Flight, Action",1999,7,6
1835,WipEout XL,9.0,Amazing,PlayStation,Racing,1996,11,26
2264,Need for Speed Underground,8.8,Great,GameCube,Racing,2003,11,14
2541,Scarface: The World is Yours,8.7,Great,PC,Action,2006,10,6
2593,Tobal No. 1,8.7,Great,PlayStation,Fighting,1996,11,26
2984,Dr. Muto,8.5,Great,Xbox,Platformer,2002,11,11
3082,Final Fantasy I & II: Dawn of Souls,8.5,Great,Game Boy Advance,"RPG, Compilation",2004,11,30
3667,The Sims,8.5,Great,Xbox,Simulation,2003,3,24
5509,Guitar Hero On Tour: Decades (Game Only Edition),8.0,Great,Nintendo DS,Music,2008,11,17


In [91]:
unique_rows_from_duplicates.shape

(48, 8)

In [96]:
df_1.dtypes

title             object
score            float64
score_phrase      object
platform          object
genre             object
release_year       int64
release_month      int64
release_day        int64
dtype: object

In [92]:
# Step 1: Find duplicate rows
duplicate_rows = df_1[df_1.duplicated(keep=False)]

# Step 2: Find unique rows within the duplicate rows
unique_within_duplicates = duplicate_rows.drop_duplicates()

# Step 3: Remove unique rows within the duplicate rows from the original DataFrame
df_2 = df_1[~df_1.isin(unique_within_duplicates)].dropna()

# Display the result
df_2

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day
0,Checkered Flag,10.0,Masterpiece,Lynx,Racing,1999.0,7.0,6.0
1,Chrono Trigger,10.0,Masterpiece,Wii,"Action, RPG",2011.0,5.0,25.0
2,Dragon Warrior III,10.0,Masterpiece,Game Boy Color,RPG,2001.0,7.0,20.0
3,Grand Theft Auto IV,10.0,Masterpiece,Xbox 360,"Action, Adventure",2008.0,4.0,25.0
4,Grand Theft Auto IV,10.0,Masterpiece,PlayStation 3,"Action, Adventure",2008.0,4.0,25.0
...,...,...,...,...,...,...,...,...
18620,The Crow: City of Angels,1.0,Unbearable,PlayStation,Action,1997.0,3.0,11.0
18621,The Simpsons Wrestling,1.0,Unbearable,PlayStation,Action,2001.0,4.0,6.0
18622,Action Girlz Racing,0.8,Disaster,Wii,Racing,2009.0,2.0,11.0
18623,Extreme PaintBrawl,0.7,Disaster,PC,Action,1998.0,10.0,29.0


In [95]:
df_2.dtypes

title             object
score            float64
score_phrase      object
platform          object
genre             object
release_year     float64
release_month    float64
release_day      float64
dtype: object

In [70]:
df_2.shape

(18541, 8)

In [71]:
df_2.duplicated().sum()

0

## Data Integrity

In [72]:
df_2.columns

Index(['title', 'score', 'score_phrase', 'platform', 'genre', 'release_year',
       'release_month', 'release_day'],
      dtype='object')

In [73]:
# Display values not containing following characters
df_2[~df_2['title'].str.contains(r'[a-zA-Z0-9,.()\s]')]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [74]:
df_2[~df_2['score_phrase'].str.contains(r'[a-zA-Z0-9,.()\s]')]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [75]:
df_2[~df_2['platform'].str.contains(r'[a-zA-Z0-9,.()\s]')]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [76]:
df_2[~df_2['genre'].str.contains(r'[a-zA-Z0-9,.()\s]')]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [77]:
df_2.dtypes

title             object
score            float64
score_phrase      object
platform          object
genre             object
release_year     float64
release_month    float64
release_day      float64
dtype: object

In [97]:
df_2.columns

Index(['title', 'score', 'score_phrase', 'platform', 'genre', 'release_year',
       'release_month', 'release_day'],
      dtype='object')

In [98]:
# Convert release_year, release_month, and release_day columns to integers
df_2['release_year'] = df_2['release_year'].astype(int)
df_2['release_month'] = df_2['release_month'].astype(int)
df_2['release_day'] = df_2['release_day'].astype(int)

In [99]:
df_2.dtypes

title             object
score            float64
score_phrase      object
platform          object
genre             object
release_year       int32
release_month      int32
release_day        int32
dtype: object

In [100]:
df_2.head(1)

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day
0,Checkered Flag,10.0,Masterpiece,Lynx,Racing,1999,7,6


In [102]:
# Display non-integer numbers
df_2[~(df_2['release_year'].astype(int) == df_2['release_year'])]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [103]:
df_2[~(df_2['release_month'].astype(int) == df_2['release_month'])]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


In [104]:
df_2[~(df_2['release_day'].astype(int) == df_2['release_day'])]

Unnamed: 0,title,score,score_phrase,platform,genre,release_year,release_month,release_day


## Outliers