# Evaluate Data Quality Issues in 'users.json'

In [1]:
#Importing Libraries
import pandas as pd
import json
import numpy as np

In [2]:
# The file is read with lines=True, i.e. as one JSON record per line.
df_users = pd.read_json(
    'original_data/users.json',
    lines=True,
)
# Preview the first rows of the raw table.
df_users.head()

Unnamed: 0,_id,active,createdDate,lastLogin,role,signUpSource,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [3]:
# (rows, columns) of the loaded users table.
df_users.shape

(495, 7)

### Let's understand the data type of each column

In [4]:
# Per-column dtype and non-null count; shows which columns are object-typed
# and which contain missing values.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   _id           495 non-null    object
 1   active        495 non-null    bool  
 2   createdDate   495 non-null    object
 3   lastLogin     433 non-null    object
 4   role          495 non-null    object
 5   signUpSource  447 non-null    object
 6   state         439 non-null    object
dtypes: bool(1), object(6)
memory usage: 23.8+ KB


### We have 6 columns of type object; let's check what the first row looks like in each of them

In [5]:
# Show the first record's value for every object-typed column so the raw
# layout of each such column is visible.
object_columns = [
    column_name
    for column_name in df_users.columns
    if df_users[column_name].dtype == 'object'
]
for column_name in object_columns:
    # Label-based lookup of the row labelled 0.
    first_value = df_users[column_name].loc[0]
    print(f"{column_name}: {first_value}\n")

_id: {'$oid': '5ff1e194b6a9d73a3a9f1052'}

createdDate: {'$date': 1609687444800}

lastLogin: {'$date': 1609687537858}

role: consumer

signUpSource: Email

state: WI



### Correcting format for '_id' column and changing the name of '_id' to 'user_id'

In [6]:
# Each raw '_id' cell is a mapping of the form {'$oid': <id string>};
# keep only the id string itself.
df_users['_id'] = [raw_id['$oid'] for raw_id in df_users['_id']]
# Rename by reassignment rather than inplace=True; the column set is the same.
df_users = df_users.rename(columns={'_id': 'user_id'})

### Correcting format for date columns 

In [7]:
# Columns whose raw cells look like {'$date': <epoch milliseconds>} and need
# to be converted to timestamps.
date_columns = ['createdDate', 'lastLogin']
def convert_date(x):
    """Convert one raw date cell into a pandas Timestamp.

    Parameters
    ----------
    x : dict, pandas.Timestamp, or missing value
        A raw cell of the form ``{'$date': <epoch milliseconds>}``, an
        already-converted Timestamp, or a null (None / NaN / NaT).

    Returns
    -------
    pandas.Timestamp or None
        The parsed timestamp. None is returned for missing values and for
        values that cannot be indexed with ``'$date'``.
    """
    # Already converted: hand it back unchanged. Without this, re-running the
    # conversion cell would hit the TypeError branch below for every
    # Timestamp and silently replace the whole column with None.
    # NaT is excluded here so it falls through to the missing-value branch.
    if isinstance(x, pd.Timestamp) and not pd.isnull(x):
        return x
    try:
        if pd.isnull(x):
            return None
        # The raw value is an epoch offset in milliseconds.
        return pd.to_datetime(x['$date'], unit='ms')
    except TypeError:
        # Best effort: values that do not support x['$date'] count as missing.
        return None
# Overwrite each raw date column with its converted values, then preview.
for date_col in date_columns:
    converted = df_users[date_col].apply(convert_date)
    df_users[date_col] = converted
df_users.head()

Unnamed: 0,user_id,active,createdDate,lastLogin,role,signUpSource,state
0,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,2021-01-03 15:25:30.554,2021-01-03 15:25:30.597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,2021-01-03 15:24:04.800,2021-01-03 15:25:37.858,consumer,Email,WI


### Checking if the createdDate comes before lastLogin date

In [8]:
# A row can only violate the ordering rule when both dates are present, so
# rows with a missing date are skipped rather than counted as failures.
# (OR-ing in `.notna()` here, as an earlier version did, makes the check
# pass for every row with a createdDate regardless of the actual ordering.)
both_dates_present = df_users['createdDate'].notna() & df_users['lastLogin'].notna()
created_after_login = both_dates_present & (df_users['createdDate'] > df_users['lastLogin'])
# True only if no comparable row has createdDate later than lastLogin.
date_check = (~created_after_login).all()
date_check

True

The created date is on or before the user's last login, which shows that data quality is good in terms of the date columns.

### Checking number of Missing values

In [9]:
# Table dimensions, as a reference for the per-column counts that follow.
print(df_users.shape)
# Number of null entries in each column.
df_users.isna().sum()

(495, 7)


user_id          0
active           0
createdDate      0
lastLogin       62
role             0
signUpSource    48
state           56
dtype: int64

In [10]:
# dataqualityreport is a third-party package. If it is not installed, run the
# following in its own cell so that it targets the kernel's environment:
#   %pip install dataqualityreport
from dataqualityreport import dqr_table
# Build the per-column quality summary (type, cardinality, missing share, ...).
dqr_table(df_users)

Building summary df...
Constructing box plots...
Spreading hist plots...


Unnamed: 0_level_0,Type,Card *Unique,% Missing Heatmap,% Missing,% Zeros,% Negative,Box Plot,Robust Histogram
Column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
user_id,O,212,,,,,,
active,B,2,,,,,,
createdDate,D,212,,,,,,
lastLogin,D,172,,,,,,
role,O,2,,,,,,
signUpSource,O,2,,,,,,
state,O,8,,,,,,


In [11]:
# List the distinct values (NaN included) of the low-cardinality columns.
low_cardinality_columns = ['active', 'role', 'signUpSource', 'state']
for column_name in low_cardinality_columns:
    print(f"Unique values in {column_name}: {df_users[column_name].unique()}")

Unique values in active: [ True False]
Unique values in role: ['consumer' 'fetch-staff']
Unique values in signUpSource: ['Email' 'Google' nan]
Unique values in state: ['WI' 'KY' 'AL' 'CO' 'IL' nan 'OH' 'SC' 'NH']


### The histogram of the 'state' column shows one dominant state with a large share of the rows; let's check

In [12]:
# Share of all rows per state. The denominator is the full row count, so rows
# with a missing state are included and the shares do not add up to 1.
df_users['state'].value_counts().div(len(df_users))

WI    0.800000
NH    0.040404
AL    0.024242
OH    0.010101
IL    0.006061
KY    0.002020
CO    0.002020
SC    0.002020
Name: state, dtype: float64

80% of the users have signed up from State 'WI'

### Checking redundant/duplicate rows

In [13]:
# Boolean mask flagging every row that exactly repeats an earlier row
# (the first occurrence of each record stays unflagged).
redundant_records = df_users.duplicated()

# Display the redundant records
print(df_users.loc[redundant_records])

                      user_id  active             createdDate  \
1    5ff1e194b6a9d73a3a9f1052    True 2021-01-03 15:24:04.800   
2    5ff1e194b6a9d73a3a9f1052    True 2021-01-03 15:24:04.800   
4    5ff1e194b6a9d73a3a9f1052    True 2021-01-03 15:24:04.800   
5    5ff1e194b6a9d73a3a9f1052    True 2021-01-03 15:24:04.800   
8    5ff1e194b6a9d73a3a9f1052    True 2021-01-03 15:24:04.800   
..                        ...     ...                     ...   
490  54943462e4b07e684157a532    True 2014-12-19 14:21:22.381   
491  54943462e4b07e684157a532    True 2014-12-19 14:21:22.381   
492  54943462e4b07e684157a532    True 2014-12-19 14:21:22.381   
493  54943462e4b07e684157a532    True 2014-12-19 14:21:22.381   
494  54943462e4b07e684157a532    True 2014-12-19 14:21:22.381   

                  lastLogin         role signUpSource state  
1   2021-01-03 15:25:37.858     consumer        Email    WI  
2   2021-01-03 15:25:37.858     consumer        Email    WI  
4   2021-01-03 15:25:37.858     c

We can see that there are 283 duplicate rows out of 495 rows, i.e. more than half (about 57%) of the users table consists of duplicate records.

# Overall, we can conclude that data quality is not good for the users table.

### 1. 283 of the 495 rows are duplicates, which shows that records are being stored without enforcing the primary key (user_id), causing redundancy.

### 2. Three columns ('lastLogin', 'signUpSource' and 'state') have missing values.


In [14]:
# Persist the users table as line-delimited JSON: one record per line.
df_users.to_json(
    'new_data/df_users.json',
    orient='records',
    lines=True,
)