# Create a data quality report

In [5]:
import pandas as pd

# Load the accidents CSV into a DataFrame.
#
# The previously passed `tupleize_cols`, `error_bad_lines` and `warn_bad_lines`
# keyword arguments have been removed from `pd.read_csv` in newer pandas
# releases and raise a TypeError there. They were all being passed at their
# default values, so omitting them keeps the same behaviour (malformed rows
# still raise an error) while letting the cell run on old and new pandas alike.
accidents_data_file = 'accidents.csv'
accidents = pd.read_csv(
    accidents_data_file,
    sep=',',
    index_col=False,         # do not promote the first CSV column to the index
    parse_dates=['Date'],    # parse the 'Date' column as datetimes...
    dayfirst=True,           # ...interpreting ambiguous dates as day-first
    skip_blank_lines=True,
)

# Preview the first rows to confirm the file loaded as expected.
accidents.head()

Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
0,200812D100026,460830,452510,-1.074261,53.965099,12,3,1,1,2008-01-10,...,0,0,4,2,2,0,0,1,1,E01013369
1,2009120000327,460220,446720,-1.084698,53.913137,12,3,1,1,2009-01-01,...,0,0,6,1,1,0,0,2,1,E01013450
2,2009120001621,461460,448010,-1.065564,53.924585,12,3,1,1,2009-01-03,...,0,0,4,1,4,0,0,2,1,E01013365
3,2009120002018,465130,451620,-1.008917,53.956579,12,2,2,1,2009-01-04,...,0,0,4,1,2,0,0,2,1,E01013351
4,2009120002510,460850,452490,-1.07396,53.964917,12,3,2,1,2009-01-05,...,0,8,4,1,1,0,0,1,1,E01013369


In [6]:
# Build a one-column DataFrame that lists every column name found in the
# accidents DataFrame (one row per column name).
accident_column_names = accidents.columns.tolist()
columns = pd.DataFrame(accident_column_names)
columns

Unnamed: 0,0
0,Accident_Index
1,Location_Easting_OSGR
2,Location_Northing_OSGR
3,Longitude
4,Latitude
5,Police_Force
6,Accident_Severity
7,Number_of_Vehicles
8,Number_of_Casualties
9,Date


In [7]:
# Tabulate the dtype of each accidents column: the result is indexed by
# column name and has a single 'Data Type' column.
column_dtypes = accidents.dtypes
data_types = column_dtypes.to_frame(name='Data Type')
data_types

Unnamed: 0,Data Type
Accident_Index,object
Location_Easting_OSGR,int64
Location_Northing_OSGR,int64
Longitude,float64
Latitude,float64
Police_Force,int64
Accident_Severity,int64
Number_of_Vehicles,int64
Number_of_Casualties,int64
Date,datetime64[ns]


In [9]:
# Count the null entries in every accidents column; the result is indexed by
# column name and has a single 'Missing Values' column.
null_mask = accidents.isnull()
missing_data_counts = null_mask.sum().to_frame(name='Missing Values')
missing_data_counts

Unnamed: 0,Missing Values
Accident_Index,0
Location_Easting_OSGR,0
Location_Northing_OSGR,0
Longitude,0
Latitude,0
Police_Force,0
Accident_Severity,0
Number_of_Vehicles,0
Number_of_Casualties,0
Date,0


In [10]:
# Count the non-null entries in every accidents column; the result is indexed
# by column name and has a single 'Present Values' column.
non_null_per_column = accidents.count()
present_data_counts = non_null_per_column.to_frame(name='Present Values')
present_data_counts

Unnamed: 0,Present Values
Accident_Index,5709
Location_Easting_OSGR,5709
Location_Northing_OSGR,5709
Longitude,5709
Latitude,5709
Police_Force,5709
Accident_Severity,5709
Number_of_Vehicles,5709
Number_of_Casualties,5709
Date,5709


In [11]:
# Count the distinct (non-null) values in every accidents column; the result
# is indexed by column name and has a single 'Unique Values' column.
#
# DataFrame.nunique() computes all per-column counts in one call. This avoids
# growing a DataFrame one row at a time with `.loc` inside a Python loop, and
# gives the column an integer dtype rather than a generic object dtype.
unique_value_counts = accidents.nunique().to_frame(name='Unique Values')
unique_value_counts

Unnamed: 0,Unique Values
Accident_Index,5709
Location_Easting_OSGR,2755
Location_Northing_OSGR,2721
Longitude,4966
Latitude,4840
Police_Force,1
Accident_Severity,3
Number_of_Vehicles,7
Number_of_Casualties,8
Date,3127


In [21]:
# Assemble the report: start from the dtype table and join each count table
# onto it in turn, aligning rows on the index the tables share.
data_quality_report = data_types
for count_table in (present_data_counts, missing_data_counts, unique_value_counts):
    data_quality_report = data_quality_report.join(count_table)
data_quality_report

Unnamed: 0,Data Type,Present Values,Missing Values,Unique Values
Accident_Index,object,5709,0,5709
Location_Easting_OSGR,int64,5709,0,2755
Location_Northing_OSGR,int64,5709,0,2721
Longitude,float64,5709,0,4966
Latitude,float64,5709,0,4840
Police_Force,int64,5709,0,1
Accident_Severity,int64,5709,0,3
Number_of_Vehicles,int64,5709,0,7
Number_of_Casualties,int64,5709,0,8
Date,datetime64[ns],5709,0,3127


In [23]:
# Show the final report: a heading, the number of rows in the accidents
# DataFrame, and then the per-column quality table as the cell's output.
total_records = len(accidents.index)
print("\nData Quality Report")
print("Total records: {}".format(total_records))
data_quality_report



Data Quality Report
Total records: 5709


Unnamed: 0,Data Type,Present Values,Missing Values,Unique Values
Accident_Index,object,5709,0,5709
Location_Easting_OSGR,int64,5709,0,2755
Location_Northing_OSGR,int64,5709,0,2721
Longitude,float64,5709,0,4966
Latitude,float64,5709,0,4840
Police_Force,int64,5709,0,1
Accident_Severity,int64,5709,0,3
Number_of_Vehicles,int64,5709,0,7
Number_of_Casualties,int64,5709,0,8
Date,datetime64[ns],5709,0,3127
