<a href="https://colab.research.google.com/github/booluckgmie/training/blob/main/Data_Summaries_in_Python_and_R_in_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import pandas as pd

df = pd.read_csv('https://github.com/dosm-malaysia/data-open/raw/main/datasets/census/census_district.csv')
df = df.iloc[:,:10]
df.head()

Unnamed: 0,state,code_state,district,code_district,code_state_district,year,population_total,area_km2,population_growth,housing_total
0,Johor,1,Batu Pahat,1,1_1,1970,249596.0,1966,,
1,Johor,1,Batu Pahat,1,1_1,1980,274625.0,1966,1.0,
2,Johor,1,Batu Pahat,1,1_1,1991,294056.0,1966,0.7,66791.0
3,Johor,1,Batu Pahat,1,1_1,2000,336509.0,1966,1.3,87184.0
4,Johor,1,Batu Pahat,1,1_1,2010,401902.0,1966,1.8,107904.0


## Python Code

In [40]:
import pandas as pd
import numpy as np

def summarize_data(data: pd.DataFrame, columns: list) -> pd.DataFrame:
    summary = pd.DataFrame()

    for col in columns:
        col_data = data[col]

        if pd.api.types.is_numeric_dtype(col_data):
            col_summary = pd.DataFrame({
                'Column': [col],
                'Data Type': ['Numeric'],
                'Count': [col_data.count()],
                'Mean': [col_data.mean()],
                'Median': [col_data.median()],
                'Min': [col_data.min()],
                'Max': [col_data.max()],
                'Standard Deviation': [col_data.std()],  # Added line for standard deviation
                '25%': [col_data.quantile(0.25)],
                '50%': [col_data.quantile(0.5)],
                '75%': [col_data.quantile(0.75)],
                'Missing': [col_data.isna().sum()],
                '% Missing': [col_data.isna().mean()],
                '% Complete': [col_data.notna().mean()],
                'NAN': [col_data.isnull().sum()],
                'INF': [col_data.isin([np.inf, -np.inf]).sum()],
                'Unique': [col_data.nunique()],
            })

        elif pd.api.types.is_datetime64_any_dtype(col_data):
            col_summary = pd.DataFrame({
                'Column': [col],
                'Data Type': ['Date/Time'],
                'Count': [col_data.count()],
                'Earliest': [col_data.min()],
                'Latest': [col_data.max()],
                'Missing': [col_data.isna().sum()],
                '% Missing': [col_data.isna().mean()],
                '% Complete': [col_data.notna().mean()],
                'NAN': [col_data.isnull().sum()],
                'INF': [col_data.isin([np.inf, -np.inf]).sum()],
                'Unique': [col_data.nunique()],
            })

        else:
            col_summary = pd.DataFrame({
                'Column': [col],
                'Data Type': ['Categorical'],
                'Count': [col_data.count()],
                'Categories': [",".join(col_data.dropna().astype(str).unique())],
                'Mode': [col_data.mode().values[0]],
                'Missing': [col_data.isna().sum()],
                '% Missing': [col_data.isna().mean()],
                '% Complete': [col_data.notna().mean()],
                'NAN': [col_data.isnull().sum()],
                'INF': [col_data.isin([np.inf, -np.inf]).sum()],
                'Unique': [col_data.nunique()],
            })

        summary = pd.concat([summary, col_summary], ignore_index=True)

    return summary

# Example usage:
# data = pd.read_csv("your_data.csv")
# columns_to_summarize = ['column1', 'column2', 'column3']
# summary_result = summarize_data(data, columns_to_summarize)
# print(summary_result)


In [41]:
summary_result = summarize_data(df, df.columns)
summary_result

Unnamed: 0,Column,Data Type,Count,Categories,Mode,Missing,% Missing,% Complete,NAN,INF,Unique,Mean,Median,Min,Max,Standard Deviation,25%,50%,75%
0,state,Categorical,960,"Johor,Kedah,Kelantan,Melaka,Negeri Sembilan,Pa...",Sarawak,0,0.0,1.0,0,0,16,,,,,,,,
1,code_state,Numeric,960,,,0,0.0,1.0,0,0,16,8.71875,10.0,1.0,16.0,4.306561,5.0,10.0,13.0
2,district,Categorical,960,"Batu Pahat,Johor Bahru,Kluang,Kota Tinggi,Mers...",Alor Gajah,0,0.0,1.0,0,0,160,,,,,,,,
3,code_district,Numeric,960,,,0,0.0,1.0,0,0,40,10.55,7.0,1.0,40.0,9.632323,3.75,7.0,14.0
4,code_state_district,Categorical,960,"1_1,1_2,1_3,1_4,1_5,1_6,1_7,1_8,1_9,1_10,2_1,2...",10_1,0,0.0,1.0,0,0,160,,,,,,,,
5,year,Numeric,960,,,0,0.0,1.0,0,0,6,1995.166667,1995.5,1970.0,2020.0,17.042342,1980.0,1995.5,2010.0
6,population_total,Numeric,809,,,151,0.157292,0.842708,151,0,809,152272.687268,85952.0,5746.0,2298130.0,222838.201191,42035.0,85952.0,166685.0
7,area_km2,Numeric,960,,,0,0.0,1.0,0,0,154,2065.125,1295.5,40.0,19050.0,2407.851294,703.75,1295.5,2440.75
8,population_growth,Numeric,648,,,312,0.325,0.675,312,0,115,1.621142,1.4,-12.1,12.3,2.308294,0.6,1.4,2.525
9,housing_total,Numeric,572,,,388,0.404167,0.595833,388,0,568,46417.72028,23646.5,2604.0,679812.0,74017.260558,12138.75,23646.5,49636.5


## R Programming

In [42]:
# Load the rpy2 extension
%load_ext rpy2.ipython

# Assuming you have a DataFrame called 'df' in Python
# Transfer 'df' from Python to R
%R -i df


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


  for name, values in obj.iteritems():


In [43]:
%%R

# Display the first few rows of the 'df' DataFrame
head(df)

  state code_state   district code_district code_state_district year
0 Johor          1 Batu Pahat             1                 1_1 1970
1 Johor          1 Batu Pahat             1                 1_1 1980
2 Johor          1 Batu Pahat             1                 1_1 1991
3 Johor          1 Batu Pahat             1                 1_1 2000
4 Johor          1 Batu Pahat             1                 1_1 2010
5 Johor          1 Batu Pahat             1                 1_1 2020
  population_total area_km2 population_growth housing_total
0           249596     1966               NaN           NaN
1           274625     1966               1.0           NaN
2           294056     1966               0.7         66791
3           336509     1966               1.3         87184
4           401902     1966               1.8        107904
5           495338     1966               2.1        129486


In [44]:
%%R

library(dplyr)

get_mode <- function(v) {
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

summarize_data <- function(data, columns) {
  summary <- data.frame()

  for (col in columns) {
    col_data <- data[[col]]

    if (is.numeric(col_data)) {
      col_summary <- data.frame(
        Column = col,
        Data_Type = 'Numeric',
        Count = sum(!is.na(col_data)),
        Mean = mean(col_data, na.rm = TRUE),
        Median = median(col_data, na.rm = TRUE),
        Min = min(col_data, na.rm = TRUE),
        Max = max(col_data, na.rm = TRUE),
        Standard_Deviation = sd(col_data, na.rm = TRUE),
        `25%` = quantile(col_data, 0.25, na.rm = TRUE),
        `50%` = quantile(col_data, 0.5, na.rm = TRUE),
        `75%` = quantile(col_data, 0.75, na.rm = TRUE),
        Missing = sum(is.na(col_data)),
        `% Missing` = mean(is.na(col_data)),
        `% Complete` = mean(!is.na(col_data)),
        NAN = sum(is.na(col_data)),
        INF = sum(is.infinite(col_data)),
        Unique = length(unique(col_data))
      )

    } else if (any(class(col_data) == "POSIXct")) {
      col_summary <- data.frame(
        Column = col,
        Data_Type = 'Date/Time',
        Count = sum(!is.na(col_data)),
        Earliest = min(col_data, na.rm = TRUE),
        Latest = max(col_data, na.rm = TRUE),
        Missing = sum(is.na(col_data)),
        `% Missing` = mean(is.na(col_data)),
        `% Complete` = mean(!is.na(col_data)),
        NAN = sum(is.na(col_data)),
        INF = sum(is.infinite(col_data)),
        Unique = length(unique(col_data))
      )

    } else {
      col_summary <- data.frame(
        Column = col,
        Data_Type = 'Categorical',
        Count = sum(!is.na(col_data)),
        Categories = paste(unique(col_data[!is.na(col_data)]), collapse = ','),
        Mode = as.character(get_mode(col_data)),
        Missing = sum(is.na(col_data)),
        `% Missing` = mean(is.na(col_data)),
        `% Complete` = mean(!is.na(col_data)),
        NAN = sum(is.na(col_data)),
        INF = sum(is.infinite(col_data)),
        Unique = length(unique(col_data))
      )
    }

    summary <- bind_rows(summary, col_summary)
  }

  return(summary)
}

# Example usage:
# data <- read.csv("your_data.csv")
# columns_to_summarize <- colnames(data)
# summary_result <- summarize_data(data, columns_to_summarize)
# print(summary_result)


In [45]:
%%R

summary_result <- summarize_data(df, colnames(df))
print(summary_result)

                      Column   Data_Type Count
...1                   state Categorical   960
25%...2           code_state     Numeric   960
...3                district Categorical   960
25%...4        code_district     Numeric   960
...5     code_state_district Categorical   960
25%...6                 year     Numeric   960
25%...7     population_total     Numeric   809
25%...8             area_km2     Numeric   960
25%...9    population_growth     Numeric   648
25%...10       housing_total     Numeric   572
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

## EDA in R

In [33]:
%%R
install.packages("DataExplorer")

(as ‘lib’ is unspecified)












































	‘/tmp/RtmpboZPcE/downloaded_packages’



In [46]:
%%R

# Load the DataExplorer library
library(DataExplorer)

# Assuming df is your dataset
report <- create_report(df)

# Print the report
print(report)



processing file: report.rmd



1/42                                 
2/42 [global_options]                
3/42                                 
4/42 [introduce]                     
5/42                                 
6/42 [plot_intro]                    
7/42                                 
8/42 [data_structure]                
9/42                                 
10/42 [missing_profile]               
11/42                                 
12/42 [univariate_distribution_header]
13/42                                 
14/42 [plot_histogram]                
15/42                                 
16/42 [plot_density]                  
17/42                                 
18/42 [plot_frequency_bar]            
19/42                                 
20/42 [plot_response_bar]             
21/42                                 
22/42 [plot_with_bar]                 
23/42                                 
24/42 [plot_normal_qq]                
25/42                                 
26/42 [plot_response_qq]          





/usr/bin/pandoc +RTS -K512m -RTS /content/report.knit.md --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output /content/report.html --lua-filter /usr/lib/R/site-library/rmarkdown/rmarkdown/lua/pagebreak.lua --lua-filter /usr/lib/R/site-library/rmarkdown/rmarkdown/lua/latex-div.lua --self-contained --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template /usr/lib/R/site-library/rmarkdown/rmd/h/default.html --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable 'mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML' --include-in-header /tmp/RtmpboZPcE/rmarkdown-str1fc4f3b9980.html 


Output created: report.html



[1] 0
