# 1. SETUP & PACKAGE INSTALLATION

In [14]:
cat("Setting up environment...\n")

# Packages this notebook attaches below.
required_packages <- c("ggplot2", "dplyr", "tidyr")

# Check availability with requireNamespace() rather than require():
# require() would attach each package immediately and print its startup
# messages, which defeats the suppressPackageStartupMessages() call below.
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, quiet = TRUE)
}

# library() errors if a package is still unavailable, so a failed install
# is reported here instead of being silently ignored.
suppressPackageStartupMessages({
  library(ggplot2)
  library(dplyr)
  library(tidyr)
})

cat("✅ Packages loaded successfully\n\n")


Setting up environment...
✅ Packages loaded successfully



# 2. DATA LOADING

In [15]:
# Read the CSV from the Kaggle input directory into `data`.
cat("Loading dataset...\n")
dataset_path <- "/kaggle/input/online-food-dataset/onlinefoods.csv"
data <- read.csv(file = dataset_path)
cat("✅ Dataset loaded:", nrow(data), "rows ×", ncol(data), "columns\n\n")

# Compact per-column overview: name, type and the first few values.
cat("=== DATASET PREVIEW ===\n")
dplyr::glimpse(data)
cat("\n")

Loading dataset...
✅ Dataset loaded: 388 rows × 13 columns

=== DATASET PREVIEW ===
Rows: 388
Columns: 13
$ Age                        <int> 20, 24, 22, 22, 22, 27, 22, 24, 23, 23, 22,…
$ Gender                     <chr> "Female", "Female", "Male", "Female", "Male…
$ Marital.Status             <chr> "Single", "Single", "Single", "Single", "Si…
$ Occupation                 <chr> "Student", "Student", "Student", "Student",…
$ Monthly.Income             <chr> "No Income", "Below Rs.10000", "Below Rs.10…
$ Educational.Qualifications <chr> "Post Graduate", "Graduate", "Post Graduate…
$ Family.size                <int> 4, 3, 3, 6, 4, 2, 3, 3, 2, 4, 5, 2, 5, 4, 5…
$ latitude                   <dbl> 12.9766, 12.9770, 12.9551, 12.9473, 12.9850…
$ longitude                  <dbl> 77.5993, 77.5773, 77.6593, 77.5616, 77.5533…
$ P

# 3. DATA CLEANING: COLUMN NAMES

In [16]:

# Normalise column names to lower snake_case: every literal "." becomes "_"
# and all letters are lower-cased. The original names are kept so the
# before/after mapping can be printed.
cat("Standardizing column names...\n")
original_names <- names(data)
cleaned_names <- gsub(".", "_", tolower(original_names), fixed = TRUE)
names(data) <- cleaned_names

cat("Column name standardization complete:\n")
name_comparison <- data.frame(Original = original_names, Cleaned = cleaned_names)
print(name_comparison)
cat("\n")

Standardizing column names...
Column name standardization complete:
                     Original                    Cleaned
1                         Age                        age
2                      Gender                     gender
3              Marital.Status             marital_status
4                  Occupation                 occupation
5              Monthly.Income             monthly_income
6  Educational.Qualifications educational_qualifications
7                 Family.size                family_size
8                    latitude                   latitude
9                   longitude                  longitude
10                   Pin.code                   pin_code
11                     Output                     output
12                   Feedback                   feedback
13                          X                          x



# 4. MISSING VALUES ANALYSIS FUNCTION

In [17]:
#' Report missing-value indicators for every column of a data frame
#'
#' Prints a per-column table and a short summary, then returns the table.
#' Three kinds of "missing" are counted separately for each column:
#' real `NA`s, empty strings (`""`), and the text placeholders
#' "NA", "N/A", "NULL" and "NAN" (matched case-insensitively).
#'
#' @param df A data frame.
#' @return A data frame with one row per column of `df` (row names are the
#'   column names), sorted by `Total_Missing_Percent` in descending order.
#'   Columns: `Column`, `Type`, `NA_Count`, `Empty_String_Count`,
#'   `NA_String_Count`, `NA_Percent`, `Total_Missing`,
#'   `Total_Missing_Percent`.
analyze_missing_values <- function(df) {
  cat("ANALYZING MISSING VALUES\n")
  cat(paste(rep("=", 50), collapse = ""), "\n\n")

  n_rows <- nrow(df)
  na_like_tokens <- c("NA", "N/A", "NULL", "NAN")

  # Share of rows as a percentage; defined as 0 for an empty data frame so
  # the report never contains NaN from a 0 / 0 division.
  percent_of_rows <- function(count, digits) {
    if (n_rows == 0) {
      return(rep(0, length(count)))
    }
    round(count / n_rows * 100, digits)
  }

  # Calculate different types of missing values.
  # class() may return several values for one column (e.g. ordered factors,
  # POSIXct), so the classes are collapsed into a single label per column;
  # vapply() guarantees one value per column for every statistic.
  missing_stats <- data.frame(
    Column = names(df),
    Type = vapply(
      df,
      function(x) paste(class(x), collapse = "/"),
      character(1)
    ),
    NA_Count = colSums(is.na(df)),
    # Compare as text so non-character columns are handled uniformly.
    Empty_String_Count = vapply(
      df,
      function(x) sum(as.character(x) == "", na.rm = TRUE),
      integer(1)
    ),
    # %in% never yields NA, and a real NA does not match the string "NA",
    # so real NAs are not counted a second time here.
    NA_String_Count = vapply(
      df,
      function(x) sum(toupper(as.character(x)) %in% na_like_tokens),
      integer(1)
    )
  )

  # Calculate percentages
  missing_stats$NA_Percent <- percent_of_rows(missing_stats$NA_Count, 2)
  missing_stats$Total_Missing <- missing_stats$NA_Count +
    missing_stats$Empty_String_Count +
    missing_stats$NA_String_Count
  missing_stats$Total_Missing_Percent <-
    percent_of_rows(missing_stats$Total_Missing, 2)

  # Sort by missing percentage, highest first
  missing_stats <- missing_stats[order(-missing_stats$Total_Missing_Percent), ]

  # Display results
  print(missing_stats)

  # Summary statistics. The row-level counts come from complete.cases(),
  # which looks at real NAs only (not empty strings or text placeholders).
  n_complete <- sum(complete.cases(df))
  n_incomplete <- n_rows - n_complete
  cat("\nSUMMARY STATISTICS\n")
  cat(paste(rep("-", 30), collapse = ""), "\n")
  cat("Complete cases (no missing values):",
      n_complete, "rows\n")
  cat("Rows with at least one missing value:",
      n_incomplete, "rows (",
      percent_of_rows(n_incomplete, 1), "%)\n")
  cat("Total missing values (all types):",
      sum(missing_stats$Total_Missing), "\n")

  missing_stats
}

# EXECUTE MISSING VALUES ANALYSIS
# ============================================
# Run the report on the loaded dataset; the call prints the per-column table
# and summary, and the returned table is kept in `missing_analysis`.
missing_analysis <- analyze_missing_values(data)

ANALYZING MISSING VALUES

                                               Column      Type NA_Count
age                                               age   integer        0
gender                                         gender character        0
marital_status                         marital_status character        0
occupation                                 occupation character        0
monthly_income                         monthly_income character        0
educational_qualifications educational_qualifications character        0
family_size                               family_size   integer        0
latitude                                     latitude   numeric        0
longitude                                   longitude   numeric        0
pin_code                                     pin_code   integer        0
output                                         output character        0
feedback                                     feedback character        0
x                        

# 5. DUPLICATE RECORDS ANALYSIS

In [18]:
# Count rows that exactly repeat an earlier row; the first occurrence of
# each repeated row is not counted.
exact_duplicates <- sum(duplicated(data))
cat("Exact duplicated rows: ", exact_duplicates, "\n")

if (exact_duplicates > 0) {
  # Scanning from both ends keeps every member of a duplicate group,
  # including the first occurrence.
  duplicated_rows <- data[
    duplicated(data) | duplicated(data, fromLast = TRUE),
  ]
  cat(
    "Percentage of data that are duplicates: ",
    round((exact_duplicates / nrow(data)) * 100, 2),
    "%\n"
  )
}

# Unique vs. duplicate row counts, laid out for plotting
dup_data <- data.frame(
  Type = c("Unique", "Duplicate"),
  Count = c(nrow(data) - exact_duplicates, exact_duplicates)
)
head(dup_data)

Exact duplicated rows:  103 
Percentage of data that are duplicates:  26.55 %


Unnamed: 0_level_0,Type,Count
Unnamed: 0_level_1,<chr>,<int>
1,Unique,285
2,Duplicate,103


# 6. DATA TYPE ANALYSIS

In [19]:
# Tabulate how many columns share each class.
# class() can return several values for a single column (e.g. an ordered
# factor), in which case sapply() would return a list and table() would no
# longer count one label per column. Collapsing the classes into one label
# keeps the result a plain character vector with one entry per column.
data_types <- vapply(
  data,
  function(column) paste(class(column), collapse = "/"),
  character(1)
)
type_summary <- as.data.frame(table(data_types))
names(type_summary) <- c("data_type", "count")

# Display type distribution
print(type_summary)

  data_type count
1 character     8
2   integer     3
3   numeric     2
