# **LINKEDIN ANALYSIS | EXTENSIVE FEATURE ENGINEERING + DATA VISUALIZATIONS**

# Importing Libraries

In [None]:
library(tidyverse)

# Exploring Data

In [None]:
# Load the raw LinkedIn job postings and inspect the column types
df <- read.csv(file = "/kaggle/input/linkedin-job-data/linkdin_Job_data.csv")
df %>% str()

**We are going to select columns that will be used for this analysis**

In [None]:
# Drop identifier, link and free-text columns that the analysis does not use
df <- select(
  df,
  -c(
    job_ID, company_id, company_name, Hiring_person,
    hiring_person_link, job_details, Column1, alumni
  )
)

**Let's see NA values**

In [None]:
# Number of NA entries in each column
vapply(df, function(column) sum(is.na(column)), integer(1))

**We need to check uniques as well**

In [None]:
# Number of distinct values (NA included) in each column
vapply(df, function(column) length(unique(column)), integer(1))

In [None]:
# Distinct values found in the full_time_remote column
df$full_time_remote %>% unique()

**This column mixes values of different kinds (seniority level, number of employees and work type), so we will work without it.**
**Below we check how many jobs contain these keywords**

In [None]:
# Count postings whose lower-cased title matches any dev/data keyword.
# `stat(ist)?ician` matches the correct spelling "statistician" as well as the
# "statician" misspelling; the plain "statician" pattern never matched
# "statistician".
df$job %>%
  str_to_lower() %>%
  str_detect("^bi|analytics|stat(ist)?ician|tableau|powerbi|power bi|dev|developer|data") %>%
  sum()

# Feature Engineering
**We are going to divide these jobs into three main categories, because we have so many job titles**

In [None]:
# Classify every posting as "dev", "data" or "other" from its title.
# Rules are evaluated in order, so a title matching both groups becomes "dev".
# The title is lower-cased once and reused by both rules.
job_title <- str_to_lower(df$job)

df$job_field <- case_when(
  str_detect(job_title, "developer|dev") ~ "dev",
  # `stat(ist)?ician` matches "statistician" as well as the "statician"
  # misspelling; the plain "statician" pattern never matched "statistician".
  # NOTE(review): `^bi` matches any title that starts with "bi", not only the
  # BI abbreviation - confirm this is intended.
  str_detect(job_title, "^bi|analytics|stat(ist)?ician|tableau|powerbi|power bi|data") ~ "data",
  TRUE ~ "other"
)

head(df)

**Now we are going to split posted_day_ago into an amount and a unit, so that we can convert it to minutes later**

In [None]:
# Split posted_day_ago on non-alphanumeric characters and keep only the first
# two pieces (amount and unit).
# extra = "drop" discards any further pieces and fill = "right" pads short
# values with NA; this yields the same columns as the defaults but without a
# warning for every affected row.
df <- separate(
  data = df,
  col = posted_day_ago,
  into = c("no_posted_day_ago", "unit_posted_day_ago"),
  extra = "drop",
  fill = "right"
)
head(df)

In [None]:
# Minutes represented by one unit, keyed by the first three letters of the unit
minutes_per_unit <- c(
  hou = 60,
  min = 1,
  day = 24 * 60,
  wee = 7 * 24 * 60,
  sec = 1 / 60
)

# Units whose prefix is not in the table (or is missing) become NA
unit_prefix <- str_sub(df$unit_posted_day_ago, start = 1, end = 3)
df$unit_posted_day_ago <- unname(minutes_per_unit[unit_prefix])

# Age of the posting in minutes = amount * minutes per unit
df$minutes_posted_ago <- as.numeric(df$no_posted_day_ago) * df$unit_posted_day_ago

**Now we are going to transform followers to numeric**

In [None]:
# Strip punctuation, then the trailing " followers"/" follower" label, and
# convert what is left to a number
followers_text <- str_replace_all(
  df$linkedin_followers,
  pattern = "[[:punct:]]",
  replacement = ""
)
followers_text <- str_replace(
  followers_text,
  pattern = " followers| follower",
  replacement = ""
)
df$linkedin_followers <- as.numeric(followers_text)

**We need also to separate the number of employees from company field**

In [None]:
# Split no_of_employ at the "·" separator into size and company field
df <- df %>%
  separate(
    col = no_of_employ,
    into = c("no_employees", "company_field"),
    sep = "·"
  )
df %>% head()

**We are going to clean no_employees until we can convert it into a factor**

In [None]:
# Distinct raw values of no_employees, and how many there are
df$no_employees %>% unique()
df$no_employees %>% unique() %>% length()

In [None]:
# Distinct values left once every space is removed
length(unique(str_remove_all(df$no_employees, pattern = " ")))

In [None]:
# Remove spaces and the "employees" label from no_employees
df$no_employees <- df$no_employees %>%
  str_remove_all(pattern = " ") %>%
  str_remove_all(pattern = "employees")

**Here we are going to replace empty values with NA in two different columns**

In [None]:
# Turn empty strings into NA in no_employees and work_type.
# `%in%` never yields NA, so this also works when a column already contains
# NA; a logical row index containing NA is rejected by data-frame assignment.
df$no_employees[df$no_employees %in% ""] <- NA
df$work_type[df$work_type %in% ""] <- NA

**Let's look another time to see what's next**

In [None]:
# Preview the first rows
df %>% head()

**Let's separate location in three columns**

In [None]:
# Split location at ", " into city, state and country
df <- df %>%
  separate(
    col = location,
    into = c("city", "state", "country"),
    sep = ", "
  )
df %>% head()

**Let's delete some columns we are not going to use anymore**

In [None]:
# Remove the intermediate and unused columns
df <- select(
  df,
  -c(no_posted_day_ago, unit_posted_day_ago, full_time_remote, country)
)

**Now we will work with the NAs left after feature engineering**

In [None]:
# Number of NA entries in each column
vapply(df, function(column) sum(is.na(column)), integer(1))

**Above we see inconsistent data, because no_of_application contains "hours" as its mode, so we have to impute NA for these values and after that try to impute numbers on this column**

In [None]:
# Values that cannot be parsed as numbers become NA
df[["no_of_application"]] <- as.numeric(df[["no_of_application"]])

**Finding mode values to replace NA's**

In [None]:
# Most frequent non-NA value of `x`, returned as a character string.
#
# `table()` ignores NA, so missing values never count as the mode. When `x`
# has no non-NA values the result is NA_character_ rather than NULL, so the
# function always returns exactly one string.
# On ties, the value that comes first in `table()` order is returned.
fmode <- function(x) {
  counts <- table(x)
  if (length(counts) == 0L) {
    return(NA_character_)
  }
  names(counts)[which.max(counts)]
}

# Mode of every column, shown as a one-column table
mode_vec <- df %>% sapply(fmode)
data.frame(mode = mode_vec)

**Replacing NA's with mode**

In [None]:
# Preview the first rows
head(df)

In [None]:
# Replace the NA entries of each column with that column's mode.
# `mode_vec` holds one mode per column, in column order, stored as text.
for (i in seq_along(mode_vec)) {
  missing_rows <- is.na(df[[i]])
  fill_value <- mode_vec[[i]]

  # Skip columns that are complete or that have no usable mode
  if (!any(missing_rows) || length(fill_value) != 1L || is.na(fill_value)) {
    next
  }

  # Convert the text mode back to a number so numeric columns stay numeric
  if (is.numeric(df[[i]])) {
    fill_value <- as.numeric(fill_value)
  }

  df[[i]][missing_rows] <- fill_value
}

**Transforming no_employees in factor**

In [None]:
# Company-size bands listed from smallest to largest
l <- c(
  "1-10", "11-50", "51-200", "201-500",
  "501-1,000", "1,001-5,000", "5,001-10,000", "10,001+"
)
# Values that are not one of the bands become NA
df$no_employees <- df$no_employees %>% factor(levels = l)

**Let's convert our numeric variables which are characters**

In [None]:
# Make sure the three measure columns are numeric
numeric_columns <- c(
  "no_of_application", "linkedin_followers", "minutes_posted_ago"
)
df[numeric_columns] <- lapply(df[numeric_columns], as.numeric)

## **Now our data is good! let's create some visualizations ;)**

# **Data Visualizations**

In [None]:
# Set the default width and height of plots rendered in the notebook
library(repr)
options(
  repr.plot.width = 15,
  repr.plot.height = 10
)

In [None]:
# Grouped bar chart: number of postings per job field, split by work type,
# with the count printed inside each bar.
ggplot(df, aes(job_field, fill = work_type)) +
  geom_bar(position = "dodge") +
  labs(title = "Amount of Jobs per Field", fill = "Work Type") +
  # after_stat(count) replaces the deprecated `..count..` notation
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    size = 5,
    color = "white",
    position = position_dodge(width = 0.9),
    vjust = 1
  ) +
  xlab("") +
  ylab("") +
  theme(
    title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

**We see that for Data field jobs companies prefer professionals on site or hybrid. We can also see that Devs have more flexibility with remote jobs. For other jobs, on site is preferred by companies.**

In [None]:
# Column chart: total applications for each work type, value printed in the bar
df %>%
  group_by(work_type) %>%
  summarise(qtt = sum(no_of_application)) %>%
  ggplot(mapping = aes(x = work_type, y = qtt)) +
  geom_col(fill = "deepskyblue") +
  geom_text(mapping = aes(label = qtt), color = "white", size = 5, vjust = 1) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Total of Applications per Work Type", x = "", y = "") +
  theme(
    title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

**Applicants of course want to work at home.**

In [None]:
# Column chart: total applications for each job field, value printed in the bar
df %>%
  group_by(job_field) %>%
  summarise(qtt = sum(no_of_application)) %>%
  ggplot(mapping = aes(x = job_field, y = qtt)) +
  geom_col(fill = "deepskyblue") +
  geom_text(mapping = aes(label = qtt), color = "white", size = 5, vjust = 1) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Total of Applications per Job Field", x = "", y = "") +
  theme(
    title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

**Dev jobs have almost twice as many applications as Data jobs**

In [None]:
# Horizontal bars: the five states with the most applications, largest on top
df %>%
  group_by(state) %>%
  summarise(qtt = sum(no_of_application)) %>%
  arrange(desc(qtt)) %>%
  head(n = 5) %>%
  # Order the states by their total so the bars are sorted
  mutate(state = reorder(state, qtt)) %>%
  ggplot(mapping = aes(x = qtt, y = state)) +
  geom_col(fill = "cornflowerblue") +
  geom_text(mapping = aes(label = qtt), color = "white", size = 5, hjust = 1) +
  scale_x_continuous(labels = scales::comma) +
  labs(title = "Top 5 - Total of Applications per State", x = "", y = "") +
  theme(
    title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

In [None]:
# Per job field: total applications, number of postings, and their ratio
# (applications per posting).
data_chart <- df %>%
  group_by(job_field) %>%
  summarise(
    applications = sum(no_of_application),
    amount_of_jobs = n()
  ) %>%
  mutate(no_of_applicants_per_job = applications / amount_of_jobs)

# Pie chart of applications per posting; each slice is labelled
# "<field>: <ratio rounded to one decimal>".
pie(
  data_chart$no_of_applicants_per_job,
  labels = paste(
    data_chart$job_field,
    round(data_chart$no_of_applicants_per_job, 1),
    sep = ": "
  ),
  border = "white",
  col = c("blue", "cornflowerblue", "deepskyblue"),
  radius = 1.5,
  cex = 2
)
# The slices are per-posting ratios, not totals, so the title says "Average"
title(main = "Average applicants per Job Field", cex.main = 2)
    

**We see that Data field has more applicants per job**

In [None]:
# Horizontal bars: total applications per company-size band, largest on top
df %>%
  group_by(no_employees) %>%
  summarise(qtt = sum(no_of_application)) %>%
  # Order the bands by their total so the bars are sorted
  mutate(no_employees = reorder(no_employees, qtt)) %>%
  ggplot(mapping = aes(x = qtt, y = no_employees)) +
  geom_col(fill = "cornflowerblue") +
  geom_text(mapping = aes(label = qtt), color = "white", size = 5, hjust = 1) +
  scale_x_continuous(labels = scales::comma) +
  labs(
    title = "Total of Applications per Company Size",
    x = "",
    y = "Amount of Employees"
  ) +
  theme(
    title = element_text(size = 20),
    axis.text = element_text(size = 20),
    legend.text = element_text(size = 15)
  )

**We could do many other analyses, but that's it for now. Upvote if you like it! Thanks!**