In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [None]:
# Collect the CSV files under ./data.  glob() returns matches in arbitrary,
# OS-dependent order, while other cells select files by position
# (file_list[0], file_list[2]), so the list is sorted to make those positions
# stable across machines and runs.
file_list = sorted(glob("./data/*.csv"))
file_list

In [None]:
# Load the third CSV found in ./data into `mca`; the file is chosen purely by
# its position in file_list.
mca = pd.read_csv(
    file_list[2],
    encoding="cp437",
    low_memory=False,
)
mca.shape

In [None]:
pd.set_option("display.max_colwidth", None)

In [None]:
mca["BlogsPodcastsNewslettersSelect"].head()

In [None]:
pd.reset_option("display.max_colwidth")

In [None]:
# One entry per individual answer: split the comma-separated selections into
# columns, stack them into a single Series, then drop the inner
# (column-position) index level so only the original row label remains.
media_split_df = mca["BlogsPodcastsNewslettersSelect"].str.split(",", expand=True)
media_series = media_split_df.stack().reset_index(level=1, drop=True)
media_series.shape

In [None]:
# Count every individual media source and keep the 20 most frequent ones
# (value_counts sorts in descending order by default).
media_counts_all = media_series.value_counts()
media_count_series = media_counts_all.head(20)
media_count_series

In [None]:
# Global seaborn styling: slightly smaller text, rendered in "Malgun Gothic".
# NOTE(review): this font presumably has to be installed locally — confirm.
theme_options = {"font_scale": 0.8, "font": "Malgun Gothic"}
sns.set(**theme_options)

In [None]:
# Horizontal bar chart: media source on the y-axis, answer count on the x-axis.
top_media_labels = media_count_series.index
top_media_counts = media_count_series.values
plt.figure(figsize=(12, 8))
sns.barplot(x=top_media_counts, y=top_media_labels)

In [None]:
# Top 20 raw (unsplit) answer strings, i.e. each distinct combination of
# sources is counted as one value.
media_origin_counts = mca["BlogsPodcastsNewslettersSelect"].value_counts()
media_origin_series = media_origin_counts.head(20)
media_origin_series

In [None]:
# Horizontal bar chart of the most common raw answer strings.
origin_labels = media_origin_series.index
origin_counts = media_origin_series.values
plt.figure(figsize=(12, 8))
sns.barplot(y=origin_labels, x=origin_counts)

In [None]:
# One entry per selected course platform: split the comma-separated answers,
# stack the resulting columns, and drop the inner (column-position) index
# level so only the original row label remains.
course_split_df = mca["CoursePlatformSelect"].str.split(",", expand=True)
course_series = course_split_df.stack().reset_index(level=1, drop=True)
course_series.head()

In [None]:
# Number of selections per course platform, most frequent first (the sort
# arguments spell out the value_counts defaults).
course_count_series = course_series.value_counts(sort=True, ascending=False)
course_count_series

In [None]:
# Horizontal bar chart: platform on the y-axis, selection count on the x-axis.
platform_labels = course_count_series.index
platform_counts = course_count_series.values
plt.title("Most popular course platforms")
sns.barplot(x=platform_counts, y=platform_labels)

In [None]:
# Select the JobSkillImportance* columns, skipping those whose suffix starts
# with "Other".  A negative lookahead is needed for that: a character class
# such as [^Other] only rejects ONE following character out of
# {O, t, h, e, r}, so it would also drop any skill whose name happens to start
# with one of those letters.
skill_mask = mca.columns.str.contains(r"^JobSkillImportance(?!Other)", regex=True)
skill_list = list(mca.columns[skill_mask])
skill_list

In [None]:
mca["JobSkillImportanceBigData"].value_counts()

In [None]:
skill_rate_dict = {}
for skill in skill_list:
    temp = mca[skill].value_counts(normalize=True)
    skill_rate_dict[skill[len("JobSkillImportance"):]] = temp
skill_rate_dict


In [None]:
# Build a frame from the per-skill Series and transpose it, so that each row
# is a skill and each column is an answer category.
skill_rate_df = pd.DataFrame(skill_rate_dict).T
skill_rate_df.head()

In [None]:
# Order the skills by their share of "Necessary" answers (highest first) and
# draw the shares as an annotated heatmap.
skill_rate_sorted = skill_rate_df.sort_values(by="Necessary", ascending=False)
sns.heatmap(skill_rate_sorted, annot=True, fmt=".2f", cmap="Blues")

In [None]:
# Grouped bar chart: one group of bars per skill, one bar per answer category.
skill_rate_plot = skill_rate_df.plot(kind="bar", figsize=(12, 6))
# Tilt the x tick labels by 60 degrees.
plt.xticks(rotation=60)
skill_rate_plot

In [None]:
mca["CompensationAmount"].shape

In [None]:
mca.replace({"CompensationAmount": {"[,-]": ""}}, regex=True, inplace=True)
mca["CompensationAmount"].str.contains("[,-]", regex=True, na=False).unique()

In [None]:
# Load the first CSV found in ./data as the currency table and discard its
# "Unnamed: 0" column (presumably a saved row index — confirm).
currency_rate_df = pd.read_csv(file_list[0]).drop(columns="Unnamed: 0")
currency_rate_df

In [None]:
# Keep only the columns needed for the salary analysis and drop every row
# that is missing any of them.
salary_columns = [
    "CompensationAmount",
    "CompensationCurrency",
    "GenderSelect",
    "Country",
    "CurrentJobTitleSelect",
]
salary_df = mca.loc[:, salary_columns].dropna().copy()
salary_df.head()

In [None]:
salary_df = salary_df.merge(currency_rate_df, "left", left_on="CompensationCurrency", right_on="originCountry")
salary_df.head()

In [None]:
salary_df.info()

In [None]:
# Treat blank amounts as 0.  The result is assigned back to the column:
# calling Series.replace(..., inplace=True) on a column selected with
# salary_df[...] is chained assignment, which under pandas Copy-on-Write
# modifies a temporary object and leaves salary_df unchanged.
salary_df["CompensationAmount"] = salary_df["CompensationAmount"].replace({"": 0})

In [None]:
# Convert the amounts to numbers and scale them by the merged exchange rate.
# NOTE(review): the rate presumably converts to USD — confirm against the
# currency table.
amount_numeric = pd.to_numeric(salary_df["CompensationAmount"])
salary_df["salary"] = amount_numeric * salary_df["exchangeRate"]
salary_df.head()

In [None]:
# Print the maximum, minimum and median of the converted salaries, one per
# line (salary_df["salary"].describe() would give the full summary instead).
salary_col = salary_df["salary"]
salary_summary = [
    f"Maximum Salary is USD $ {salary_col.max():.0f}",
    f"Minimum Salary is USD $ {salary_col.min():.0f}",
    f"Median Salary is USD $ {salary_col.median():.0f}",
]
print(*salary_summary, sep="\n")

In [None]:
# Distribution of the converted salaries.
sns.histplot(data=salary_df, x="salary")