In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [None]:
# glob() returns matches in arbitrary, filesystem-dependent order, but later
# cells select files by position (file_list[2], file_list[4]).  Sorting
# case-insensitively keeps those positions stable across runs and platforms.
# NOTE(review): confirm that indexes 2 and 4 still select the intended files.
file_list = sorted(glob("./data/*"), key=str.lower)
file_list

In [None]:
# Read the file at position 4 of the data listing and print its column summary.
# NOTE(review): presumably the question/schema table, since later cells filter
# on its "Column" field - confirm against the listing above.
mcq = pd.read_csv(filepath_or_buffer=file_list[4])
mcq.info()

In [None]:
# Preview the first five rows of the table loaded into `mcq`.
mcq.head(n=5)

In [None]:
# Read the file at position 2 of the data listing and print its column summary.
# low_memory=False parses the file in a single pass instead of in chunks.
# NOTE(review): cp437 can decode any byte, so a wrong encoding would not raise
# here - confirm it is the file's real encoding.
mca = pd.read_csv(
    file_list[2],
    low_memory=False,
    encoding="cp437",
)
mca.info()

In [None]:
# Preview the first five rows of the table loaded into `mca`.
mca.head(n=5)

In [None]:
# Show the `mcq` row(s) whose "Column" value is LanguageRecommendationSelect.
mcq.loc[mcq["Column"].eq("LanguageRecommendationSelect")]

In [None]:
# Count value combinations on a one-column DataFrame (not a Series) and show
# the first five.
mca[["LanguageRecommendationSelect"]].value_counts().head(n=5)

In [None]:
# The index of value_counts() lists the distinct answers, most frequent first.
mca.loc[:, "LanguageRecommendationSelect"].value_counts().index

In [None]:
# Horizontal count plot of LanguageRecommendationSelect, bars ordered by
# descending frequency.
plt.figure(figsize=(12, 6))
sns.set_theme(font_scale=0.8)
sns.countplot(
    data=mca,
    y="LanguageRecommendationSelect",
    order=mca["LanguageRecommendationSelect"].value_counts().index,
)

In [None]:
# Five most frequent values of CurrentJobTitleSelect.
mca.loc[:, "CurrentJobTitleSelect"].value_counts().head(n=5)

In [None]:
# Horizontal count plot of CurrentJobTitleSelect, bars ordered by descending
# frequency.
plt.figure(figsize=(12, 6))
sns.countplot(
    data=mca,
    y="CurrentJobTitleSelect",
    order=mca["CurrentJobTitleSelect"].value_counts().index,
)

In [None]:
# Number of non-NaN entries; summing the not-NA mask gives the same figure as
# Series.count(), which skips NaN.
mca["CurrentJobTitleSelect"].notna().sum()

In [None]:
# Number of entries whose value is null (NaN).
mca["CurrentJobTitleSelect"].isna().sum()

In [None]:
# Total number of entries: the boolean mask itself contains no NaN, so count()
# on it counts every row, null or not.
# NOTE(review): if the non-null count was intended, this should be .sum().
mca["CurrentJobTitleSelect"].notna().count()

In [None]:
# Frequency of every LanguageRecommendationSelect answer, most frequent first.
mca.loc[:, "LanguageRecommendationSelect"].value_counts()

In [None]:
# Keep only the rows whose recommended language is Python or R.
data = mca[mca["LanguageRecommendationSelect"].isin(["Python", "R"])]
data.shape

In [None]:
# Count rows that have a job title and recommend Python or R.
mca.loc[
    mca["CurrentJobTitleSelect"].notna()
    & mca["LanguageRecommendationSelect"].isin(["Python", "R"]),
    "CurrentJobTitleSelect",
].count()

In [None]:
# Job-title counts within `data`, split by recommended language; bars are
# ordered by descending job-title frequency.
plt.figure(figsize=(12, 8))
sns.countplot(
    data=data,
    y="CurrentJobTitleSelect",
    hue="LanguageRecommendationSelect",
    order=data["CurrentJobTitleSelect"].value_counts().index,
)

In [None]:
# Cross-tabulate job title (rows) against recommended language (columns),
# counting non-null EmploymentStatus values in each cell.
data.pivot_table(
    values="EmploymentStatus",
    index=["CurrentJobTitleSelect"],
    columns=["LanguageRecommendationSelect"],
    aggfunc="count",
)

In [None]:
# Same breakdown via groupby: rows per (job title, recommended language) pair.
(
    data
    .groupby(by=["CurrentJobTitleSelect", "LanguageRecommendationSelect"])["CurrentJobTitleSelect"]
    .count()
)

In [None]:
# Absolute frequency of each MLToolNextYearSelect answer, as a one-column frame.
tool_count_df = mca["MLToolNextYearSelect"].value_counts().to_frame()
tool_count_df

In [None]:
# Relative frequency (share of non-null answers) of each MLToolNextYearSelect
# answer, as a one-column frame.
tool_rate_df = mca["MLToolNextYearSelect"].value_counts(normalize=True).to_frame()
tool_rate_df

In [None]:
# Put counts and rates side by side, label the two columns (response count,
# ratio) and keep the first 20 rows for plotting.
tool_df = pd.concat(
    [tool_count_df, tool_rate_df],
    axis="columns",
).set_axis(["응답수", "비율"], axis="columns")
data = tool_df.head(20)
data

In [None]:
# Request a font by name so the Korean axis label can render, then plot the
# response count per tool.
# NOTE(review): the named font must be installed on the machine running this
# notebook - confirm for non-Windows environments.
sns.set_theme(font="Malgun Gothic", font_scale=0.8)
sns.barplot(data=data, x="응답수", y=data.index)

In [None]:
# Top 15 MLMethodNextYearSelect answers by frequency; display the raw counts.
# Note: this rebinds `data`, which earlier cells used for other tables.
data = mca.loc[:, "MLMethodNextYearSelect"].value_counts().head(n=15)
data.values

In [None]:
# Horizontal bars: one per answer in `data`, bar length = its count.
sns.barplot(
    x=data.values,
    y=data.index,
)

In [None]:
# pd.set_option("display.max_colwidth", None)
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)

In [None]:
# platform_df = mca["LearningPlatformSelect"].str.split(",", expand=True).melt().drop("variable", axis=1).dropna()
# platform_df.columns = ["LearningPlatformName"]
# platform_df.head()

In [None]:
# platform_df["LearningPlatformName"].value_counts()

In [None]:
# mca["LearningPlatformSelect"].str.split(",", expand=True)

In [None]:
# mca["LearningPlatformSelect"].str.split(",", expand=True).melt(ignore_index=False).drop("variable", axis=1).loc[0].dropna()

In [None]:
# mca["LearningPlatformSelect"].str.split(",", expand=True).stack()

In [None]:
# Split the comma-separated LearningPlatformSelect answers into one entry per
# platform: split into columns, stack them into a single Series (which drops
# the missing cells), then discard the inner index level created by stack().
# Despite its name, `platform_df` is a Series.
platform_df = (
    mca["LearningPlatformSelect"]
    .str.split(",", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
)
# Top 15 platforms by number of mentions (rebinds `data`).
data = platform_df.value_counts().head(n=15)
data

In [None]:
# Horizontal bars: one per entry in `data`, bar length = its count.
sns.barplot(
    x=data.values,
    y=data.index,
)

In [None]:
# pd.reset_option("display.max_colwidth")
# pd.reset_option("display.max_rows")
# pd.reset_option("display.max_columns")

In [None]:
# Names of every column that contains "LearningPlatformUsefulness".
usefulness_list = [
    column for column in mca.columns if "LearningPlatformUsefulness" in column
]
usefulness_list

In [None]:
# mca["LearningPlatformUsefulnessArxiv"].value_counts(normalize=True)

In [None]:
# Map each usefulness column to the normalized distribution of its answers.
# Keys are the column names with the leading "LearningPlatformUsefulness"
# characters sliced off.
usefulness_rate_dict = {
    column[len("LearningPlatformUsefulness"):]: mca[column].value_counts(normalize=True)
    for column in usefulness_list
}

# Print every distribution under its key, separated by a blank line.
for platform, rates in usefulness_rate_dict.items():
    print(platform, rates, sep="\n", end="\n\n")


In [None]:
# One row per platform, one column per answer; rows sorted by the
# "Very useful" share (highest first), columns in reverse alphabetical order.
usefulness_rate_df = (
    pd.DataFrame(usefulness_rate_dict)
    .T
    .sort_values(by="Very useful", ascending=False)
    .sort_index(axis="columns", ascending=False)
)
usefulness_rate_df.head(n=5)

In [None]:
# Heatmap of the usefulness shares, each cell annotated to two decimals.
plt.figure(figsize=(8, 8))
sns.heatmap(
    data=usefulness_rate_df,
    annot=True,
    fmt=".2f",
)

In [None]:
# Grouped bar chart of the same shares; tick labels rotated for legibility.
plot = usefulness_rate_df.plot(
    kind="bar",
    figsize=(16, 8),
    title="Usefulness of Learning Platforms",
)
plt.xticks(rotation=60)
plot

In [None]:
# Names of every column that contains "LearningCategory".
category_list = [
    column for column in mca.columns if "LearningCategory" in column
]
category_list

In [None]:
# Mean of each LearningCategory column, keyed by the column name with the
# leading "LearningCategory" characters sliced off.
category_mean_dict = {
    column[len("LearningCategory"):]: mca[column].mean()
    for column in category_list
}
category_series = pd.Series(category_mean_dict)
category_series

In [None]:
# Pie chart of the category means, each wedge labelled with its percentage.
category_series.plot(kind="pie", autopct="%.1f%%")

In [None]:
# Second-column value of the first `mcq` row whose "Column" entry contains
# HardwarePersonalProjectsSelect.
# NOTE(review): presumably the question text - confirm the column order.
mcq.loc[
    mcq["Column"].str.contains("HardwarePersonalProjectsSelect")
].iloc[0, 1]

In [None]:
# Shape of the HardwarePersonalProjectsSelect column after dropping nulls.
mca["HardwarePersonalProjectsSelect"].dropna().shape

In [None]:
# mca["HardwarePersonalProjectsSelect"].str.split(",", expand=True).melt(ignore_index=False).drop("variable", axis=1).dropna()

In [None]:
# Split the comma-separated hardware answers into one entry per item (split ->
# stack -> drop the inner index level), count each item, and name the result.
hardware_series = (
    mca["HardwarePersonalProjectsSelect"]
    .str.split(",", expand=True)
    .stack()
    .reset_index(level=1, drop=True)
    .value_counts()
    .rename("hardware")
)

# Show the counts as a one-column frame.
hardware_series.to_frame()

In [None]:
# Number of rows for each (TimeSpentStudying, EmploymentStatus) combination.
(
    mca
    .groupby(by=["TimeSpentStudying", "EmploymentStatus"])["EmploymentStatus"]
    .count()
)

In [None]:
# Count plot of TimeSpentStudying, with one bar colour per employment status.
plt.figure(figsize=(12, 6))
sns.countplot(
    data=mca,
    y="TimeSpentStudying",
    hue="EmploymentStatus",
)

In [None]:
# Frequency of every EmploymentStatus answer, most frequent first.
mca.loc[:, "EmploymentStatus"].value_counts()

In [None]:
# Rows whose EmploymentStatus is exactly "Employed full-time".
fulltime_df = mca.loc[mca["EmploymentStatus"].eq("Employed full-time")]
fulltime_df.shape

In [None]:
# Rows whose EmploymentStatus is exactly "Not employed, but looking for work".
# NOTE(review): the variable name says "parttime" but the filter selects job
# seekers - confirm which one is intended.
parttime_df = mca.loc[
    mca["EmploymentStatus"].eq("Not employed, but looking for work")
]
parttime_df.shape

In [None]:
# Two side-by-side panels comparing TimeSpentStudying: `fulltime_df` on the
# left axis, `parttime_df` on the right axis.
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
figure.set_size_inches(12, 8)

# Left panel.
sns.countplot(
    data=fulltime_df,
    x="TimeSpentStudying",
    hue="EmploymentStatus",
    ax=ax1,
)

# Right panel.
sns.countplot(
    data=parttime_df,
    x="TimeSpentStudying",
    hue="EmploymentStatus",
    ax=ax2,
)