In [None]:
"""For documentation about the code or if you want to reuse parts of it, look into `analyze_data.py`."""
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import timedelta
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.stats import spearmanr

In [None]:
# Notebook-wide plot styling: white grid background, "deep" colour palette,
# slightly enlarged fonts and a 10x6 default figure size.
sns.set_theme(
    style='whitegrid',
    palette="deep",
    font_scale=1.1,
    rc={"figure.figsize": [10, 6]},
)

In [None]:
# Load the jobs dataset and let pandas infer nullable extension dtypes.
df = pd.read_json('python_jobs.json').convert_dtypes()

# Column groups that receive an explicit, more compact dtype below.
category_columns = ["proposals", "client_location", "type", "experience_level", "time_estimate"]
integer_columns = ['budget', 'client_jobs_posted', 'client_total_spent']
float_columns = ['client_hourly_rate', 'client_hourly_rate'][:1] + ['client_hourly_rate'][:0] if False else ['client_hire_rate', 'client_hourly_rate']

df[category_columns] = df[category_columns].astype('category')
# `apply` forwards the extra keyword to `pd.to_numeric` for every column.
df[integer_columns] = df[integer_columns].apply(pd.to_numeric, downcast='unsigned')
df[float_columns] = df[float_columns].apply(pd.to_numeric, downcast='float')
# `time` is parsed as a number of seconds since the Unix epoch.
df['time'] = pd.to_datetime(df['time'], unit='s')

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Remove repeated rows, keeping the first occurrence of each.
# `skills` and `time` are left out of the comparison, so two rows that differ
# only in those columns count as duplicates.
df = (
    df
    .loc[lambda frame: ~frame.drop(columns=['skills', 'time']).duplicated()]
    .reset_index(drop=True)
)

In [None]:
# Number of rows per `type` value.
df['type'].value_counts()

In [None]:
# Number of rows per `experience_level` value.
df['experience_level'].value_counts()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of `client_hourly_rate`.
df['client_hourly_rate'].describe()

In [None]:
# Number of rows per `time_estimate` value.
df['time_estimate'].value_counts()

In [None]:
# Budget summary statistics for rows whose `type` is 'Fixed'.
df.loc[df['type'].eq('Fixed'), 'budget'].describe()

In [None]:
# Budget summary statistics for rows whose `type` is 'Hourly'.
df.loc[df['type'].eq('Hourly'), 'budget'].describe()

In [None]:
# Count rows per budget bracket. Brackets are narrow for small budgets and
# widen as the amounts grow; the last bin edge is just a very large upper bound.
# NOTE(review): `pd.cut` builds right-closed intervals by default, so every
# bracket includes its upper edge (a budget of exactly 10 lands in '<10$') and
# a budget of 0 falls outside all bins.
budget_bins = [0, 10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 5000, 10000, 50_000, 1_000_000_000]
budget_groups = [
    '<10$', '10-20$', '20-30$', '30-40$', '40-50$',
    '50-100$', '100-200$', '200-300$', '300-400$', '400-500$',
    '500-1000$', '1000-5000$', '5000-10000$', "10000-50000$", ">50000$",
]
budget_ranges = pd.cut(df['budget'], bins=budget_bins, labels=budget_groups)
sns.countplot(x=budget_ranges, order=budget_groups).set(
    title="Budget ranges count",
    xlabel="Budget Range",
    ylabel="Count",
    # One tick every 20 rows, up to the tallest bar.
    yticks=range(0, budget_ranges.value_counts().max(), 20),
)
plt.xticks(rotation=45)
plt.show()  # To get rid of the text printed before the plot

In [None]:
# Rows per weekday, restricted to the 7 days leading up to the newest `time` value.
df_one_week = (
    df
    .loc[df['time'] >= df['time'].max() - timedelta(days=7)]
    .assign(day=lambda recent: recent['time'].dt.day_name())
)
sns.countplot(
    data=df_one_week,
    x='day',
    order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
)
plt.show()

In [None]:
# Mean budget per client location, shown for the 15 locations with the highest mean.
# Only locations with more than four rows that have a budget are considered.
counts = df.loc[df['budget'].notna(), 'client_location'].value_counts()
no_four_trick_ponies = df[df['client_location'].isin(counts[counts > 4].index)]
top_15_countries = (
    no_four_trick_ponies
    .groupby("client_location", observed=False)['budget']
    .mean()
    .reset_index()
    .sort_values('budget', ascending=False)
    .head(15)
    # Convert category dtype to string because seaborn will display all the categories
    # even if they are not present in the chosen dataframe; this is probably a bug
    # with seaborn.
    .astype({'client_location': 'string'})
)
sns.barplot(data=top_15_countries, x='client_location', y='budget')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count how many rows list each skill (every entry of `skills` is iterated).
skills_counter = {}
for skills_set in df['skills']:
    for skill in skills_set:
        skills_counter[skill] = skills_counter.get(skill, 0) + 1

top_n = 20  # The number of most common skills to choose.
# Ordered from most to least common; dict insertion order carries that ranking
# into the plot. The slice uses `top_n` so the setting above takes effect.
most_common_skills = dict(
    sorted(skills_counter.items(), key=lambda item: item[1], reverse=True)[:top_n]
)
# orient='h' draws horizontal bars: counts run along the x axis and skill names
# along the y axis, so the axis labels are assigned accordingly.
sns.barplot(most_common_skills, orient='h').set(title="Skills count", xlabel="Count", ylabel="Skill")
plt.show()

In [None]:
# Keep only rows that have both a budget and a proposals value. The index is
# renumbered from 0 so the frame lines up positionally with the skill indicator
# frame it is concatenated with further down.
filtered_df = (
    df
    .dropna(subset=['budget', 'proposals'])
    .reset_index(drop=True)
)
# Cap budgets at the (integer-truncated) 99th percentile to tame extreme values.
budget_cap = int(filtered_df['budget'].quantile(0.99))
filtered_df = filtered_df.assign(budget=filtered_df['budget'].clip(upper=budget_cap))

In [None]:
# Expand `skills` into one 0/1 indicator column per distinct skill.
mlb = MultiLabelBinarizer()
skills_transformed = mlb.fit_transform(filtered_df['skills'])
# Columns are named after the skills learned by the binarizer.
skills_df = pd.DataFrame(data=skills_transformed, columns=mlb.classes_)
# Attach the indicators side by side, then discard the original `skills` column.
df_skills_binary = pd.concat([filtered_df, skills_df], axis=1).drop(columns='skills')

In [None]:
# Skills counted at least 30 times in `skills_counter`, from most to least frequent.
high_frequency_skills = [
    skill
    for skill, _count in sorted(
        (item for item in skills_counter.items() if item[1] >= 30),
        key=lambda item: item[1],
        reverse=True,
    )
]

In [None]:
# Collect skills whose presence indicator has a statistically significant,
# positive Spearman rank correlation with budget.
high_budget_corr_skills = []
for skill in mlb.classes_:
    corr, p_value = spearmanr(df_skills_binary[skill], df_skills_binary['budget'])
    # 0.05 serves both as the significance level for the p-value and as the
    # minimum correlation coefficient required.
    if p_value <= 0.05 and corr >= 0.05:
        high_budget_corr_skills.append(skill)

In [None]:
# Skills that are both frequent and positively correlated with budget.
# Kept as a list ordered like `high_frequency_skills` (most frequent first)
# instead of a bare set: the result is used to order plot categories, and the
# iteration order of a set of strings changes between interpreter sessions,
# which would reshuffle the plots on every kernel restart.
skills_of_interest = sorted(
    set(high_frequency_skills).intersection(high_budget_corr_skills),
    key=high_frequency_skills.index,
)

In [None]:
# Reshape to long format: one row per (row of df_skills_binary, skill of interest)
# pair, carrying the budget and proposals values along.
df_melted = (
    df_skills_binary
    .melt(
        id_vars=['budget', 'proposals'],
        value_vars=skills_of_interest,
        var_name='skill',
        value_name='presence',
    )
    # Filter only the rows where the skill is present
    .loc[lambda long_df: long_df['presence'] == 1]
    .drop(columns='presence')
)

In [None]:
# Budget distribution for each skill of interest (one box per skill).
sns.boxplot(
    data=df_melted,
    x='skill',
    y='budget',
    order=skills_of_interest,
)
plt.title('Distribution of Budgets by Skill Presence')
plt.xticks(rotation=90)  # Skill names are long; rotate them so they do not overlap.
plt.show()

In [None]:
# One panel per proposals bracket, each showing the budget distribution per skill.
# Axes are not shared, so every panel gets its own x categories and y scale.
g = sns.FacetGrid(
    df_melted,
    col="proposals",
    hue='proposals',
    col_wrap=2,
    height=8,
    sharex=False,
    sharey=False,
    col_order=['Less than 5', '5 to 10', '10 to 15', '15 to 20', '20 to 50', '50+'],
)
g.map(sns.boxplot, "skill", "budget", order=skills_of_interest)
# Rotate the skill names on every panel so they stay readable.
for ax in g.axes.flat:
    ax.tick_params(axis='x', labelrotation=90)
# `figure` is the supported accessor; `FacetGrid.fig` is documented as deprecated.
g.figure.suptitle('Distribution of Budgets by Skill Presence and Number of Proposals')
g.figure.tight_layout()  # Adjust spacing.
plt.show()

In [None]:
# Share of each proposals bracket within every skill: the crosstab is normalised
# per skill column, its rows are put in bracket order, and it is then transposed
# so that skills become the rows of the heatmap.
contingency_table = (
    pd.crosstab(df_melted['proposals'], df_melted['skill'], normalize='columns')
    .reindex(['Less than 5', '5 to 10', '10 to 15', '15 to 20', '20 to 50', '50+'])
    .transpose()
)
sns.heatmap(contingency_table, annot=True, fmt='.2g');