In [47]:
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from ydata_profiling import ProfileReport

In [50]:
%matplotlib inline

In [31]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='psy-data.csv')

In [33]:
# Remove duplicate entries: keep the first row seen for each Name and discard
# later repeats. `df` is rebound to the result rather than mutated in place.
# NOTE(review): rows with a missing Name compare equal to one another here, so
# all but the first such row are dropped as well — confirm that is intended.
df = df.drop_duplicates(subset=["Name"], keep="first")

In [34]:
# Drop rows where verification is expired or they don't have a bio
# (low effort into their profile).
# A boolean mask selects rows by position, so only the flagged rows are removed
# even if index labels are ever non-unique. Rows where either column is missing
# fail both equality tests and are therefore kept.
is_expired = df.Verified == 'Expired'
lacks_bio = df.Bio == 'No'
df = df[~(is_expired | lacks_bio)]

# Fix typo. This matches whole cell values equal to "Yese" in every column of
# the frame, not just one specific field.
df = df.replace("Yese", "Yes")

In [None]:
# Generate a ydata-profiling report and write it to an HTML file.
# The four columns listed below are excluded from the profiled frame.
excluded_columns = ['Top Specialties', 'Expertise', 'Qualifications', 'Name']
modified_df = df.drop(columns=excluded_columns)

profile = ProfileReport(modified_df)
profile.to_file("psy-analytics-profile-report.html")

### Province

In [None]:
# Bar chart: number of rows (practitioners) per distinct Province value.
province_counts = df["Province"].value_counts()

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(province_counts.index, province_counts.values, color="Green")
ax.set_xlabel("Province of Practitioner")
ax.set_ylabel("Number of Practitioners")
plt.show()

### Gender

In [None]:
# Bar chart: number of rows (practitioners) per distinct Gender value.
gender_counts = df["Gender"].value_counts()

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(gender_counts.index, gender_counts.values, color="Green")
ax.set_xlabel("Gender of Practitioner")
ax.set_ylabel("Number of Practitioners")
plt.show()

### Top Specialty

In [None]:
# Count how many rows mention each entry of the "Top Specialties" column.
# Each non-missing cell is treated as a delimited, comma-separated string: the
# first and last characters are stripped off and the remainder is split on ",".
# NOTE(review): assumes cells look like "[A, B, C]" — confirm against the CSV.

# dropna() returns a new frame, so nothing derived from `df` is mutated in place
# (the in-place variant on a selection raises SettingWithCopyWarning).
col = df[["Top Specialties"]].dropna()

# Empty tokens (e.g. from an empty list "[]" or a trailing comma) are skipped so
# that "" is never counted as a specialty. Counter keeps first-seen key order.
tspecs_dict = dict(Counter(
    spec
    for cell in col["Top Specialties"]
    for spec in (part.strip() for part in cell[1:-1].split(","))
    if spec
))


In [94]:
# Rank specialties from most to least frequent. Sorting on the negated count
# gives descending order, and the stable sort leaves tied entries in their
# original dictionary order.
sorted_items = sorted(tspecs_dict.items(), key=lambda pair: -pair[1])

# The first ten entries are the most common, the last ten the least common.
top10, bottom10 = sorted_items[:10], sorted_items[-10:]

In [None]:
# Most common top specialties among practitioners

# Split the (name, count) pairs into two parallel sequences for plotting.
specialty_names, specialty_counts = zip(*top10)

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(specialty_names, specialty_counts, color="Green")
ax.set_xlabel("Top Specialty")
ax.set_ylabel("Number of Practitioners")
ax.tick_params(axis="x", labelrotation=90)  # vertical tick labels
plt.show()

In [None]:
# Least common top specialties among practitioners

# Split the (name, count) pairs into two parallel sequences for plotting.
specialty_names, specialty_counts = zip(*bottom10)

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(specialty_names, specialty_counts, color="Green")
ax.set_xlabel("Top Specialty")
ax.set_ylabel("Number of Practitioners")
ax.tick_params(axis="x", labelrotation=90)  # vertical tick labels
plt.show()

### Expertise

In [None]:
# Count how many rows mention each entry of the "Expertise" column.
# Each non-missing cell is treated as a delimited, comma-separated string: the
# first and last characters are stripped off and the remainder is split on ",".
# NOTE(review): assumes cells look like "[A, B, C]" — confirm against the CSV.

# dropna() returns a new frame, so nothing derived from `df` is mutated in place
# (the in-place variant on a selection raises SettingWithCopyWarning).
col = df[["Expertise"]].dropna()

# Empty tokens (e.g. from an empty list "[]" or a trailing comma) are skipped so
# that "" is never counted as an expertise area. Counter keeps first-seen key order.
expertise_dict = dict(Counter(
    item
    for cell in col["Expertise"]
    for item in (part.strip() for part in cell[1:-1].split(","))
    if item
))

In [None]:
# Rank expertise areas by descending count (stable, so ties keep their
# dictionary order) and keep the fifteen most frequent.
sorted_items = sorted(expertise_dict.items(), key=lambda pair: -pair[1])
top15 = sorted_items[:15]

# Split the (name, count) pairs into two parallel sequences for plotting.
expertise_names, expertise_counts = zip(*top15)

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(expertise_names, expertise_counts, color="Green")
ax.set_xlabel("Expertise")
ax.set_ylabel("Number of Practitioners")
ax.tick_params(axis="x", labelrotation=90)  # vertical tick labels
plt.show()


### Qualifications

In [None]:
# Count how many rows mention each entry of the "Qualifications" column.
# Each non-missing cell is treated as a delimited, comma-separated string: the
# first and last characters are stripped off and the remainder is split on ",".
# NOTE(review): assumes cells look like "[A, B, C]" — confirm against the CSV.

# dropna() returns a new frame, so nothing derived from `df` is mutated in place
# (the in-place variant on a selection raises SettingWithCopyWarning).
col = df[["Qualifications"]].dropna()

# Empty tokens (e.g. from an empty list "[]" or a trailing comma) are skipped so
# that "" is never counted as a qualification. Counter keeps first-seen key order.
qualifications = dict(Counter(
    qual
    for cell in col["Qualifications"]
    for qual in (part.strip() for part in cell[1:-1].split(","))
    if qual
))

In [None]:
# Rank qualifications by descending count (stable, so ties keep their
# dictionary order) and keep the ten most frequent.
sorted_qualifications = sorted(qualifications.items(), key=lambda pair: -pair[1])
top10_quals = sorted_qualifications[:10]

# Split the (name, count) pairs into two parallel sequences for plotting.
qualification_names, qualification_counts = zip(*top10_quals)

fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(qualification_names, qualification_counts, color="Green")
ax.set_xlabel("Qualification")
ax.set_ylabel("Number of Practitioners")
ax.tick_params(axis="x", labelrotation=90)  # vertical tick labels
plt.show()