In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

import os

## Чтение агрегированных данных

In [None]:
# Root data directory: "full_data.csv" and the aggregate folder are read from inside it.
DATA_DIR = "data"
# Sub-directory of DATA_DIR whose files are loaded as per-service CSV tables.
AGGREGATE_DIR = "aggregate"
# Not referenced in the visible cells; presumably sibling folders with raw
# profile/review dumps -- TODO confirm against the rest of the project.
PROFILES_DIR = "profiles"
REVIEWS_DIR = "reviews"

In [None]:
# Load the combined dataset; it is used as the "all services" baseline
# in the comparison charts below.
full_df = pd.read_csv(
    os.path.join(DATA_DIR, "full_data.csv"),
    sep=",",
)

In [None]:
# Rank the aggregate files by row count and load the largest ones.
TOP_N = 10
aggregate_path = os.path.join(DATA_DIR, AGGREGATE_DIR)

# file name -> number of rows in that file
sizes = dict()
# A sorted listing keeps the ranking deterministic when several files have the
# same number of rows (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(aggregate_path)):
    path = os.path.join(aggregate_path, file)
    # Sub-directories and hidden entries (e.g. ".ipynb_checkpoints") are not
    # data files and would make pd.read_csv fail.
    if file.startswith(".") or not os.path.isfile(path):
        continue
    sizes[file] = len(pd.read_csv(path))

# (file name, row count) pairs of the largest files, biggest first.
files = sorted(sizes.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
# Service title = file name without its extension; splitext keeps any dots
# that belong to the title itself.
services = [os.path.splitext(name)[0] for name, _ in files]

print(f"Top {TOP_N} services:\nTitle\tCount")
for service_title, (_, rows) in zip(services, files):
    print(f"{service_title}\t{rows}")

# Re-read only the selected files so that at most TOP_N frames stay in memory.
top_10 = [pd.read_csv(os.path.join(aggregate_path, name)) for name, _ in files]

In [None]:
# Preview of the combined dataset loaded from "full_data.csv".
full_df

In [None]:
# First rows of the largest service's table (top_10 is ordered by row count, descending).
top_10[0].head()

## Общая обработка и сравнительные графики

In [None]:
def calc_ration(total, *counts):
    """Convert absolute counts into fractions of ``total``.

    Parameters
    ----------
    total : int or float
        Denominator shared by all counts.
    *counts : int or float
        Values to divide by ``total``.

    Returns
    -------
    list of float
        One ratio per count, in the order given. When ``total`` is 0 every
        ratio is NaN.
    """
    if total == 0:
        # An empty dataset has no meaningful shares; NaN keeps the result
        # usable downstream instead of raising ZeroDivisionError.
        return [float("nan")] * len(counts)
    return [cnt / total for cnt in counts]

def calc_gender_ratio(df):
    """Return the shares of MALE, FEMALE and missing-gender rows in ``df``.

    The result is a three-element list ``[male, female, nan]`` produced by
    ``calc_ration`` with ``len(df)`` as the denominator.
    """
    gender = df["gender"]
    group_sizes = (
        len(df[gender == "MALE"]),
        len(df[gender == "FEMALE"]),
        len(df[gender.isnull()]),
    )
    return calc_ration(len(df), *group_sizes)


def draw_by_gender(percents, title="", labels=None):
    """Draw a grouped bar chart with one group of bars per dataset.

    Parameters
    ----------
    percents : list of list of float
        One inner list per series, in the order MALE, FEMALE, NaN and,
        optionally, All. Every inner list holds one value per dataset.
    title : str
        Text appended to the fixed part of the figure title.
    labels : list of str, optional
        X tick labels, one per dataset. When omitted, ``["Все"] + services``
        is used, where ``services`` is the notebook-level list of service names.
    """
    series_labels = ("MALE", "FEMALE", "NaN", "All")
    width = 0.2
    positions = range(1, len(percents[0]) + 1)
    if labels is None:
        labels = ["Все"] + services

    fig, ax = plt.subplots(figsize=(20, 6))
    ax.grid(True)
    # NOTE(review): plt.gray() only switches the default image colormap and
    # does not appear to influence the bar colours -- candidate for removal.
    plt.gray()

    # Series beyond the four known labels are ignored by zip.
    shown = list(zip(percents, series_labels))
    for offset, (values, label) in enumerate(shown):
        ax.bar([i + offset * width for i in positions], values, width, label=label)

    # Put each tick in the middle of its group regardless of how many series
    # are drawn (3 series -> +0.2, 4 series -> +0.3).
    group_center = width * (len(shown) - 1) / 2
    ax.set_xticks([i + group_center for i in positions])
    ax.set_xticklabels(labels, rotation=90)
    ax.set_ylabel("Ratio")
    ax.legend(loc="upper left")
    # rstrip avoids a dangling space when no title suffix is given.
    ax.set_title(("Распределение по предметам " + title).rstrip())
    plt.show()

In [None]:
# Gender shares for the combined dataset followed by each top service.
gender_ratios = [calc_gender_ratio(frame) for frame in [full_df] + top_10]

# Regroup into one list per series: [MALE values, FEMALE values, NaN values].
gr_total = [[ratios[series] for ratios in gender_ratios] for series in range(3)]

draw_by_gender(gr_total)

In [None]:
def calc_best_ratio(df):
    """Return the fraction of rows in ``df`` whose ``rating`` equals 5.0.

    Returns NaN for an empty frame: a gender subset of a service may contain
    no rows at all, and the ratio is undefined in that case.
    """
    total = len(df)
    if total == 0:
        # Avoid ZeroDivisionError on an empty subset.
        return float("nan")
    return len(df[df["rating"] == 5.0]) / total

# Share of 5.0-rated rows per gender group, for the combined dataset followed
# by each top service. Series order: MALE, FEMALE, NaN, All.
rated_frames = [full_df] + top_10
gr_best = [
    [calc_best_ratio(frame[frame["gender"] == "MALE"]) for frame in rated_frames],
    [calc_best_ratio(frame[frame["gender"] == "FEMALE"]) for frame in rated_frames],
    [calc_best_ratio(frame[pd.isnull(frame["gender"])]) for frame in rated_frames],
    [calc_best_ratio(frame) for frame in rated_frames],
]

draw_by_gender(gr_best, title="преподавателей с рейтингом 5.0")

## Обработка отдельных предметов

In [None]:
def build(dataset, service):
    """Print summary statistics and draw two histograms for one service.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table of one service; the ``gender``, ``rating`` and ``reviews``
        columns are used.
    service : str
        Service name appended to the figure titles.

    Output
    ------
    * ``describe()`` tables for the whole table and for the MALE, FEMALE and
      missing-gender subsets, together with the subset sizes.
    * An overlaid histogram of ``rating`` for the three subsets and the whole
      table.
    * A histogram of ``reviews`` restricted to rows with ``rating == 5.0``.
    """
    print("Total describe")
    print(dataset.describe().to_string())

    male_df = dataset[dataset["gender"] == "MALE"]
    female_df = dataset[dataset["gender"] == "FEMALE"]
    nan_df = dataset[pd.isnull(dataset["gender"])]

    # (count caption, describe caption, subset) -- printed in this order.
    groups = (
        ("MALE count:", "\nMALE describe", male_df),
        ("FEMALE count:", "\nFEMALE describe", female_df),
        ("NaN count:", "\nNaN describe", nan_df),
    )

    print()
    for count_caption, _, frame in groups:
        print(count_caption, len(frame))

    for _, describe_caption, frame in groups:
        print(describe_caption)
        print(frame.describe().to_string())

    # Use one set of bin edges, derived from the whole table, for every
    # overlaid histogram. Letting each call pick bins from its own value range
    # would produce misaligned bars that cannot be compared visually.
    rating_bins = np.histogram_bin_edges(dataset["rating"].dropna(), bins=10)

    plt.figure(figsize=(15, 6))
    plt.grid(True)
    for frame, label, alpha in (
        (female_df, "FEMALE", 0.5),
        (male_df, "MALE", 0.5),
        (nan_df, "None", 0.25),
        (dataset, "All", 0.25),
    ):
        plt.hist(frame["rating"].to_list(), bins=rating_bins, label=label, alpha=alpha)
    plt.legend()
    plt.ylabel("Count")
    plt.xlabel("Rating")
    plt.title(f"Распределение рейтинга преподавателей, {service}")
    plt.show()

    # Review counts of the rows that hold the top rating.
    best_df = dataset[dataset["rating"] == 5.0]

    plt.figure(figsize=(15, 6))
    plt.grid(True)
    plt.hist(best_df["reviews"].to_list(), bins=100)
    plt.ylabel("Count")
    plt.xlabel("Reviews count")
    plt.title(f"Распределение числа отзывов по преподавателям с рейтингом 5.0, {service}")
    plt.show()

In [None]:
# Run the per-service report for every top service, largest first.
for service_name, (_, row_count), service_df in zip(services, files, top_10):
    print(f"Start {service_name}, rows count: {row_count}")
    build(service_df, service_name)
    # Blank lines visually separate the reports of consecutive services.
    print("\n\n\n\n\n")