In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score

In [None]:
def get_statistics(attributes):
    """Compute basic descriptive statistics for a pandas Series or DataFrame.

    Args:
        attributes: pandas Series (or DataFrame) providing ``mean``,
            ``median``, ``mode``, ``min``, ``max`` and ``std``.

    Returns:
        dict mapping a Russian label to the statistic: mean, median, mode
        (the first one when several values tie), minimum, maximum and the
        sample standard deviation.
    """
    # mode() returns an empty result for empty / all-NaN input, in which case
    # .iloc[0] would raise IndexError. Fall back to NaN so the mode behaves
    # like the other statistics, which already yield NaN for such input.
    modes = attributes.mode()
    if len(modes) > 0:
        mode_value = modes.iloc[0]
    elif isinstance(modes, pd.DataFrame):
        mode_value = pd.Series(np.nan, index=modes.columns)
    else:
        mode_value = np.nan

    statistics = {
        'Среднее': attributes.mean(),
        'Медиана': attributes.median(),
        'Мода': mode_value,
        'Минимум': attributes.min(),
        'Максимум': attributes.max(),
        # NOTE: the label reads "mean deviation", but the value is the sample
        # standard deviation (pandas .std(), ddof=1). The key is kept as-is
        # because callers may look it up by name.
        'Среднее отклонение': attributes.std()
    }

    return statistics


In [None]:
def _rank_correlated_pairs(correlation_matrix, top_n=2):
    """Return the ``top_n`` most and least correlated column pairs by |r|."""
    # Keep only the strictly lower triangle: every unordered pair appears
    # exactly once and the diagonal (self-correlation) is excluded.
    below_diagonal = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
    pair_strength = (
        correlation_matrix.where(below_diagonal)
        .unstack()
        .dropna()  # removes masked cells and undefined (NaN) correlations
        .abs()     # strength of a linear relationship ignores its sign
        .sort_values(ascending=False, kind="mergesort")  # stable for ties
    )
    strongest = list(pair_strength.index[:top_n])
    weakest = list(pair_strength.index[::-1][:top_n])
    return strongest, weakest


def get_corr(attributes):
    """Plot the correlation heatmap and print the extreme variable pairs.

    The correlation matrix comes from ``attributes.corr()``. The heatmap
    shows the lower triangle only. Pairs are ranked by the absolute value of
    the coefficient, so a strong negative correlation counts as "most
    correlated" and a coefficient close to zero as "least correlated".

    Args:
        attributes: pandas DataFrame whose columns are correlated with
            ``DataFrame.corr()``.

    Returns:
        None. The figure is shown and the two most / two least correlated
        pairs are printed as lists of ``(column, column)`` tuples. When fewer
        than two pairs exist, the printed lists are shorter.
    """
    correlation_matrix = attributes.corr()

    # Hide the upper triangle and the diagonal: they mirror the lower triangle.
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    most_correlated_pairs, least_correlated_pairs = _rank_correlated_pairs(
        correlation_matrix
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Correlation Matrix Heatmap")
    plt.show()

    print("Наиболее скоррелированные переменные:")
    print(most_correlated_pairs)

    print("\nНаименее скоррелированные переменные:")
    print(least_correlated_pairs)

In [None]:
def get_workers_by_dep(workers):
    """Count the employees who have not left, grouped by department.

    Args:
        workers: pandas DataFrame with a ``left`` column (0 marks an employee
            who stayed) and a ``department`` column.

    Returns:
        DataFrame with one row per department and the columns ``department``
        and ``employee_count``.
    """
    still_employed = workers['left'] == 0

    headcount = (
        workers.loc[still_employed]
        .groupby('department')
        .size()
        .reset_index(name='employee_count')
    )
    return headcount

In [None]:
def get_salary_graph(salary):
    """Show a bar chart of how many employees fall into each salary value.

    Args:
        salary: values passed to ``sns.countplot`` as ``x``; one bar is drawn
            per distinct value.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))
    sns.set(style="whitegrid")

    # countplot draws on the current axes and returns it, so the labels can
    # be set on that object directly.
    axes = sns.countplot(x=salary, palette="Set3")
    axes.set_title("Распределение сотрудников по зарплатам")
    axes.set_xlabel("Зарплата")
    axes.set_ylabel("Количество сотрудников")

    plt.show()

In [None]:
def get_salary_graph_by_dep(attributes):
    """Show one salary count plot per department in a two-column grid.

    Every subplot uses the same salary category order, so bars at the same
    position mean the same category in all departments. A category missing
    from a department shows up as an empty slot.

    Args:
        attributes: pandas DataFrame with ``department`` and ``salary``
            columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Rows without a department cannot be attributed to any subplot.
    departments = attributes['department'].dropna().unique()

    # Without an explicit order seaborn orders categories per subplot (by
    # appearance in that department's rows), so the panels would not be
    # comparable. A categorical column keeps its declared category order.
    salary = attributes['salary']
    if isinstance(salary.dtype, pd.CategoricalDtype):
        salary_order = list(salary.cat.categories)
    else:
        salary_order = list(salary.dropna().unique())

    fig = plt.figure(figsize=(16, 10))

    # Two plots per row; round up so an odd count still gets its last row.
    rows = (len(departments) + 1) // 2

    for i, department in enumerate(departments):
        ax = fig.add_subplot(rows, 2, i + 1)
        department_data = attributes[attributes['department'] == department]
        sns.countplot(
            data=department_data,
            x='salary',
            order=salary_order,
            palette="Set2",
            ax=ax,
        )
        ax.set_title(f"Распределение зарплат в департаменте '{department}'")
        ax.set_xlabel("Зарплата")
        ax.set_ylabel("Количество сотрудников")

    fig.tight_layout()
    plt.show()

In [None]:
def check_hors_salary_hypothesis(attributes, alpha=0.05):
    """Test whether high-salary employees work more hours than low-salary ones.

    Runs a one-sided independent two-sample t-test (alternative: the mean of
    ``average_montly_hours`` in the ``'high'`` salary group is greater than
    in the ``'low'`` group). A two-sided test only shows that the means
    differ; it would also report "work more" when the high-salary group
    actually works significantly less.

    Args:
        attributes: pandas DataFrame with ``salary`` and
            ``average_montly_hours`` columns.
        alpha: significance level; defaults to 0.05.

    Returns:
        str: a message saying the hypothesis holds when the one-sided
        p-value is below ``alpha``, otherwise a message saying it is
        rejected. A NaN p-value (for example from an empty group) also gives
        the "rejected" message.
    """
    hours_column = 'average_montly_hours'  # column name as spelled in the data

    # Missing values would turn the p-value into NaN, so drop them first.
    high_salary_group = attributes.loc[attributes['salary'] == 'high', hours_column].dropna()
    low_salary_group = attributes.loc[attributes['salary'] == 'low', hours_column].dropna()

    _, p_value = stats.ttest_ind(
        high_salary_group, low_salary_group, alternative='greater'
    )

    if p_value < alpha:
        return "Гипотеза не отвергается: сотрудники с высоким зп проводят на работе больше времени."
    else:
        return "Гипотеза отвергается: нет значимой разницы между зп и времени проведенным на работе."


In [None]:
def _summarize_group(group):
    """Return (promotion %, satisfaction %, rounded mean projects) for a group."""
    # Scale to percent first and round afterwards: rounding before the
    # multiplication leaves float artefacts such as 56.99999999999999.
    promotion_rate = round(group['promotion_last_5years'].mean() * 100, 1)
    avg_satisfaction = round(group['satisfaction_level'].mean() * 100, 1)

    # An empty group has a NaN mean, and rounding NaN to an integer can
    # raise, so pass NaN through unchanged.
    mean_projects = group['number_project'].mean()
    avg_projects = round(mean_projects) if pd.notna(mean_projects) else mean_projects

    return promotion_rate, avg_satisfaction, avg_projects


def get_left_no_left_stat(attributes):
    """Print promotion, satisfaction and project statistics by leaving status.

    For employees with ``left == 1`` and for those with ``left == 0`` it
    prints the mean of ``promotion_last_5years`` and of
    ``satisfaction_level`` (both multiplied by 100 and rounded to one
    decimal) and the mean of ``number_project`` rounded to an integer.

    Args:
        attributes: pandas DataFrame with ``left``, ``promotion_last_5years``,
            ``satisfaction_level`` and ``number_project`` columns.

    Returns:
        None. The six values are printed.
    """
    left = attributes[attributes['left'] == 1]
    not_left = attributes[attributes['left'] == 0]

    left_promotion_rate, left_avg_satisfaction, left_avg_projects = _summarize_group(left)
    (not_left_promotion_rate,
     not_left_avg_satisfaction,
     not_left_avg_projects) = _summarize_group(not_left)

    print("Доля сотрудников с повышением за последние 5 лет (уволившиеся):", left_promotion_rate)
    print("Средняя степень удовлетворенности (уволившиеся):", left_avg_satisfaction)
    print("Среднее количество проектов (уволившиеся):", left_avg_projects)
    print("\nДоля сотрудников с повышением за последние 5 лет (не уволившиеся):", not_left_promotion_rate)
    print("Средняя степень удовлетворенности (не уволившиеся):", not_left_avg_satisfaction)
    print("Среднее количество проектов (не уволившиеся):", not_left_avg_projects)

In [None]:
def left_or_no_model(workers, test_size=0.2, random_state=42):
    """Fit an LDA classifier for ``left`` and print its hold-out accuracy.

    The ``department`` and ``salary`` columns are dropped rather than
    encoded, so the model uses only the remaining columns as features.

    Args:
        workers: pandas DataFrame containing ``department``, ``salary``, the
            target column ``left`` and the feature columns.
        test_size: share of rows held out for testing; defaults to 0.2.
        random_state: seed for the train/test split; defaults to 42.

    Returns:
        None. The accuracy on the test split is printed as a percentage.
    """
    X = workers.drop(columns=['department', 'salary', 'left'])
    y = workers['left']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    lda = LinearDiscriminantAnalysis()
    lda.fit(X_train, y_train)

    y_pred = lda.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # Let the format spec scale and round: round(accuracy, 2) * 100 prints
    # float artefacts such as 56.99999999999999%.
    print(f"Accuracy модели на тестовой выборке: {accuracy:.1%}")