In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and perform data cleaning: remove all rows that contain `?` in any column.
Also check the data for correctness (salary & salary $K).

In [None]:
# Load the raw data; `df` is left untouched so the before/after shapes can be compared below.
df = pd.read_csv('dataset.csv')

# Treat the '?' placeholder as a missing value and drop every row that contains one.
df_clean = df.replace('?', pd.NA).dropna()

# Force both salary columns to a numeric dtype; unparsable values become NaN ...
df_clean['salary'] = pd.to_numeric(df_clean['salary'], errors='coerce')
df_clean['salary $K'] = pd.to_numeric(df_clean['salary $K'], errors='coerce')

# ... and rows where either salary could not be parsed are removed.
df_clean = df_clean.dropna(subset=['salary', 'salary $K'])

# Consistency check: `salary $K` must equal `salary / 1000`.
# The comparison uses a small absolute tolerance instead of `==`, because exact
# equality on floats can flag last-bit rounding noise as a mismatch.
salary_in_k = (df_clean['salary'] / 1000).to_numpy(dtype=float)
reported_in_k = df_clean['salary $K'].to_numpy(dtype=float)
salaries_match = np.isclose(salary_in_k, reported_in_k, rtol=0, atol=1e-6)

# Mismatching rows are only reported here; they are not removed from df_clean.
salary_mismatch = df_clean[~salaries_match]
if not salary_mismatch.empty:
    print("⚠️ Увага: знайдено невідповідності в зарплатах!")
    print(salary_mismatch[['salary', 'salary $K']])
else:
    print("✅ Зарплати перевірено: salary $K = salary / 1000")

print("\n🔹 Початковий розмір даних:", df.shape)
print("🔹 Очищений розмір даних:", df_clean.shape)
print("\nПерші 5 рядків після очищення:")
display(df_clean.head())

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Tally how many rows carry each value of the `sex` column.
gender_counts = df_clean.loc[:, 'sex'].value_counts()
print(gender_counts)

# Task 2
Find the average age of men in the dataset.

In [None]:
# Mean of `age` over the rows whose `sex` is 'Male'; .loc picks rows and column in one step.
average_age_men = df_clean.loc[df_clean['sex'] == 'Male', 'age'].mean()

# A bare assignment produces no cell output, so print the answer explicitly.
print(f"Average age of men: {average_age_men:.2f}")

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Number of rows in the cleaned data and how many of them list Poland as native country.
total_people = len(df_clean)
polish_count = int((df_clean['native-country'] == 'Poland').sum())

# Share of people from Poland, in percent. An empty frame yields NaN instead of
# raising ZeroDivisionError.
polish_percentage = (polish_count / total_people) * 100 if total_people else float('nan')

# A bare assignment produces no cell output, so print the answer explicitly.
print(f"People from Poland: {polish_count} of {total_people} ({polish_percentage:.2f}%)")

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [1]:
# `salary` was already converted to a numeric dtype (and its NaNs dropped) in the
# cleaning cell, so it is used as-is here rather than being re-assigned in place.

# Split the data at the 50K boundary.
high_income = df_clean[df_clean['salary'] > 50000]
low_income = df_clean[df_clean['salary'] <= 50000]

# Age statistics for people earning more than 50K.
high_income_age_mean = high_income['age'].mean()
high_income_age_std = high_income['age'].std()

# Age statistics for people earning 50K or less.
low_income_age_mean = low_income['age'].mean()
low_income_age_std = low_income['age'].std()

# One row per income group.
results = pd.DataFrame({
    'Income Group': ['>50K', '≤50K'],
    'Mean Age': [high_income_age_mean, low_income_age_mean],
    'Std Dev Age': [high_income_age_std, low_income_age_std],
    'Count': [len(high_income), len(low_income)]
})

# Show the summary table; without this the cell produces no output.
display(results)

NameError: name 'pd' is not defined

# Task 5
Check whether there are people without higher education (higher education means `education` is one of: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) who nevertheless earn a salary > 50K.

In [3]:
# Education values that the task treats as higher education.
higher_education = ['Bachelors', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', 'Masters', 'Doctorate']

# Two row masks: education outside the list above, and salary above 50K.
lacks_higher_education = ~df_clean['education'].isin(higher_education)
earns_over_50k = df_clean['salary'] > 50000

# Rows satisfying both conditions.
high_earners_no_he = df_clean[lacks_higher_education & earns_over_50k]

# Report how many such rows exist and which education values they have.
print(f"Found {len(high_earners_no_he)} individuals without higher education earning >50K")
print("\nBreakdown of their education levels:")
print(high_earners_no_he['education'].value_counts())

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Human-friendly labels for the columns produced by describe().
readable_columns = {
    'count': 'Count',
    'mean': 'Mean Age',
    'std': 'Std Dev',
    'min': 'Min Age',
    '25%': '25th Percentile',
    '50%': 'Median',
    '75%': '75th Percentile',
    'max': 'Max Age'
}

# Descriptive statistics of `age` within each `education` group, with relabelled columns.
age_stats_by_education = (
    df_clean
    .groupby('education')['age']
    .describe()
    .rename(columns=readable_columns)
)

# Show the resulting table.
print("Complete Age Statistics by Education Level:")
display(age_stats_by_education)

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married"; all others are considered non-married.

In [2]:
# Keep only the rows for men.
men_data = df_clean[df_clean['sex'] == 'Male']

# Split men by whether `marital-status` begins with "Married".
is_married = men_data['marital-status'].str.startswith('Married')
married_men = men_data[is_married]
non_married_men = men_data[~is_married]

# Number of men earning more than 50K in each group.
married_high_count = len(married_men[married_men['salary'] > 50000])
non_married_high_count = len(non_married_men[non_married_men['salary'] > 50000])

# The same figures as a percentage of each group's size.
married_high = married_high_count / len(married_men) * 100
non_married_high = non_married_high_count / len(non_married_men) * 100

# One row per group.
comparison = pd.DataFrame({
    'Group': ['Married Men', 'Non-Married Men'],
    'Total Count': [len(married_men), len(non_married_men)],
    'High Earners (>50K)': [married_high_count, non_married_high_count],
    '% High Earners': [married_high, non_married_high]
})

# Round the percentage and merge count and percentage into one text column, e.g. "123 (45.6%)".
comparison['% High Earners'] = comparison['% High Earners'].round(1)
comparison['High Earner Proportion'] = (
    comparison['High Earners (>50K)'].astype(str)
    + ' (' + comparison['% High Earners'].astype(str) + '%)'
)

display(comparison[['Group', 'Total Count', 'High Earner Proportion']])

NameError: name 'pd' is not defined

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [6]:
def _income_bracket(salary):
    """Label a salary as '>50K' when it exceeds 50000, otherwise '<=50K'."""
    if salary > 50000:
        return '>50K'
    return '<=50K'


# Largest value found in `hours-per-week`.
max_hours = df_clean['hours-per-week'].max()

# Rows that work exactly that many hours, and how many of them there are.
max_hours_workers = df_clean[df_clean['hours-per-week'] == max_hours]
max_hours_count = len(max_hours_workers)

# Extra breakdowns for those rows (computed here but not printed by this cell).
occupation_stats = max_hours_workers['occupation'].value_counts()
income_stats = max_hours_workers['salary'].apply(_income_bracket).value_counts()

# Report the answer.
print(f"Maximum hours worked per week: {max_hours} hours")
print(f"Number of people working {max_hours} hours/week: {max_hours_count}")
print(f"\nThis represents {max_hours_count/len(df_clean)*100:.2f}% of the dataset")

# Task 9
Analyze the correlations between the fields of the dataset. Identify which fields are connected and highlight their connections.

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy so the integer encoding below never touches df_clean.
df_numeric = df_clean.copy()
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'native-country']

# Replace each categorical column by its integer category codes so it can enter
# the correlation matrix.
# NOTE(review): the codes are arbitrary labels, so a Pearson coefficient on a
# nominal column is only a rough signal — interpret those entries with care.
for col in categorical_cols:
    if col in df_numeric.columns:
        df_numeric[col] = df_numeric[col].astype('category').cat.codes

# Correlate numeric columns only: a text column that is not listed in
# categorical_cols would otherwise make corr() raise on recent pandas versions.
corr_matrix = df_numeric.select_dtypes(include='number').corr()

# Heatmap showing only the cells with |r| > 0.3; all other cells are blanked out.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix.where(corr_matrix.abs() > 0.3),
            annot=True, fmt=".1f", cmap='coolwarm', center=0)
plt.title("Significant Correlations (|r| > 0.3)")
plt.show()

# Rank the column pairs by |r|. Keeping only the strict upper triangle removes
# both the diagonal (a column with itself) and the mirrored duplicate of every
# pair, so no positional slicing of the sorted list is needed.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_pairs = (
    corr_matrix.where(upper_triangle)
    .unstack()
    .dropna()
    .sort_values(key=abs, ascending=False)
)

# Print the five strongest pairs.
for (first, second), r_value in corr_pairs.head(5).items():
    print(f"{first} ↔ {second}: {r_value:.2f}")