In [None]:
import numpy as np
import pandas as pd

# Task 0
Read the dataset from the CSV file and clean it: remove every row that contains `?` in any column.
Also check the data for correctness (the `salary` and `salary $K` columns).

In [None]:
# Read and clean data.
# Cells whose entire value is '?' are parsed as NaN, so dropna() removes
# every row that contains such a placeholder in any column.
df = pd.read_csv('../data/adult.csv', na_values='?')
# .copy() gives an independent frame, so the column assignments below do not
# write into a view of `df`.
df_clean = df.dropna().copy()

# Convert salary columns to numeric (pd.to_numeric raises on unparseable values)
df_clean['salary'] = pd.to_numeric(df_clean['salary'])
df_clean['salary $K'] = pd.to_numeric(df_clean['salary $K'])

# Verify salary columns: `salary` divided by 1000 should match `salary $K`.
# Compare with a tight tolerance instead of `==`, because exact equality on
# floats produced by division / CSV parsing can fail on a last-bit rounding
# difference even when the two columns agree.
salary_check = np.isclose(
    df_clean['salary'] / 1000, df_clean['salary $K'], rtol=1e-9
).all()
print(f"Salary columns consistent: {salary_check}")

# Task 1
Print the count of men and women in the dataset.

In [None]:
# Tally how many rows fall under each distinct value of the `sex` column.
sex_column = df_clean['sex']
gender_counts = sex_column.value_counts()
print(gender_counts)

# Task 2
Find the average age of men in the dataset.

In [None]:
# Pick the `age` values of rows where sex is 'Male', then average them.
is_male = df_clean['sex'].eq('Male')
avg_age_men = df_clean.loc[is_male, 'age'].mean()
print(f"Average age of men: {avg_age_men:.1f}")

# Task 3
Get the percentage of people from Poland (native-country)

In [None]:
# Share of rows whose `native-country` equals 'Poland', as a percentage:
# the mean of a boolean Series is the fraction of True values.
poland_pct = (df_clean['native-country'] == 'Poland').mean() * 100
# Report the variable computed above; the label matches the country filtered on.
print(f"Percentage from Poland: {poland_pct:.2f}%")

# Task 4
Get the mean and standard deviation of the age for people who earn > 50K per year. After this, get it for those who earn <= 50K.

In [1]:
# Split rows into two groups (False / True) by whether salary exceeds 50000,
# then report the mean and standard deviation of age within each group.
earns_over_50k = df_clean['salary'].gt(50000)
age_stats = df_clean['age'].groupby(earns_over_50k).agg(['mean', 'std'])
print(age_stats)

NameError: name 'pd' is not defined

# Task 5
Check whether there are any people without higher education (education: Bachelors, Prof-school, Assoc-acdm, Assoc-voc, Masters, Doctorate) but with a salary > 50K.

In [3]:
# Education levels that the task statement counts as "higher education".
higher_ed = [
    'Bachelors',
    'Prof-school',
    'Assoc-acdm',
    'Assoc-voc',
    'Masters',
    'Doctorate',
]
# Rows that are NOT in the list above and whose salary exceeds 50000.
lacks_higher_ed = ~df_clean['education'].isin(higher_ed)
earns_over_50k = df_clean['salary'].gt(50000)
mask = lacks_higher_ed & earns_over_50k
count = mask.sum()
print(f"High earners without higher education: {count}")

# Task 6
Get the statistics of age for each type of education. Use `groupby` and `describe` for this.

In [None]:
# Descriptive statistics of age, one row per education level.
age_by_education = df_clean['age'].groupby(df_clean['education'])
age_stats = age_by_education.describe()
print(age_stats)

# Task 7
Compare the salaries of married and non-married men. Who earns more? (>50K or <=50K)
Married men are those whose `marital-status` starts with "Married". All others are non-married.

In [2]:
# Restrict to male rows, then group their salaries by whether
# `marital-status` begins with "Married" (False / True groups).
men = df_clean.loc[df_clean['sex'].eq('Male')]
is_married = men['marital-status'].str.startswith('Married')
salary_by_married = men['salary'].groupby(is_married)

# Per group: number of rows, number with salary above 50000, and that share in percent.
result = salary_by_married.agg(
    total_count='count',
    high_earners=lambda salaries: salaries.gt(50000).sum(),
    pct_high_earners=lambda salaries: salaries.gt(50000).mean() * 100,
)
print(result)

NameError: name 'pd' is not defined

# Task 8
Get the maximum number of hours per week that any person works. How many people work that same number of hours per week?

In [6]:
# Largest value in `hours-per-week`, and how many rows share exactly that value.
weekly_hours = df_clean['hours-per-week']
max_hours = weekly_hours.max()
count = weekly_hours.eq(max_hours).sum()
print(f"Max hours/week: {max_hours} ({count} people work this amount)")

# Task 9
Analyze the correlations between the fields in the dataset. Identify which fields are connected, and print and highlight their connections.

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

# Replace every object-typed column with its integer category codes so the
# whole frame can be passed to `corr()`. Works on a copy; `df_clean` is untouched.
df_numeric = df_clean.copy()
cat_cols = df_numeric.select_dtypes(include=['object']).columns
df_numeric[cat_cols] = df_numeric[cat_cols].apply(
    lambda column: column.astype('category').cat.codes
)

# Heatmap of the correlation matrix; cells with |r| <= 0.3 become NaN and
# are therefore left blank in the plot.
corr = df_numeric.corr()
strong_corr = corr.where(corr.abs() > 0.3)
plt.figure(figsize=(12, 10))
sns.heatmap(strong_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.show()

# Keep only the strictly-lower triangle so each pair of columns appears once
# and the diagonal (self-correlation) is excluded, then rank the pairs by
# absolute correlation and show the ten largest.
strictly_lower = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
corr_pairs = corr.where(strictly_lower).stack()
top_corrs = corr_pairs.abs().sort_values(ascending=False).head(10)
print("Top Correlations:")
print(top_corrs)